import base64
import re
import urllib.parse

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    clean_html,
    extract_attributes,
    get_elements_by_class,
    int_or_none,
    js_to_json,
    smuggle_url,
    unescapeHTML,
)


def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Return a list of regex matches (each with a 'content' group) for tags with the specified attribute in the passed HTML document"""

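    # Illustrative usage (a sketch, not part of the upstream file):
    #   _get_elements_by_tag_and_attrib(
    #       '<div id="playlist1"><a href="/v.html">EP1</a></div>',
    #       attribute='id', value='playlist1')
    # yields one match whose .group('content') is '<a href="/v.html">EP1</a>'.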
    if tag is None:
        tag = '[a-zA-Z0-9:._-]+'
    if attribute is None:
        attribute = ''
    else:
        attribute = rf'\s+(?P<attribute>{re.escape(attribute)})'
    if value is None:
        value = ''
    else:
        value = re.escape(value) if escape_value else value
        value = f'=[\'"]?(?P<value>{value})[\'"]?'

    retlist = []
    for m in re.finditer(rf'''(?xs)
        <(?P<tag>{tag})
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        {attribute}{value}
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''', html):
        retlist.append(m)

    return retlist


def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    retval = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
    return retval[0] if retval else None


class DubokuIE(InfoExtractor):
    IE_NAME = 'duboku'
    IE_DESC = 'www.duboku.io'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
    _TESTS = [{
        'url': 'https://w.duboku.io/vodplay/1575-1-1.html',
        'info_dict': {
            'id': '1575-1-1',
            'ext': 'mp4',
            'series': '白色月光',
            'title': 'contains:白色月光',
            'season_number': 1,
            'episode_number': 1,
            'season': 'Season 1',
            'episode_id': '1',
            'season_id': '1',
            'episode': 'Episode 1',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }, {
        'url': 'https://w.duboku.io/vodplay/1588-1-1.html',
        'info_dict': {
            'id': '1588-1-1',
            'ext': 'mp4',
            'series': '亲爱的自己',
            'title': 'contains:第1集',
            'season_number': 1,
            'episode_number': 1,
            'episode': 'Episode 1',
            'season': 'Season 1',
            'episode_id': '1',
            'season_id': '1',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }]

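    # The vodplay page is expected to assign the player configuration to a
    # `player_data` object inside an inline <script>; this pattern captures that
    # object literal so it can be parsed with js_to_json below.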
    _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'

    def _real_extract(self, url):
        video_id = self._match_id(url)
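        # the URL slug is expected to have the form <series>-<season>-<episode>, e.g. 1575-1-1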
        temp = video_id.split('-')
        series_id = temp[0]
        season_id = temp[1]
        episode_id = temp[2]

        webpage_url = f'https://w.duboku.io/vodplay/{video_id}.html'
        webpage_html = self._download_webpage(webpage_url, video_id)

        # extract video url

        player_data = self._search_regex(
            self._PLAYER_DATA_PATTERN, webpage_html, 'player_data')
        player_data = self._parse_json(player_data, video_id, js_to_json)

        # extract title

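        # scan elements with class "title" for one whose <a> links back to the
        # series page (/<series_id>.html): the link text becomes the series title
        # and the whole element the episode title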
        temp = get_elements_by_class('title', webpage_html)
        series_title = None
        title = None
        for html in temp:
            mobj = re.search(r'<a\s+.*>(.*)</a>', html)
            if mobj:
                href = extract_attributes(mobj.group(0)).get('href')
                if href:
                    mobj1 = re.search(r'/(\d+)\.html', href)
                    if mobj1 and mobj1.group(1) == series_id:
                        series_title = clean_html(mobj.group(0))
                        series_title = re.sub(r'[\s\r\n\t]+', ' ', series_title)
                        title = clean_html(html)
                        title = re.sub(r'[\s\r\n\t]+', ' ', title)
                        break

        data_url = player_data.get('url')
        if not data_url:
            raise ExtractorError('Cannot find url in player_data')
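        # 'encrypt' apparently selects how the URL is obfuscated:
        #   1 -> percent-encoded, 2 -> base64 of a percent-encoded URL, otherwise plain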
        player_encrypt = player_data.get('encrypt')
        if player_encrypt == 1:
            data_url = urllib.parse.unquote(data_url)
        elif player_encrypt == 2:
            data_url = urllib.parse.unquote(base64.b64decode(data_url).decode('ascii'))

        # if it is an embedded iframe, maybe it's an external source
        headers = {'Referer': webpage_url}
        if player_data.get('from') == 'iframe':
            # use _type url_transparent to retain the meaningful details
            # of the video.
            return {
                '_type': 'url_transparent',
                'url': smuggle_url(data_url, {'referer': webpage_url}),
                'id': video_id,
                'title': title,
                'series': series_title,
                'season_number': int_or_none(season_id),
                'season_id': season_id,
                'episode_number': int_or_none(episode_id),
                'episode_id': episode_id,
            }

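        # otherwise the URL is treated as an HLS manifest; the vodplay page is sent
        # as Referer both for the manifest fetch and, via http_headers, for the download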
        formats = self._extract_m3u8_formats(data_url, video_id, 'mp4', headers=headers)

        return {
            'id': video_id,
            'title': title,
            'series': series_title,
            'season_number': int_or_none(season_id),
            'season_id': season_id,
            'episode_number': int_or_none(episode_id),
            'episode_id': episode_id,
            'formats': formats,
            'http_headers': headers,
        }


class DubokuPlaylistIE(InfoExtractor):
    IE_NAME = 'duboku:list'
    IE_DESC = 'www.duboku.io entire series'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/voddetail/)(?P<id>[0-9]+)\.html.*'
    _TESTS = [{
        'url': 'https://w.duboku.io/voddetail/1575.html',
        'info_dict': {
            'id': 'startswith:1575',
            'title': '白色月光',
        },
        'playlist_count': 12,
    }, {
        'url': 'https://w.duboku.io/voddetail/1554.html',
        'info_dict': {
            'id': 'startswith:1554',
            'title': '以家人之名',
        },
        'playlist_mincount': 30,
    }]

    def _real_extract(self, url):
        mobj = self._match_valid_url(url)
        if mobj is None:
            raise ExtractorError(f'Invalid URL: {url}')
        series_id = mobj.group('id')
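        # a URL fragment (e.g. #playlist1) may name one of the playlist<N> divs
        # collected below; without a fragment the first playlist found is used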
        fragment = urllib.parse.urlparse(url).fragment

        webpage_url = f'https://w.duboku.io/voddetail/{series_id}.html'
        webpage_html = self._download_webpage(webpage_url, series_id)

        # extract title

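        # prefer the <h1 class="title"> heading, fall back to the "keywords" meta
        # tag, and finally to the page <title>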
        title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
        title = unescapeHTML(title.group('content')) if title else None
        if not title:
            title = self._html_search_meta('keywords', webpage_html)
        if not title:
            title = _get_element_by_tag_and_attrib(webpage_html, 'title')
            title = unescapeHTML(title.group('content')) if title else None

        # extract playlists

        playlists = {}
        for div in _get_elements_by_tag_and_attrib(
                webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
            playlist_id = div.group('value')
            playlist = []
            for a in _get_elements_by_tag_and_attrib(
                    div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False):
                playlist.append({
                    'href': unescapeHTML(a.group('value')),
                    'title': unescapeHTML(a.group('content')),
                })
            playlists[playlist_id] = playlist

        # select the specified playlist if url fragment exists
        playlist = None
        playlist_id = None
        if fragment:
            playlist = playlists.get(fragment)
            playlist_id = fragment
        else:
            first = next(iter(playlists.items()), None)
            if first:
                (playlist_id, playlist) = first
        if not playlist:
            raise ExtractorError(
                f'Cannot find {fragment}' if fragment else 'Cannot extract playlist')

        # return url results
        return self.playlist_result([
            self.url_result(
                urllib.parse.urljoin('https://w.duboku.io', x['href']),
                ie=DubokuIE.ie_key(), video_title=x.get('title'))
            for x in playlist], series_id + '#' + playlist_id, title)