# yt_dlp/extractor/duboku.py
import base64
import re
import urllib.parse

from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
    ExtractorError,
    clean_html,
    extract_attributes,
    get_elements_by_class,
    int_or_none,
    js_to_json,
    smuggle_url,
    unescapeHTML,
)

def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Return the re.Match objects of the tags carrying the specified attribute
    (and, optionally, attribute value) in the passed HTML document"""

    if tag is None:
        tag = '[a-zA-Z0-9:._-]+'
    if attribute is None:
        attribute = ''
    else:
        attribute = r'\s+(?P<attribute>%s)' % re.escape(attribute)
    if value is None:
        value = ''
    else:
        value = re.escape(value) if escape_value else value
        value = '=[\'"]?(?P<value>%s)[\'"]?' % value

    retlist = []
    for m in re.finditer(r'''(?xs)
        <(?P<tag>%s)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        %s%s
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (tag, attribute, value), html):
        retlist.append(m)

    return retlist

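# A minimal, illustrative sketch of how the helper above behaves (the HTML
# snippet is made up for this example, not taken from the site):
#
#   >>> html = '<div id="playlist1"><a href="/vodplay/1-1-1.html">EP1</a></div>'
#   >>> matches = _get_elements_by_tag_and_attrib(
#   ...     html, attribute='id', value=r'playlist\d+', escape_value=False)
#   >>> matches[0].group('value'), matches[0].group('content')
#   ('playlist1', '<a href="/vodplay/1-1-1.html">EP1</a>')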

def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    retval = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
    return retval[0] if retval else None

class DubokuIE(InfoExtractor):
    IE_NAME = 'duboku'
    IE_DESC = 'www.duboku.io'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
    _TESTS = [{
        'url': 'https://w.duboku.io/vodplay/1575-1-1.html',
        'info_dict': {
            'id': '1575-1-1',
            'ext': 'mp4',
            'series': '白色月光',
            'title': 'contains:白色月光',
            'season_number': 1,
            'episode_number': 1,
            'season': 'Season 1',
            'episode_id': '1',
            'season_id': '1',
            'episode': 'Episode 1',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }, {
        'url': 'https://w.duboku.io/vodplay/1588-1-1.html',
        'info_dict': {
            'id': '1588-1-1',
            'ext': 'mp4',
            'series': '亲爱的自己',
            'title': 'contains:第1集',
            'season_number': 1,
            'episode_number': 1,
            'episode': 'Episode 1',
            'season': 'Season 1',
            'episode_id': '1',
            'season_id': '1',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }]

    _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'
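
    # The pattern above is meant to capture the inline `player_data` object
    # assigned in a <script> block on the playback page, roughly of this shape
    # (the field values below are invented for illustration, not taken from
    # the site):
    #
    #   <script type="text/javascript">
    #       var player_data={"url":"https:\/\/example.com\/index.m3u8","encrypt":0,"from":"dbm3u8"};
    #   </script>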

    def _real_extract(self, url):
        video_id = self._match_id(url)
        temp = video_id.split('-')
        series_id = temp[0]
        season_id = temp[1]
        episode_id = temp[2]
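        # e.g. video_id '1575-1-1' (from the first test above) splits into
        # series_id '1575', season_id '1', episode_id '1'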

        webpage_url = 'https://w.duboku.io/vodplay/%s.html' % video_id
        webpage_html = self._download_webpage(webpage_url, video_id)

        # extract video url

        player_data = self._search_regex(
            self._PLAYER_DATA_PATTERN, webpage_html, 'player_data')
        player_data = self._parse_json(player_data, video_id, js_to_json)

        # extract title

        temp = get_elements_by_class('title', webpage_html)
        series_title = None
        title = None
        for html in temp:
            mobj = re.search(r'<a\s+.*>(.*)</a>', html)
            if mobj:
                href = extract_attributes(mobj.group(0)).get('href')
                if href:
                    mobj1 = re.search(r'/(\d+)\.html', href)
                    if mobj1 and mobj1.group(1) == series_id:
                        series_title = clean_html(mobj.group(0))
                        series_title = re.sub(r'[\s\r\n\t]+', ' ', series_title)
                        title = clean_html(html)
                        title = re.sub(r'[\s\r\n\t]+', ' ', title)
                        break

        data_url = player_data.get('url')
        if not data_url:
            raise ExtractorError('Cannot find url in player_data')
        player_encrypt = player_data.get('encrypt')
        if player_encrypt == 1:
            data_url = urllib.parse.unquote(data_url)
        elif player_encrypt == 2:
            data_url = urllib.parse.unquote(base64.b64decode(data_url).decode('ascii'))
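
        # Worked example (made-up URL) of the `encrypt == 2` branch above:
        # base64.b64decode('aHR0cHMlM0ElMkYlMkZleGFtcGxlLmNvbSUyRmluZGV4Lm0zdTg=')
        # yields b'https%3A%2F%2Fexample.com%2Findex.m3u8', which unquotes to
        # 'https://example.com/index.m3u8'.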

        # if it is an embedded iframe, maybe it's an external source
        headers = {'Referer': webpage_url}
        if player_data.get('from') == 'iframe':
            # use _type url_transparent to retain the meaningful details
            # of the video.
            return {
                '_type': 'url_transparent',
                'url': smuggle_url(data_url, {'referer': webpage_url}),
                'id': video_id,
                'title': title,
                'series': series_title,
                'season_number': int_or_none(season_id),
                'season_id': season_id,
                'episode_number': int_or_none(episode_id),
                'episode_id': episode_id,
            }

        formats = self._extract_m3u8_formats(data_url, video_id, 'mp4', headers=headers)

        return {
            'id': video_id,
            'title': title,
            'series': series_title,
            'season_number': int_or_none(season_id),
            'season_id': season_id,
            'episode_number': int_or_none(episode_id),
            'episode_id': episode_id,
            'formats': formats,
            'http_headers': headers,
        }

class DubokuPlaylistIE(InfoExtractor):
    IE_NAME = 'duboku:list'
    IE_DESC = 'www.duboku.io entire series'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/voddetail/)(?P<id>[0-9]+)\.html.*'
    _TESTS = [{
        'url': 'https://w.duboku.io/voddetail/1575.html',
        'info_dict': {
            'id': 'startswith:1575',
            'title': '白色月光',
        },
        'playlist_count': 12,
    }, {
        'url': 'https://w.duboku.io/voddetail/1554.html',
        'info_dict': {
            'id': 'startswith:1554',
            'title': '以家人之名',
        },
        'playlist_mincount': 30,
    }]

    def _real_extract(self, url):
        mobj = self._match_valid_url(url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        series_id = mobj.group('id')
        fragment = compat_urlparse.urlparse(url).fragment
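        # An optional URL fragment (e.g. `...voddetail/1575.html#playlist1`; the
        # fragment name here is illustrative) selects one of the `playlist<N>`
        # tabs extracted below; without it the first tab is used.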

        webpage_url = 'https://w.duboku.io/voddetail/%s.html' % series_id
        webpage_html = self._download_webpage(webpage_url, series_id)

        # extract title

        title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
        title = unescapeHTML(title.group('content')) if title else None
        if not title:
            title = self._html_search_meta('keywords', webpage_html)
        if not title:
            title = _get_element_by_tag_and_attrib(webpage_html, 'title')
            title = unescapeHTML(title.group('content')) if title else None

        # extract playlists

        playlists = {}
        for div in _get_elements_by_tag_and_attrib(
                webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
            playlist_id = div.group('value')
            playlist = []
            for a in _get_elements_by_tag_and_attrib(
                    div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False):
                playlist.append({
                    'href': unescapeHTML(a.group('value')),
                    'title': unescapeHTML(a.group('content')),
                })
            playlists[playlist_id] = playlist

        # select the specified playlist if url fragment exists
        playlist = None
        playlist_id = None
        if fragment:
            playlist = playlists.get(fragment)
            playlist_id = fragment
        else:
            first = next(iter(playlists.items()), None)
            if first:
                (playlist_id, playlist) = first
        if not playlist:
            raise ExtractorError(
                'Cannot find %s' % fragment if fragment else 'Cannot extract playlist')

        # return url results
        return self.playlist_result([
            self.url_result(
                compat_urlparse.urljoin('https://w.duboku.io', x['href']),
                ie=DubokuIE.ie_key(), video_title=x.get('title'))
            for x in playlist], series_id + '#' + playlist_id, title)