yt_dlp/extractor/duboku.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..compat import compat_urlparse
   5 from ..utils import (
   6     clean_html,
   7     extract_attributes,
   8     ExtractorError,
   9     get_elements_by_class,
  10     int_or_none,
  11     js_to_json,
  12     smuggle_url,
  13     unescapeHTML,
  14 )
  15
  16
  17 def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
  18     """Return the content of the tag with the specified attribute in the passed HTML document"""
  19
  20     if tag is None:
  21         tag = '[a-zA-Z0-9:._-]+'
  22     if attribute is None:
  23         attribute = ''
  24     else:
  25         attribute = r'\s+(?P<attribute>%s)' % re.escape(attribute)
  26     if value is None:
  27         value = ''
  28     else:
  29         value = re.escape(value) if escape_value else value
  30         value = '=[\'"]?(?P<value>%s)[\'"]?' % value
  31
  32     retlist = []
  33     for m in re.finditer(r'''(?xs)
  34         <(?P<tag>%s)
  35          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
  36          %s%s
  37          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
  38         \s*>
  39         (?P<content>.*?)
  40         </\1>
  41     ''' % (tag, attribute, value), html):
  42         retlist.append(m)
  43
  44     return retlist
  45
  46
  47 def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
  48     retval = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
  49     return retval[0] if retval else None
  50
  51
  52 class DubokuIE(InfoExtractor):
  53     IE_NAME = 'duboku'
  54     IE_DESC = 'www.duboku.io'
  55
  56     _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
  57     _TESTS = [{
  58         'url': 'https://w.duboku.io/vodplay/1575-1-1.html',
  59         'info_dict': {
  60             'id': '1575-1-1',
  61             'ext': 'mp4',
  62             'series': '白色月光',
  63             'title': 'contains:白色月光',
  64             'season_number': 1,
  65             'episode_number': 1,
  66             'season': 'Season 1',
  67             'episode_id': '1',
  68             'season_id': '1',
  69             'episode': 'Episode 1',
  70         },
  71         'params': {
  72             'skip_download': 'm3u8 download',
  73         },
  74     }, {
  75         'url': 'https://w.duboku.io/vodplay/1588-1-1.html',
  76         'info_dict': {
  77             'id': '1588-1-1',
  78             'ext': 'mp4',
  79             'series': '亲爱的自己',
  80             'title': 'contains:第1集',
  81             'season_number': 1,
  82             'episode_number': 1,
  83             'episode': 'Episode 1',
  84             'season': 'Season 1',
  85             'episode_id': '1',
  86             'season_id': '1',
  87         },
  88         'params': {
  89             'skip_download': 'm3u8 download',
  90         },
  91     }]
  92
  93     _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'
  94
  95     def _real_extract(self, url):
  96         video_id = self._match_id(url)
  97         temp = video_id.split('-')
  98         series_id = temp[0]
  99         season_id = temp[1]
 100         episode_id = temp[2]
 101
 102         webpage_url = 'https://w.duboku.io/vodplay/%s.html' % video_id
 103         webpage_html = self._download_webpage(webpage_url, video_id)
 104
 105         # extract video url
 106
 107         player_data = self._search_regex(
 108             self._PLAYER_DATA_PATTERN, webpage_html, 'player_data')
 109         player_data = self._parse_json(player_data, video_id, js_to_json)
 110
 111         # extract title
 112
 113         temp = get_elements_by_class('title', webpage_html)
 114         series_title = None
 115         title = None
 116         for html in temp:
 117             mobj = re.search(r'<a\s+.*>(.*)</a>', html)
 118             if mobj:
 119                 href = extract_attributes(mobj.group(0)).get('href')
 120                 if href:
 121                     mobj1 = re.search(r'/(\d+)\.html', href)
 122                     if mobj1 and mobj1.group(1) == series_id:
 123                         series_title = clean_html(mobj.group(0))
 124                         series_title = re.sub(r'[\s\r\n\t]+', ' ', series_title)
 125                         title = clean_html(html)
 126                         title = re.sub(r'[\s\r\n\t]+', ' ', title)
 127                         break
 128
 129         data_url = player_data.get('url')
 130         if not data_url:
 131             raise ExtractorError('Cannot find url in player_data')
 132         data_from = player_data.get('from')
 133
 134         # if it is an embedded iframe, maybe it's an external source
 135         headers = {'Referer': webpage_url}
 136         if data_from == 'iframe':
 137             # use _type url_transparent to retain the meaningful details
 138             # of the video.
 139             return {
 140                 '_type': 'url_transparent',
 141                 'url': smuggle_url(data_url, {'http_headers': headers}),
 142                 'id': video_id,
 143                 'title': title,
 144                 'series': series_title,
 145                 'season_number': int_or_none(season_id),
 146                 'season_id': season_id,
 147                 'episode_number': int_or_none(episode_id),
 148                 'episode_id': episode_id,
 149             }
 150
 151         formats = self._extract_m3u8_formats(data_url, video_id, 'mp4', headers=headers)
 152
 153         return {
 154             'id': video_id,
 155             'title': title,
 156             'series': series_title,
 157             'season_number': int_or_none(season_id),
 158             'season_id': season_id,
 159             'episode_number': int_or_none(episode_id),
 160             'episode_id': episode_id,
 161             'formats': formats,
 162             'http_headers': headers
 163         }
 164
 165
 166 class DubokuPlaylistIE(InfoExtractor):
 167     IE_NAME = 'duboku:list'
 168     IE_DESC = 'www.duboku.io entire series'
 169
 170     _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/voddetail/)(?P<id>[0-9]+)\.html.*'
 171     _TESTS = [{
 172         'url': 'https://w.duboku.io/voddetail/1575.html',
 173         'info_dict': {
 174             'id': 'startswith:1575',
 175             'title': '白色月光',
 176         },
 177         'playlist_count': 12,
 178     }, {
 179         'url': 'https://w.duboku.io/voddetail/1554.html',
 180         'info_dict': {
 181             'id': 'startswith:1554',
 182             'title': '以家人之名',
 183         },
 184         'playlist_mincount': 30,
 185     }]
 186
 187     def _real_extract(self, url):
 188         mobj = self._match_valid_url(url)
 189         if mobj is None:
 190             raise ExtractorError('Invalid URL: %s' % url)
 191         series_id = mobj.group('id')
 192         fragment = compat_urlparse.urlparse(url).fragment
 193
 194         webpage_url = 'https://w.duboku.io/voddetail/%s.html' % series_id
 195         webpage_html = self._download_webpage(webpage_url, series_id)
 196
 197         # extract title
 198
 199         title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
 200         title = unescapeHTML(title.group('content')) if title else None
 201         if not title:
 202             title = self._html_search_meta('keywords', webpage_html)
 203         if not title:
 204             title = _get_element_by_tag_and_attrib(webpage_html, 'title')
 205             title = unescapeHTML(title.group('content')) if title else None
 206
 207         # extract playlists
 208
 209         playlists = {}
 210         for div in _get_elements_by_tag_and_attrib(
 211                 webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
 212             playlist_id = div.group('value')
 213             playlist = []
 214             for a in _get_elements_by_tag_and_attrib(
 215                     div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False):
 216                 playlist.append({
 217                     'href': unescapeHTML(a.group('value')),
 218                     'title': unescapeHTML(a.group('content'))
 219                 })
 220             playlists[playlist_id] = playlist
 221
 222         # select the specified playlist if url fragment exists
 223         playlist = None
 224         playlist_id = None
 225         if fragment:
 226             playlist = playlists.get(fragment)
 227             playlist_id = fragment
 228         else:
 229             first = next(iter(playlists.items()), None)
 230             if first:
 231                 (playlist_id, playlist) = first
 232         if not playlist:
 233             raise ExtractorError(
 234                 'Cannot find %s' % fragment if fragment else 'Cannot extract playlist')
 235
 236         # return url results
 237         return self.playlist_result([
 238             self.url_result(
 239                 compat_urlparse.urljoin('https://w.duboku.io', x['href']),
 240                 ie=DubokuIE.ie_key(), video_title=x.get('title'))
 241             for x in playlist], series_id + '#' + playlist_id, title)