yt_dlp/extractor/duboku.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from ..compat import compat_urlparse
   8 from ..utils import (
   9     clean_html,
  10     extract_attributes,
  11     ExtractorError,
  12     get_elements_by_class,
  13     int_or_none,
  14     js_to_json,
  15     smuggle_url,
  16     unescapeHTML,
  17 )
  18
  19
  20 def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
  21     """Return the content of the tag with the specified attribute in the passed HTML document"""
  22
  23     if tag is None:
  24         tag = '[a-zA-Z0-9:._-]+'
  25     if attribute is None:
  26         attribute = ''
  27     else:
  28         attribute = r'\s+(?P<attribute>%s)' % re.escape(attribute)
  29     if value is None:
  30         value = ''
  31     else:
  32         value = re.escape(value) if escape_value else value
  33         value = '=[\'"]?(?P<value>%s)[\'"]?' % value
  34
  35     retlist = []
  36     for m in re.finditer(r'''(?xs)
  37         <(?P<tag>%s)
  38          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
  39          %s%s
  40          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
  41         \s*>
  42         (?P<content>.*?)
  43         </\1>
  44     ''' % (tag, attribute, value), html):
  45         retlist.append(m)
  46
  47     return retlist
  48
  49
  50 def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
  51     retval = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
  52     return retval[0] if retval else None
  53
  54
  55 class DubokuIE(InfoExtractor):
  56     IE_NAME = 'duboku'
  57     IE_DESC = 'www.duboku.co'
  58
  59     _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
  60     _TESTS = [{
  61         'url': 'https://www.duboku.co/vodplay/1575-1-1.html',
  62         'info_dict': {
  63             'id': '1575-1-1',
  64             'ext': 'ts',
  65             'series': '白色月光',
  66             'title': 'contains:白色月光',
  67             'season_number': 1,
  68             'episode_number': 1,
  69         },
  70         'params': {
  71             'skip_download': 'm3u8 download',
  72         },
  73     }, {
  74         'url': 'https://www.duboku.co/vodplay/1588-1-1.html',
  75         'info_dict': {
  76             'id': '1588-1-1',
  77             'ext': 'ts',
  78             'series': '亲爱的自己',
  79             'title': 'contains:预告片',
  80             'season_number': 1,
  81             'episode_number': 1,
  82         },
  83         'params': {
  84             'skip_download': 'm3u8 download',
  85         },
  86     }]
  87
  88     _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'
  89
  90     def _real_extract(self, url):
  91         video_id = self._match_id(url)
  92         temp = video_id.split('-')
  93         series_id = temp[0]
  94         season_id = temp[1]
  95         episode_id = temp[2]
  96
  97         webpage_url = 'https://www.duboku.co/vodplay/%s.html' % video_id
  98         webpage_html = self._download_webpage(webpage_url, video_id)
  99
 100         # extract video url
 101
 102         player_data = self._search_regex(
 103             self._PLAYER_DATA_PATTERN, webpage_html, 'player_data')
 104         player_data = self._parse_json(player_data, video_id, js_to_json)
 105
 106         # extract title
 107
 108         temp = get_elements_by_class('title', webpage_html)
 109         series_title = None
 110         title = None
 111         for html in temp:
 112             mobj = re.search(r'<a\s+.*>(.*)</a>', html)
 113             if mobj:
 114                 href = extract_attributes(mobj.group(0)).get('href')
 115                 if href:
 116                     mobj1 = re.search(r'/(\d+)\.html', href)
 117                     if mobj1 and mobj1.group(1) == series_id:
 118                         series_title = clean_html(mobj.group(0))
 119                         series_title = re.sub(r'[\s\r\n\t]+', ' ', series_title)
 120                         title = clean_html(html)
 121                         title = re.sub(r'[\s\r\n\t]+', ' ', title)
 122                         break
 123
 124         data_url = player_data.get('url')
 125         if not data_url:
 126             raise ExtractorError('Cannot find url in player_data')
 127         data_from = player_data.get('from')
 128
 129         # if it is an embedded iframe, maybe it's an external source
 130         if data_from == 'iframe':
 131             # use _type url_transparent to retain the meaningful details
 132             # of the video.
 133             return {
 134                 '_type': 'url_transparent',
 135                 'url': smuggle_url(data_url, {'http_headers': {'Referer': webpage_url}}),
 136                 'id': video_id,
 137                 'title': title,
 138                 'series': series_title,
 139                 'season_number': int_or_none(season_id),
 140                 'season_id': season_id,
 141                 'episode_number': int_or_none(episode_id),
 142                 'episode_id': episode_id,
 143             }
 144
 145         formats = self._extract_m3u8_formats(data_url, video_id, 'mp4')
 146
 147         return {
 148             'id': video_id,
 149             'title': title,
 150             'series': series_title,
 151             'season_number': int_or_none(season_id),
 152             'season_id': season_id,
 153             'episode_number': int_or_none(episode_id),
 154             'episode_id': episode_id,
 155             'formats': formats,
 156             'http_headers': {'Referer': 'https://www.duboku.co/static/player/videojs.html'}
 157         }
 158
 159
 160 class DubokuPlaylistIE(InfoExtractor):
 161     IE_NAME = 'duboku:list'
 162     IE_DESC = 'www.duboku.co entire series'
 163
 164     _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P<id>[0-9]+)\.html.*'
 165     _TESTS = [{
 166         'url': 'https://www.duboku.co/voddetail/1575.html',
 167         'info_dict': {
 168             'id': 'startswith:1575',
 169             'title': '白色月光',
 170         },
 171         'playlist_count': 12,
 172     }, {
 173         'url': 'https://www.duboku.co/voddetail/1554.html',
 174         'info_dict': {
 175             'id': 'startswith:1554',
 176             'title': '以家人之名',
 177         },
 178         'playlist_mincount': 30,
 179     }, {
 180         'url': 'https://www.duboku.co/voddetail/1554.html#playlist2',
 181         'info_dict': {
 182             'id': '1554#playlist2',
 183             'title': '以家人之名',
 184         },
 185         'playlist_mincount': 27,
 186     }]
 187
 188     def _real_extract(self, url):
 189         mobj = self._match_valid_url(url)
 190         if mobj is None:
 191             raise ExtractorError('Invalid URL: %s' % url)
 192         series_id = mobj.group('id')
 193         fragment = compat_urlparse.urlparse(url).fragment
 194
 195         webpage_url = 'https://www.duboku.co/voddetail/%s.html' % series_id
 196         webpage_html = self._download_webpage(webpage_url, series_id)
 197
 198         # extract title
 199
 200         title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
 201         title = unescapeHTML(title.group('content')) if title else None
 202         if not title:
 203             title = self._html_search_meta('keywords', webpage_html)
 204         if not title:
 205             title = _get_element_by_tag_and_attrib(webpage_html, 'title')
 206             title = unescapeHTML(title.group('content')) if title else None
 207
 208         # extract playlists
 209
 210         playlists = {}
 211         for div in _get_elements_by_tag_and_attrib(
 212                 webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
 213             playlist_id = div.group('value')
 214             playlist = []
 215             for a in _get_elements_by_tag_and_attrib(
 216                     div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False):
 217                 playlist.append({
 218                     'href': unescapeHTML(a.group('value')),
 219                     'title': unescapeHTML(a.group('content'))
 220                 })
 221             playlists[playlist_id] = playlist
 222
 223         # select the specified playlist if url fragment exists
 224         playlist = None
 225         playlist_id = None
 226         if fragment:
 227             playlist = playlists.get(fragment)
 228             playlist_id = fragment
 229         else:
 230             first = next(iter(playlists.items()), None)
 231             if first:
 232                 (playlist_id, playlist) = first
 233         if not playlist:
 234             raise ExtractorError(
 235                 'Cannot find %s' % fragment if fragment else 'Cannot extract playlist')
 236
 237         # return url results
 238         return self.playlist_result([
 239             self.url_result(
 240                 compat_urlparse.urljoin('https://www.duboku.co', x['href']),
 241                 ie=DubokuIE.ie_key(), video_title=x.get('title'))
 242             for x in playlist], series_id + '#' + playlist_id, title)