3 from .common
import InfoExtractor
4 from ..compat
import compat_urlparse
# Regex-based lookup of HTML elements by tag name and/or attribute.
#
# Visible steps: `tag` is given a generic tag-name character class,
# `attribute` is escaped into a named group, `value` is escaped unless
# `escape_value` is false and then wrapped in an optionally quoted `=...`
# named group, and (tag, attribute, value) are %-formatted into a verbose,
# dot-matches-newline pattern that is scanned over `html` with re.finditer.
#
# Callers in this file call .group('value') / .group('content') on the
# results, so the items handed back are presumably the match objects
# themselves rather than plain strings -- confirm against the full file.
#
# NOTE(review): the embedded listing numbers jump (18 -> 21 -> 25 -> 29 ...),
# so the statements between the visible ones are not shown in this view.
17 def _get_elements_by_tag_and_attrib(html
, tag
=None, attribute
=None, value
=None, escape_value
=True):
18 """Return the content of the tag with the specified attribute in the passed HTML document"""
# Fallback tag pattern: letters, digits, ':', '.', '_' and '-'
# (presumably applied only when no tag was passed; the guard is not visible).
21 tag
= '[a-zA-Z0-9:._-]+'
# The attribute name is matched literally (re.escape) after leading whitespace.
25 attribute
= r
'\s+(?P<attribute>%s)' % re
.escape(attribute
)
# With escape_value false, `value` is inserted as-is and so acts as a regex.
29 value
= re
.escape(value
) if escape_value
else value
# The value may be wrapped in single or double quotes, or be unquoted.
30 value
= '=[\'"]?(?P<value>%s)[\'"]?' % value
# (?xs): verbose mode, and '.' also matches newlines.
33 for m
in re
.finditer(r
'''(?xs)
35 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
37 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
41 ''' % (tag
, attribute
, value
), html
):
def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Return the first result of ``_get_elements_by_tag_and_attrib``, or None.

    All five arguments are forwarded positionally and unchanged to the
    plural helper. Element ``[0]`` of whatever it returns is handed back;
    when the helper's result is falsy (for example empty), None is returned.
    """
    matches = _get_elements_by_tag_and_attrib(
        html, tag, attribute, value, escape_value)
    if not matches:
        return None
    return matches[0]
# Extractor for single-episode pages under /vodplay/ on duboku.co.
#
# Visible flow of _real_extract: take the id from the URL, download the
# episode page, pull the `player_data` object literal out of the page with
# _PLAYER_DATA_PATTERN and parse it, derive series/episode titles from the
# elements with class "title", then read 'url' and 'from' out of player_data.
# A 'from' value of 'iframe' leads to a url_transparent result carrying a
# Referer header; otherwise the URL is handed to _extract_m3u8_formats.
#
# NOTE(review): the embedded listing numbers jump throughout, so several
# statements (guards, loop headers, some assignments, the `return {` lines)
# are not shown in this view.
52 class DubokuIE(InfoExtractor
):
54 IE_DESC
= 'www.duboku.co'
# Accepts any subdomain of duboku.co; the id is the digits-and-dashes slug
# in front of `.html` (e.g. 1575-1-1).
56 _VALID_URL
= r
'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
# Presumably test-case data; the structure that holds these entries is not
# visible here.
58 'url': 'https://www.duboku.co/vodplay/1575-1-1.html',
63 'title': 'contains:白色月光',
68 'skip_download': 'm3u8 download',
71 'url': 'https://www.duboku.co/vodplay/1588-1-1.html',
76 'title': 'contains:预告片',
81 'skip_download': 'm3u8 download',
# Group 1 is the braces-included literal assigned to `player_data`, taken up
# to the closing </script.
85 _PLAYER_DATA_PATTERN
= r
'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'
87 def _real_extract(self
, url
):
88 video_id
= self
._match
_id
(url
)
# Pieces of the id split on '-'. series_id, season_id and episode_id used
# further down are not assigned in any visible line (presumably they come
# from this split -- confirm).
89 temp
= video_id
.split('-')
# The page is always requested from the www host, whatever subdomain the
# input URL used.
94 webpage_url
= 'https://www.duboku.co/vodplay/%s.html' % video_id
95 webpage_html
= self
._download
_webpage
(webpage_url
, video_id
)
99 player_data
= self
._search
_regex
(
100 self
._PLAYER
_DATA
_PATTERN
, webpage_html
, 'player_data')
# js_to_json is passed along, presumably so a JavaScript object literal that
# is not strict JSON can still be parsed -- confirm against _parse_json.
101 player_data
= self
._parse
_json
(player_data
, video_id
, js_to_json
)
# `temp` is reused here for the elements with class "title".
105 temp
= get_elements_by_class('title', webpage_html
)
# `html` is not assigned in any visible line; presumably it is the loop
# variable over `temp` -- confirm.
109 mobj
= re
.search(r
'<a\s+.*>(.*)</a>', html
)
111 href
= extract_attributes(mobj
.group(0)).get('href')
# A link whose target ends in /<series_id>.html marks the series title.
113 mobj1
= re
.search(r
'/(\d+)\.html', href
)
114 if mobj1
and mobj1
.group(1) == series_id
:
115 series_title
= clean_html(mobj
.group(0))
# Collapse every run of whitespace into a single space.
116 series_title
= re
.sub(r
'[\s\r\n\t]+', ' ', series_title
)
117 title
= clean_html(html
)
118 title
= re
.sub(r
'[\s\r\n\t]+', ' ', title
)
121 data_url
= player_data
.get('url')
# Guarding condition not visible; the message indicates this is the path
# taken when player_data has no 'url'.
123 raise ExtractorError('Cannot find url in player_data')
124 data_from
= player_data
.get('from')
126 # if it is an embedded iframe, maybe it's an external source
127 if data_from
== 'iframe':
128 # use _type url_transparent to retain the meaningful details
131 '_type': 'url_transparent',
# The episode page URL travels with the smuggled URL as the Referer header.
132 'url': smuggle_url(data_url
, {'http_headers': {'Referer': webpage_url}
}),
135 'series': series_title
,
136 'season_number': int_or_none(season_id
),
137 'season_id': season_id
,
138 'episode_number': int_or_none(episode_id
),
139 'episode_id': episode_id
,
# Non-iframe sources: formats are taken from data_url via the m3u8 helper,
# with 'mp4' as its third argument.
142 formats
= self
._extract
_m
3u8_formats
(data_url
, video_id
, 'mp4')
147 'series': series_title
,
148 'season_number': int_or_none(season_id
),
149 'season_id': season_id
,
150 'episode_number': int_or_none(episode_id
),
151 'episode_id': episode_id
,
# Fixed Referer pointing at the site's videojs player page.
153 'http_headers': {'Referer': 'https://www.duboku.co/static/player/videojs.html'}
# Extractor for whole-series pages under /voddetail/ on duboku.co
# (IE_NAME 'duboku:list').
#
# Visible flow of _real_extract: match the URL, read the series id and the
# URL fragment, download the detail page, look for a title, collect the
# links found inside every element whose id matches playlist\d+, choose one
# playlist (by fragment, or the first one), and return a playlist result
# whose id is "<series_id>#<playlist_id>".
#
# NOTE(review): the embedded listing numbers jump throughout, so guards,
# initialisations and part of the final call are not shown in this view.
157 class DubokuPlaylistIE(InfoExtractor
):
158 IE_NAME
= 'duboku:list'
159 IE_DESC
= 'www.duboku.co entire series'
# Matches /voddetail/<digits>.html on any duboku.co subdomain; the trailing
# `.*` accepts anything after .html, such as a #playlistN fragment.
161 _VALID_URL
= r
'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P<id>[0-9]+)\.html.*'
# Presumably test-case data; the structure that holds these entries is not
# visible here.
163 'url': 'https://www.duboku.co/voddetail/1575.html',
165 'id': 'startswith:1575',
168 'playlist_count': 12,
170 'url': 'https://www.duboku.co/voddetail/1554.html',
172 'id': 'startswith:1554',
175 'playlist_mincount': 30,
177 'url': 'https://www.duboku.co/voddetail/1554.html#playlist2',
179 'id': '1554#playlist2',
182 'playlist_mincount': 27,
185 def _real_extract(self
, url
):
186 mobj
= self
._match
_valid
_url
(url
)
# Guarding condition not visible; the message indicates this is the
# no-match path.
188 raise ExtractorError('Invalid URL: %s' % url
)
189 series_id
= mobj
.group('id')
# The fragment (e.g. `playlist2` in the last sample URL above) is used
# further down to pick which playlist to return.
190 fragment
= compat_urlparse
.urlparse(url
).fragment
# The page is always requested from the www host.
192 webpage_url
= 'https://www.duboku.co/voddetail/%s.html' % series_id
193 webpage_html
= self
._download
_webpage
(webpage_url
, series_id
)
# Title lookup, in visible order: <h1 class="title">, the `keywords` meta
# tag, then <title>. The conditions between the attempts are not visible.
197 title
= _get_element_by_tag_and_attrib(webpage_html
, 'h1', 'class', 'title')
198 title
= unescapeHTML(title
.group('content')) if title
else None
200 title
= self
._html
_search
_meta
('keywords', webpage_html
)
202 title
= _get_element_by_tag_and_attrib(webpage_html
, 'title')
203 title
= unescapeHTML(title
.group('content')) if title
else None
# Any tag whose id matches playlist\d+ counts as one playlist;
# escape_value=False lets the value act as a regex.
208 for div
in _get_elements_by_tag_and_attrib(
209 webpage_html
, attribute
='id', value
='playlist\\d+', escape_value
=False):
210 playlist_id
= div
.group('value')
# <a> elements with a non-empty href inside that element supply the
# href/title pairs below.
212 for a
in _get_elements_by_tag_and_attrib(
213 div
.group('content'), 'a', 'href', value
='[^\'"]+?', escape_value
=False):
215 'href': unescapeHTML(a
.group('value')),
216 'title': unescapeHTML(a
.group('content'))
218 playlists
[playlist_id
] = playlist
220 # select the specified playlist if url fragment exists
224 playlist
= playlists
.get(fragment
)
225 playlist_id
= fragment
# First (id, entries) pair, or None when no playlist was collected
# (presumably the no-fragment path -- the branch header is not visible).
227 first
= next(iter(playlists
.items()), None)
229 (playlist_id
, playlist
) = first
# `%` binds tighter than the conditional expression, so the fallback message
# is used verbatim when there is no fragment.
231 raise ExtractorError(
232 'Cannot find %s' % fragment
if fragment
else 'Cannot extract playlist')
# Entry URLs are resolved against https://www.duboku.co and pinned to
# DubokuIE; the playlist id is "<series_id>#<playlist_id>".
235 return self
.playlist_result([
237 compat_urlparse
.urljoin('https://www.duboku.co', x
['href']),
238 ie
=DubokuIE
.ie_key(), video_title
=x
.get('title'))
239 for x
in playlist
], series_id
+ '#' + playlist_id
, title
)