2 from __future__
import unicode_literals
6 from .common
import InfoExtractor
7 from ..compat
import compat_urlparse
12 get_elements_by_class
,
20 def _get_elements_by_tag_and_attrib(html
, tag
=None, attribute
=None, value
=None, escape_value
=True):
21 """Return the content of the tag with the specified attribute in the passed HTML document"""
24 tag
= '[a-zA-Z0-9:._-]+'
28 attribute
= r
'\s+(?P<attribute>%s)' % re
.escape(attribute
)
32 value
= re
.escape(value
) if escape_value
else value
33 value
= '=[\'"]?(?P<value>%s)[\'"]?' % value
36 for m
in re
.finditer(r
'''(?xs)
38 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
40 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
44 ''' % (tag
, attribute
, value
), html
):
def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Return the first result of _get_elements_by_tag_and_attrib, or None.

    Every argument is forwarded unchanged to _get_elements_by_tag_and_attrib.
    When that call yields a falsy result (e.g. an empty list), None is
    returned instead of indexing into it.
    """
    matches = _get_elements_by_tag_and_attrib(
        html, tag, attribute, value, escape_value)
    if not matches:
        return None
    return matches[0]
55 class DubokuIE(InfoExtractor
):
57 IE_DESC
= 'www.duboku.co'
59 _VALID_URL
= r
'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
61 'url': 'https://www.duboku.co/vodplay/1575-1-1.html',
66 'title': 'contains:白色月光',
71 'skip_download': 'm3u8 download',
74 'url': 'https://www.duboku.co/vodplay/1588-1-1.html',
79 'title': 'contains:预告片',
84 'skip_download': 'm3u8 download',
88 _PLAYER_DATA_PATTERN
= r
'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'
90 def _real_extract(self
, url
):
91 video_id
= self
._match
_id
(url
)
92 temp
= video_id
.split('-')
97 webpage_url
= 'https://www.duboku.co/vodplay/%s.html' % video_id
98 webpage_html
= self
._download
_webpage
(webpage_url
, video_id
)
102 player_data
= self
._search
_regex
(
103 self
._PLAYER
_DATA
_PATTERN
, webpage_html
, 'player_data')
104 player_data
= self
._parse
_json
(player_data
, video_id
, js_to_json
)
108 temp
= get_elements_by_class('title', webpage_html
)
112 mobj
= re
.search(r
'<a\s+.*>(.*)</a>', html
)
114 href
= extract_attributes(mobj
.group(0)).get('href')
116 mobj1
= re
.search(r
'/(\d+)\.html', href
)
117 if mobj1
and mobj1
.group(1) == series_id
:
118 series_title
= clean_html(mobj
.group(0))
119 series_title
= re
.sub(r
'[\s\r\n\t]+', ' ', series_title
)
120 title
= clean_html(html
)
121 title
= re
.sub(r
'[\s\r\n\t]+', ' ', title
)
124 data_url
= player_data
.get('url')
126 raise ExtractorError('Cannot find url in player_data')
127 data_from
= player_data
.get('from')
129 # if it is an embedded iframe, maybe it's an external source
130 if data_from
== 'iframe':
131 # use _type url_transparent to retain the meaningful details
134 '_type': 'url_transparent',
135 'url': smuggle_url(data_url
, {'http_headers': {'Referer': webpage_url}
}),
138 'series': series_title
,
139 'season_number': int_or_none(season_id
),
140 'season_id': season_id
,
141 'episode_number': int_or_none(episode_id
),
142 'episode_id': episode_id
,
145 formats
= self
._extract
_m
3u8_formats
(data_url
, video_id
, 'mp4')
150 'series': series_title
,
151 'season_number': int_or_none(season_id
),
152 'season_id': season_id
,
153 'episode_number': int_or_none(episode_id
),
154 'episode_id': episode_id
,
156 'http_headers': {'Referer': 'https://www.duboku.co/static/player/videojs.html'}
160 class DubokuPlaylistIE(InfoExtractor
):
161 IE_NAME
= 'duboku:list'
162 IE_DESC
= 'www.duboku.co entire series'
164 _VALID_URL
= r
'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P<id>[0-9]+)\.html.*'
166 'url': 'https://www.duboku.co/voddetail/1575.html',
168 'id': 'startswith:1575',
171 'playlist_count': 12,
173 'url': 'https://www.duboku.co/voddetail/1554.html',
175 'id': 'startswith:1554',
178 'playlist_mincount': 30,
180 'url': 'https://www.duboku.co/voddetail/1554.html#playlist2',
182 'id': '1554#playlist2',
185 'playlist_mincount': 27,
188 def _real_extract(self
, url
):
189 mobj
= self
._match
_valid
_url
(url
)
191 raise ExtractorError('Invalid URL: %s' % url
)
192 series_id
= mobj
.group('id')
193 fragment
= compat_urlparse
.urlparse(url
).fragment
195 webpage_url
= 'https://www.duboku.co/voddetail/%s.html' % series_id
196 webpage_html
= self
._download
_webpage
(webpage_url
, series_id
)
200 title
= _get_element_by_tag_and_attrib(webpage_html
, 'h1', 'class', 'title')
201 title
= unescapeHTML(title
.group('content')) if title
else None
203 title
= self
._html
_search
_meta
('keywords', webpage_html
)
205 title
= _get_element_by_tag_and_attrib(webpage_html
, 'title')
206 title
= unescapeHTML(title
.group('content')) if title
else None
211 for div
in _get_elements_by_tag_and_attrib(
212 webpage_html
, attribute
='id', value
='playlist\\d+', escape_value
=False):
213 playlist_id
= div
.group('value')
215 for a
in _get_elements_by_tag_and_attrib(
216 div
.group('content'), 'a', 'href', value
='[^\'"]+?', escape_value
=False):
218 'href': unescapeHTML(a
.group('value')),
219 'title': unescapeHTML(a
.group('content'))
221 playlists
[playlist_id
] = playlist
223 # select the specified playlist if url fragment exists
227 playlist
= playlists
.get(fragment
)
228 playlist_id
= fragment
230 first
= next(iter(playlists
.items()), None)
232 (playlist_id
, playlist
) = first
234 raise ExtractorError(
235 'Cannot find %s' % fragment
if fragment
else 'Cannot extract playlist')
238 return self
.playlist_result([
240 compat_urlparse
.urljoin('https://www.duboku.co', x
['href']),
241 ie
=DubokuIE
.ie_key(), video_title
=x
.get('title'))
242 for x
in playlist
], series_id
+ '#' + playlist_id
, title
)