3 from .common
import InfoExtractor
4 from ..compat
import compat_urlparse
17 def _get_elements_by_tag_and_attrib(html
, tag
=None, attribute
=None, value
=None, escape_value
=True):
18 """Return the content of the tag with the specified attribute in the passed HTML document"""
21 tag
= '[a-zA-Z0-9:._-]+'
25 attribute
= r
'\s+(?P<attribute>%s)' % re
.escape(attribute
)
29 value
= re
.escape(value
) if escape_value
else value
30 value
= '=[\'"]?(?P<value>%s)[\'"]?' % value
33 for m
in re
.finditer(r
'''(?xs)
35 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
37 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
41 ''' % (tag
, attribute
, value
), html
):
def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Return the first result of _get_elements_by_tag_and_attrib, or None.

    All arguments are forwarded unchanged, in the same order, to
    _get_elements_by_tag_and_attrib. When that call yields an empty
    (falsy) result, None is returned instead of raising.
    """
    matches = _get_elements_by_tag_and_attrib(
        html, tag, attribute, value, escape_value)
    # Guard clause: nothing matched, so there is no first element to hand back.
    if not matches:
        return None
    return matches[0]
52 class DubokuIE(InfoExtractor
):
54 IE_DESC
= 'www.duboku.io'
56 _VALID_URL
= r
'(?:https?://[^/]+\.duboku\.io/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
58 'url': 'https://w.duboku.io/vodplay/1575-1-1.html',
63 'title': 'contains:白色月光',
69 'episode': 'Episode 1',
72 'skip_download': 'm3u8 download',
75 'url': 'https://w.duboku.io/vodplay/1588-1-1.html',
80 'title': 'contains:第1集',
83 'episode': 'Episode 1',
89 'skip_download': 'm3u8 download',
93 _PLAYER_DATA_PATTERN
= r
'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'
95 def _real_extract(self
, url
):
96 video_id
= self
._match
_id
(url
)
97 temp
= video_id
.split('-')
102 webpage_url
= 'https://w.duboku.io/vodplay/%s.html' % video_id
103 webpage_html
= self
._download
_webpage
(webpage_url
, video_id
)
107 player_data
= self
._search
_regex
(
108 self
._PLAYER
_DATA
_PATTERN
, webpage_html
, 'player_data')
109 player_data
= self
._parse
_json
(player_data
, video_id
, js_to_json
)
113 temp
= get_elements_by_class('title', webpage_html
)
117 mobj
= re
.search(r
'<a\s+.*>(.*)</a>', html
)
119 href
= extract_attributes(mobj
.group(0)).get('href')
121 mobj1
= re
.search(r
'/(\d+)\.html', href
)
122 if mobj1
and mobj1
.group(1) == series_id
:
123 series_title
= clean_html(mobj
.group(0))
124 series_title
= re
.sub(r
'[\s\r\n\t]+', ' ', series_title
)
125 title
= clean_html(html
)
126 title
= re
.sub(r
'[\s\r\n\t]+', ' ', title
)
129 data_url
= player_data
.get('url')
131 raise ExtractorError('Cannot find url in player_data')
132 data_from
= player_data
.get('from')
134 # if it is an embedded iframe, maybe it's an external source
135 headers
= {'Referer': webpage_url}
136 if data_from
== 'iframe':
137 # use _type url_transparent to retain the meaningful details
140 '_type': 'url_transparent',
141 'url': smuggle_url(data_url
, {'http_headers': headers}
),
144 'series': series_title
,
145 'season_number': int_or_none(season_id
),
146 'season_id': season_id
,
147 'episode_number': int_or_none(episode_id
),
148 'episode_id': episode_id
,
151 formats
= self
._extract
_m
3u8_formats
(data_url
, video_id
, 'mp4', headers
=headers
)
156 'series': series_title
,
157 'season_number': int_or_none(season_id
),
158 'season_id': season_id
,
159 'episode_number': int_or_none(episode_id
),
160 'episode_id': episode_id
,
162 'http_headers': headers
166 class DubokuPlaylistIE(InfoExtractor
):
167 IE_NAME
= 'duboku:list'
168 IE_DESC
= 'www.duboku.io entire series'
170 _VALID_URL
= r
'(?:https?://[^/]+\.duboku\.io/voddetail/)(?P<id>[0-9]+)\.html.*'
172 'url': 'https://w.duboku.io/voddetail/1575.html',
174 'id': 'startswith:1575',
177 'playlist_count': 12,
179 'url': 'https://w.duboku.io/voddetail/1554.html',
181 'id': 'startswith:1554',
184 'playlist_mincount': 30,
187 def _real_extract(self
, url
):
188 mobj
= self
._match
_valid
_url
(url
)
190 raise ExtractorError('Invalid URL: %s' % url
)
191 series_id
= mobj
.group('id')
192 fragment
= compat_urlparse
.urlparse(url
).fragment
194 webpage_url
= 'https://w.duboku.io/voddetail/%s.html' % series_id
195 webpage_html
= self
._download
_webpage
(webpage_url
, series_id
)
199 title
= _get_element_by_tag_and_attrib(webpage_html
, 'h1', 'class', 'title')
200 title
= unescapeHTML(title
.group('content')) if title
else None
202 title
= self
._html
_search
_meta
('keywords', webpage_html
)
204 title
= _get_element_by_tag_and_attrib(webpage_html
, 'title')
205 title
= unescapeHTML(title
.group('content')) if title
else None
210 for div
in _get_elements_by_tag_and_attrib(
211 webpage_html
, attribute
='id', value
='playlist\\d+', escape_value
=False):
212 playlist_id
= div
.group('value')
214 for a
in _get_elements_by_tag_and_attrib(
215 div
.group('content'), 'a', 'href', value
='[^\'"]+?', escape_value
=False):
217 'href': unescapeHTML(a
.group('value')),
218 'title': unescapeHTML(a
.group('content'))
220 playlists
[playlist_id
] = playlist
222 # select the specified playlist if url fragment exists
226 playlist
= playlists
.get(fragment
)
227 playlist_id
= fragment
229 first
= next(iter(playlists
.items()), None)
231 (playlist_id
, playlist
) = first
233 raise ExtractorError(
234 'Cannot find %s' % fragment
if fragment
else 'Cannot extract playlist')
237 return self
.playlist_result([
239 compat_urlparse
.urljoin('https://w.duboku.io', x
['href']),
240 ie
=DubokuIE
.ie_key(), video_title
=x
.get('title'))
241 for x
in playlist
], series_id
+ '#' + playlist_id
, title
)