]>
jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/duboku.py
5 from .common
import InfoExtractor
6 from ..compat
import compat_urlparse
11 get_elements_by_class
,
19 def _get_elements_by_tag_and_attrib(html
, tag
=None, attribute
=None, value
=None, escape_value
=True):
20 """Return the content of the tag with the specified attribute in the passed HTML document"""
23 tag
= '[a-zA-Z0-9:._-]+'
27 attribute
= r
'\s+(?P<attribute>%s)' % re
.escape(attribute
)
31 value
= re
.escape(value
) if escape_value
else value
32 value
= '=[\'"]?(?P<value>%s)[\'"]?' % value
35 for m
in re
.finditer(r
'''(?xs)
37 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
39 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
43 ''' % (tag
, attribute
, value
), html
):
def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Return the first result of _get_elements_by_tag_and_attrib, or None.

    Every argument is forwarded unchanged, in the same order, to
    _get_elements_by_tag_and_attrib. When that call yields an empty
    (falsy) result, None is returned instead of indexing into it.
    """
    matches = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
    if not matches:
        # Nothing matched: report "not found" rather than raising on [0].
        return None
    return matches[0]
54 class DubokuIE(InfoExtractor
):
56 IE_DESC
= 'www.duboku.io'
58 _VALID_URL
= r
'(?:https?://[^/]+\.duboku\.io/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
60 'url': 'https://w.duboku.io/vodplay/1575-1-1.html',
65 'title': 'contains:白色月光',
71 'episode': 'Episode 1',
74 'skip_download': 'm3u8 download',
77 'url': 'https://w.duboku.io/vodplay/1588-1-1.html',
82 'title': 'contains:第1集',
85 'episode': 'Episode 1',
91 'skip_download': 'm3u8 download',
95 _PLAYER_DATA_PATTERN
= r
'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'
97 def _real_extract(self
, url
):
98 video_id
= self
._match
_id
(url
)
99 temp
= video_id
.split('-')
104 webpage_url
= 'https://w.duboku.io/vodplay/%s.html' % video_id
105 webpage_html
= self
._download
_webpage
(webpage_url
, video_id
)
109 player_data
= self
._search
_regex
(
110 self
._PLAYER
_DATA
_PATTERN
, webpage_html
, 'player_data')
111 player_data
= self
._parse
_json
(player_data
, video_id
, js_to_json
)
115 temp
= get_elements_by_class('title', webpage_html
)
119 mobj
= re
.search(r
'<a\s+.*>(.*)</a>', html
)
121 href
= extract_attributes(mobj
.group(0)).get('href')
123 mobj1
= re
.search(r
'/(\d+)\.html', href
)
124 if mobj1
and mobj1
.group(1) == series_id
:
125 series_title
= clean_html(mobj
.group(0))
126 series_title
= re
.sub(r
'[\s\r\n\t]+', ' ', series_title
)
127 title
= clean_html(html
)
128 title
= re
.sub(r
'[\s\r\n\t]+', ' ', title
)
131 data_url
= player_data
.get('url')
133 raise ExtractorError('Cannot find url in player_data')
134 player_encrypt
= player_data
.get('encrypt')
135 if player_encrypt
== 1:
136 data_url
= urllib
.parse
.unquote(data_url
)
137 elif player_encrypt
== 2:
138 data_url
= urllib
.parse
.unquote(base64
.b64decode(data_url
).decode('ascii'))
140 # if it is an embedded iframe, maybe it's an external source
141 headers
= {'Referer': webpage_url}
142 if player_data
.get('from') == 'iframe':
143 # use _type url_transparent to retain the meaningful details
146 '_type': 'url_transparent',
147 'url': smuggle_url(data_url
, {'referer': webpage_url}
),
150 'series': series_title
,
151 'season_number': int_or_none(season_id
),
152 'season_id': season_id
,
153 'episode_number': int_or_none(episode_id
),
154 'episode_id': episode_id
,
157 formats
= self
._extract
_m
3u8_formats
(data_url
, video_id
, 'mp4', headers
=headers
)
162 'series': series_title
,
163 'season_number': int_or_none(season_id
),
164 'season_id': season_id
,
165 'episode_number': int_or_none(episode_id
),
166 'episode_id': episode_id
,
168 'http_headers': headers
172 class DubokuPlaylistIE(InfoExtractor
):
173 IE_NAME
= 'duboku:list'
174 IE_DESC
= 'www.duboku.io entire series'
176 _VALID_URL
= r
'(?:https?://[^/]+\.duboku\.io/voddetail/)(?P<id>[0-9]+)\.html.*'
178 'url': 'https://w.duboku.io/voddetail/1575.html',
180 'id': 'startswith:1575',
183 'playlist_count': 12,
185 'url': 'https://w.duboku.io/voddetail/1554.html',
187 'id': 'startswith:1554',
190 'playlist_mincount': 30,
193 def _real_extract(self
, url
):
194 mobj
= self
._match
_valid
_url
(url
)
196 raise ExtractorError('Invalid URL: %s' % url
)
197 series_id
= mobj
.group('id')
198 fragment
= compat_urlparse
.urlparse(url
).fragment
200 webpage_url
= 'https://w.duboku.io/voddetail/%s.html' % series_id
201 webpage_html
= self
._download
_webpage
(webpage_url
, series_id
)
205 title
= _get_element_by_tag_and_attrib(webpage_html
, 'h1', 'class', 'title')
206 title
= unescapeHTML(title
.group('content')) if title
else None
208 title
= self
._html
_search
_meta
('keywords', webpage_html
)
210 title
= _get_element_by_tag_and_attrib(webpage_html
, 'title')
211 title
= unescapeHTML(title
.group('content')) if title
else None
216 for div
in _get_elements_by_tag_and_attrib(
217 webpage_html
, attribute
='id', value
='playlist\\d+', escape_value
=False):
218 playlist_id
= div
.group('value')
220 for a
in _get_elements_by_tag_and_attrib(
221 div
.group('content'), 'a', 'href', value
='[^\'"]+?', escape_value
=False):
223 'href': unescapeHTML(a
.group('value')),
224 'title': unescapeHTML(a
.group('content'))
226 playlists
[playlist_id
] = playlist
228 # select the specified playlist if url fragment exists
232 playlist
= playlists
.get(fragment
)
233 playlist_id
= fragment
235 first
= next(iter(playlists
.items()), None)
237 (playlist_id
, playlist
) = first
239 raise ExtractorError(
240 'Cannot find %s' % fragment
if fragment
else 'Cannot extract playlist')
243 return self
.playlist_result([
245 compat_urlparse
.urljoin('https://w.duboku.io', x
['href']),
246 ie
=DubokuIE
.ie_key(), video_title
=x
.get('title'))
247 for x
in playlist
], series_id
+ '#' + playlist_id
, title
)