]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/duboku.py
[ie/matchtv] Fix extractor (#10190)
[yt-dlp.git] / yt_dlp / extractor / duboku.py
CommitLineData
d3d4187d 1import base64
503406d4 2import re
d3d4187d 3import urllib.parse
503406d4 4
5from .common import InfoExtractor
7cc9d5b3 6from ..utils import (
e897bd82 7 ExtractorError,
7cc9d5b3 8 clean_html,
9 extract_attributes,
7cc9d5b3 10 get_elements_by_class,
11 int_or_none,
12 js_to_json,
13 smuggle_url,
14 unescapeHTML,
15)
503406d4 16
17
de4144a4 18def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
19 """Return the content of the tag with the specified attribute in the passed HTML document"""
20
21 if tag is None:
22 tag = '[a-zA-Z0-9:._-]+'
23 if attribute is None:
24 attribute = ''
25 else:
add96eb9 26 attribute = rf'\s+(?P<attribute>{re.escape(attribute)})'
de4144a4 27 if value is None:
28 value = ''
29 else:
30 value = re.escape(value) if escape_value else value
add96eb9 31 value = f'=[\'"]?(?P<value>{value})[\'"]?'
de4144a4 32
33 retlist = []
add96eb9 34 for m in re.finditer(rf'''(?xs)
35 <(?P<tag>{tag})
de4144a4 36 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
add96eb9 37 {attribute}{value}
de4144a4 38 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
39 \s*>
40 (?P<content>.*?)
41 </\1>
add96eb9 42 ''', html):
de4144a4 43 retlist.append(m)
44
45 return retlist
46
47
def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Return the first match from _get_elements_by_tag_and_attrib, or None if there is none."""
    matches = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
    if not matches:
        return None
    return matches[0]
51
52
class DubokuIE(InfoExtractor):
    """Extractor for a single duboku.io episode page (``/vodplay/<id>.html``)."""

    IE_NAME = 'duboku'
    IE_DESC = 'www.duboku.io'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
    _TESTS = [{
        'url': 'https://w.duboku.io/vodplay/1575-1-1.html',
        'info_dict': {
            'id': '1575-1-1',
            'ext': 'mp4',
            'series': '白色月光',
            'title': 'contains:白色月光',
            'season_number': 1,
            'episode_number': 1,
            'season': 'Season 1',
            'episode_id': '1',
            'season_id': '1',
            'episode': 'Episode 1',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }, {
        'url': 'https://w.duboku.io/vodplay/1588-1-1.html',
        'info_dict': {
            'id': '1588-1-1',
            'ext': 'mp4',
            'series': '亲爱的自己',
            'title': 'contains:第1集',
            'season_number': 1,
            'episode_number': 1,
            'episode': 'Episode 1',
            'season': 'Season 1',
            'episode_id': '1',
            'season_id': '1',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }]

    # Captures the JS object literal assigned to ``player_data`` in the page script.
    _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'

    def _extract_titles(self, webpage_html, series_id):
        """Return ``(title, series_title)`` scraped from the ``title``-class elements.

        The first element containing a link whose href ends in
        ``/<series_id>.html`` is used: the link text becomes the series title
        and the whole element's text becomes the episode title.  Runs of
        whitespace are collapsed to a single space.  ``(None, None)`` is
        returned when no such element exists.
        """
        for html in get_elements_by_class('title', webpage_html):
            mobj = re.search(r'<a\s+.*>(.*)</a>', html)
            if not mobj:
                continue
            href = extract_attributes(mobj.group(0)).get('href')
            if not href:
                continue
            series_mobj = re.search(r'/(\d+)\.html', href)
            if series_mobj and series_mobj.group(1) == series_id:
                series_title = re.sub(r'[\s\r\n\t]+', ' ', clean_html(mobj.group(0)))
                title = re.sub(r'[\s\r\n\t]+', ' ', clean_html(html))
                return title, series_title
        return None, None

    def _real_extract(self, url):
        """Build the info dict for one episode.

        Raises ExtractorError when ``player_data`` carries no ``url``.
        """
        video_id = self._match_id(url)
        # _VALID_URL only guarantees one '-', so an id such as '1575-1' is
        # accepted; pad the parts so a missing episode becomes None instead of
        # raising IndexError.
        series_id, season_id, episode_id = (video_id.split('-') + [None, None])[:3]

        webpage_url = f'https://w.duboku.io/vodplay/{video_id}.html'
        webpage_html = self._download_webpage(webpage_url, video_id)

        # extract video url
        player_data = self._search_regex(
            self._PLAYER_DATA_PATTERN, webpage_html, 'player_data')
        player_data = self._parse_json(player_data, video_id, js_to_json)

        # extract title
        title, series_title = self._extract_titles(webpage_html, series_id)

        data_url = player_data.get('url')
        if not data_url:
            raise ExtractorError('Cannot find url in player_data')

        # 'encrypt' selects how the url is encoded: 1 is percent-encoding,
        # 2 is base64 wrapped around percent-encoding; anything else is used as-is.
        player_encrypt = player_data.get('encrypt')
        if player_encrypt == 1:
            data_url = urllib.parse.unquote(data_url)
        elif player_encrypt == 2:
            data_url = urllib.parse.unquote(base64.b64decode(data_url).decode('ascii'))

        # Metadata shared by both kinds of result below.
        info = {
            'id': video_id,
            'title': title,
            'series': series_title,
            'season_number': int_or_none(season_id),
            'season_id': season_id,
            'episode_number': int_or_none(episode_id),
            'episode_id': episode_id,
        }

        # if it is an embedded iframe, maybe it's an external source
        if player_data.get('from') == 'iframe':
            # use _type url_transparent to retain the meaningful details
            # of the video.
            return {
                **info,
                '_type': 'url_transparent',
                'url': smuggle_url(data_url, {'referer': webpage_url}),
            }

        headers = {'Referer': webpage_url}
        formats = self._extract_m3u8_formats(data_url, video_id, 'mp4', headers=headers)

        return {
            **info,
            'formats': formats,
            'http_headers': headers,
        }
de4144a4 169
170
class DubokuPlaylistIE(InfoExtractor):
    """Extractor for a duboku.io series page (``/voddetail/<id>.html``) as a playlist."""

    IE_NAME = 'duboku:list'
    IE_DESC = 'www.duboku.io entire series'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/voddetail/)(?P<id>[0-9]+)\.html.*'
    _TESTS = [{
        'url': 'https://w.duboku.io/voddetail/1575.html',
        'info_dict': {
            'id': 'startswith:1575',
            'title': '白色月光',
        },
        'playlist_count': 12,
    }, {
        'url': 'https://w.duboku.io/voddetail/1554.html',
        'info_dict': {
            'id': 'startswith:1554',
            'title': '以家人之名',
        },
        'playlist_mincount': 30,
    }]

    def _real_extract(self, url):
        """Return a playlist result for one of the episode lists on the series page.

        A URL fragment, when present, names the list to use (it is looked up
        among the ``id`` values of the ``playlist<N>`` elements); otherwise the
        first list found on the page is used.  Raises ExtractorError when the
        URL does not match or no non-empty list can be selected.
        """
        match = self._match_valid_url(url)
        if match is None:
            raise ExtractorError(f'Invalid URL: {url}')
        series_id = match.group('id')
        fragment = urllib.parse.urlparse(url).fragment

        webpage_url = f'https://w.duboku.io/voddetail/{series_id}.html'
        webpage_html = self._download_webpage(webpage_url, series_id)

        # Title, in order of preference: <h1 class="title">, the 'keywords'
        # meta lookup, then the <title> element.
        heading = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
        title = unescapeHTML(heading.group('content')) if heading else None
        if not title:
            title = self._html_search_meta('keywords', webpage_html)
        if not title:
            page_title = _get_element_by_tag_and_attrib(webpage_html, 'title')
            title = unescapeHTML(page_title.group('content')) if page_title else None

        # Map each element id matching playlist<N> to the links it contains.
        playlists = {
            container.group('value'): [
                {
                    'href': unescapeHTML(link.group('value')),
                    'title': unescapeHTML(link.group('content')),
                }
                for link in _get_elements_by_tag_and_attrib(
                    container.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False)
            ]
            for container in _get_elements_by_tag_and_attrib(
                webpage_html, attribute='id', value='playlist\\d+', escape_value=False)
        }

        # select the specified playlist if url fragment exists
        if fragment:
            playlist_id, playlist = fragment, playlists.get(fragment)
        else:
            playlist_id, playlist = next(iter(playlists.items()), (None, None))
        if not playlist:
            raise ExtractorError(
                f'Cannot find {fragment}' if fragment else 'Cannot extract playlist')

        # return url results
        entries = [
            self.url_result(
                urllib.parse.urljoin('https://w.duboku.io', item['href']),
                ie=DubokuIE.ie_key(), video_title=item.get('title'))
            for item in playlist
        ]
        return self.playlist_result(entries, f'{series_id}#{playlist_id}', title)