import base64
import re
import urllib.parse

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    clean_html,
    extract_attributes,
    get_elements_by_class,
    int_or_none,
    js_to_json,
    smuggle_url,
    unescapeHTML,
)


def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Return a list of regex matches (each with a 'content' group) for tags with the specified attribute in the passed HTML document"""

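    # Illustrative usage (a sketch, not part of the upstream file):
    #   _get_elements_by_tag_and_attrib(
    #       '<div id="playlist1"><a href="/v.html">EP1</a></div>',
    #       attribute='id', value='playlist1')
    # yields one match whose .group('content') is '<a href="/v.html">EP1</a>'.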
    if tag is None:
        tag = '[a-zA-Z0-9:._-]+'
    if attribute is None:
        attribute = ''
    else:
        attribute = rf'\s+(?P<attribute>{re.escape(attribute)})'
    if value is None:
        value = ''
    else:
        value = re.escape(value) if escape_value else value
        value = f'=[\'"]?(?P<value>{value})[\'"]?'

    retlist = []
    for m in re.finditer(rf'''(?xs)
        <(?P<tag>{tag})
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        {attribute}{value}
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''', html):
        retlist.append(m)

    return retlist


def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    retval = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
    return retval[0] if retval else None


class DubokuIE(InfoExtractor):
    IE_NAME = 'duboku'
    IE_DESC = 'www.duboku.io'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
    _TESTS = [{
        'url': 'https://w.duboku.io/vodplay/1575-1-1.html',
        'info_dict': {
            'id': '1575-1-1',
            'ext': 'mp4',
            'series': '白色月光',
            'title': 'contains:白色月光',
            'season_number': 1,
            'episode_number': 1,
            'season': 'Season 1',
            'episode_id': '1',
            'season_id': '1',
            'episode': 'Episode 1',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }, {
        'url': 'https://w.duboku.io/vodplay/1588-1-1.html',
        'info_dict': {
            'id': '1588-1-1',
            'ext': 'mp4',
            'series': '亲爱的自己',
            'title': 'contains:第1集',
            'season_number': 1,
            'episode_number': 1,
            'episode': 'Episode 1',
            'season': 'Season 1',
            'episode_id': '1',
            'season_id': '1',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }]

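    # The vodplay page is expected to assign the player configuration to a
    # `player_data` object inside an inline <script>; this pattern captures that
    # object literal so it can be parsed with js_to_json below.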
    _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'

    def _real_extract(self, url):
        video_id = self._match_id(url)
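        # the URL slug is expected to have the form <series>-<season>-<episode>, e.g. 1575-1-1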
        temp = video_id.split('-')
        series_id = temp[0]
        season_id = temp[1]
        episode_id = temp[2]

        webpage_url = f'https://w.duboku.io/vodplay/{video_id}.html'
        webpage_html = self._download_webpage(webpage_url, video_id)

        # extract video url

        player_data = self._search_regex(
            self._PLAYER_DATA_PATTERN, webpage_html, 'player_data')
        player_data = self._parse_json(player_data, video_id, js_to_json)

        # extract title

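        # scan elements with class "title" for one whose <a> links back to the
        # series page (/<series_id>.html): the link text becomes the series title
        # and the whole element the episode title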
        temp = get_elements_by_class('title', webpage_html)
        series_title = None
        title = None
        for html in temp:
            mobj = re.search(r'<a\s+.*>(.*)</a>', html)
            if mobj:
                href = extract_attributes(mobj.group(0)).get('href')
                if href:
                    mobj1 = re.search(r'/(\d+)\.html', href)
                    if mobj1 and mobj1.group(1) == series_id:
                        series_title = clean_html(mobj.group(0))
                        series_title = re.sub(r'[\s\r\n\t]+', ' ', series_title)
                        title = clean_html(html)
                        title = re.sub(r'[\s\r\n\t]+', ' ', title)
                        break

        data_url = player_data.get('url')
        if not data_url:
            raise ExtractorError('Cannot find url in player_data')
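        # 'encrypt' apparently selects how the URL is obfuscated:
        #   1 -> percent-encoded, 2 -> base64 of a percent-encoded URL, otherwise plain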
        player_encrypt = player_data.get('encrypt')
        if player_encrypt == 1:
            data_url = urllib.parse.unquote(data_url)
        elif player_encrypt == 2:
            data_url = urllib.parse.unquote(base64.b64decode(data_url).decode('ascii'))

        # if it is an embedded iframe, maybe it's an external source
        headers = {'Referer': webpage_url}
        if player_data.get('from') == 'iframe':
            # use _type url_transparent to retain the meaningful details
            # of the video.
            return {
                '_type': 'url_transparent',
                'url': smuggle_url(data_url, {'referer': webpage_url}),
                'id': video_id,
                'title': title,
                'series': series_title,
                'season_number': int_or_none(season_id),
                'season_id': season_id,
                'episode_number': int_or_none(episode_id),
                'episode_id': episode_id,
            }

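        # otherwise the URL is treated as an HLS manifest; the vodplay page is sent
        # as Referer both for the manifest fetch and, via http_headers, for the download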
        formats = self._extract_m3u8_formats(data_url, video_id, 'mp4', headers=headers)

        return {
            'id': video_id,
            'title': title,
            'series': series_title,
            'season_number': int_or_none(season_id),
            'season_id': season_id,
            'episode_number': int_or_none(episode_id),
            'episode_id': episode_id,
            'formats': formats,
            'http_headers': headers,
        }


class DubokuPlaylistIE(InfoExtractor):
    IE_NAME = 'duboku:list'
    IE_DESC = 'www.duboku.io entire series'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/voddetail/)(?P<id>[0-9]+)\.html.*'
    _TESTS = [{
        'url': 'https://w.duboku.io/voddetail/1575.html',
        'info_dict': {
            'id': 'startswith:1575',
            'title': '白色月光',
        },
        'playlist_count': 12,
    }, {
        'url': 'https://w.duboku.io/voddetail/1554.html',
        'info_dict': {
            'id': 'startswith:1554',
            'title': '以家人之名',
        },
        'playlist_mincount': 30,
    }]

    def _real_extract(self, url):
        mobj = self._match_valid_url(url)
        if mobj is None:
            raise ExtractorError(f'Invalid URL: {url}')
        series_id = mobj.group('id')
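        # a URL fragment (e.g. #playlist1) may name one of the playlist<N> divs
        # collected below; without a fragment the first playlist found is used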
        fragment = urllib.parse.urlparse(url).fragment

        webpage_url = f'https://w.duboku.io/voddetail/{series_id}.html'
        webpage_html = self._download_webpage(webpage_url, series_id)

        # extract title

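        # prefer the <h1 class="title"> heading, fall back to the "keywords" meta
        # tag, and finally to the page <title>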
        title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
        title = unescapeHTML(title.group('content')) if title else None
        if not title:
            title = self._html_search_meta('keywords', webpage_html)
        if not title:
            title = _get_element_by_tag_and_attrib(webpage_html, 'title')
            title = unescapeHTML(title.group('content')) if title else None

        # extract playlists

        playlists = {}
        for div in _get_elements_by_tag_and_attrib(
                webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
            playlist_id = div.group('value')
            playlist = []
            for a in _get_elements_by_tag_and_attrib(
                    div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False):
                playlist.append({
                    'href': unescapeHTML(a.group('value')),
                    'title': unescapeHTML(a.group('content')),
                })
            playlists[playlist_id] = playlist

        # select the specified playlist if url fragment exists
        playlist = None
        playlist_id = None
        if fragment:
            playlist = playlists.get(fragment)
            playlist_id = fragment
        else:
            first = next(iter(playlists.items()), None)
            if first:
                (playlist_id, playlist) = first
        if not playlist:
            raise ExtractorError(
                f'Cannot find {fragment}' if fragment else 'Cannot extract playlist')

        # return url results
        return self.playlist_result([
            self.url_result(
                urllib.parse.urljoin('https://w.duboku.io', x['href']),
                ie=DubokuIE.ie_key(), video_title=x.get('title'))
            for x in playlist], series_id + '#' + playlist_id, title)