# yt_dlp/extractor/duboku.py
import base64
import re
import urllib.parse

from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
    ExtractorError,
    clean_html,
    extract_attributes,
    get_elements_by_class,
    int_or_none,
    js_to_json,
    smuggle_url,
    unescapeHTML,
)

def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Return the re.Match objects of the tags carrying the specified attribute
    (and, optionally, attribute value) in the passed HTML document"""

    if tag is None:
        tag = '[a-zA-Z0-9:._-]+'
    if attribute is None:
        attribute = ''
    else:
        attribute = r'\s+(?P<attribute>%s)' % re.escape(attribute)
    if value is None:
        value = ''
    else:
        value = re.escape(value) if escape_value else value
        value = '=[\'"]?(?P<value>%s)[\'"]?' % value

    retlist = []
    for m in re.finditer(r'''(?xs)
        <(?P<tag>%s)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        %s%s
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (tag, attribute, value), html):
        retlist.append(m)

    return retlist

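# A minimal, illustrative sketch of how the helper above behaves (the HTML
# snippet is made up for this example, not taken from the site):
#
#   >>> html = '<div id="playlist1"><a href="/vodplay/1-1-1.html">EP1</a></div>'
#   >>> matches = _get_elements_by_tag_and_attrib(
#   ...     html, attribute='id', value=r'playlist\d+', escape_value=False)
#   >>> matches[0].group('value'), matches[0].group('content')
#   ('playlist1', '<a href="/vodplay/1-1-1.html">EP1</a>')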

def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    retval = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
    return retval[0] if retval else None

class DubokuIE(InfoExtractor):
    IE_NAME = 'duboku'
    IE_DESC = 'www.duboku.io'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
    _TESTS = [{
        'url': 'https://w.duboku.io/vodplay/1575-1-1.html',
        'info_dict': {
            'id': '1575-1-1',
            'ext': 'mp4',
            'series': '白色月光',
            'title': 'contains:白色月光',
            'season_number': 1,
            'episode_number': 1,
            'season': 'Season 1',
            'episode_id': '1',
            'season_id': '1',
            'episode': 'Episode 1',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }, {
        'url': 'https://w.duboku.io/vodplay/1588-1-1.html',
        'info_dict': {
            'id': '1588-1-1',
            'ext': 'mp4',
            'series': '亲爱的自己',
            'title': 'contains:第1集',
            'season_number': 1,
            'episode_number': 1,
            'episode': 'Episode 1',
            'season': 'Season 1',
            'episode_id': '1',
            'season_id': '1',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }]

    _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'
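
    # The pattern above is meant to capture the inline `player_data` object
    # assigned in a <script> block on the playback page, roughly of this shape
    # (the field values below are invented for illustration, not taken from
    # the site):
    #
    #   <script type="text/javascript">
    #       var player_data={"url":"https:\/\/example.com\/index.m3u8","encrypt":0,"from":"dbm3u8"};
    #   </script>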

    def _real_extract(self, url):
        video_id = self._match_id(url)
        temp = video_id.split('-')
        series_id = temp[0]
        season_id = temp[1]
        episode_id = temp[2]
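        # e.g. video_id '1575-1-1' (from the first test above) splits into
        # series_id '1575', season_id '1', episode_id '1'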

        webpage_url = 'https://w.duboku.io/vodplay/%s.html' % video_id
        webpage_html = self._download_webpage(webpage_url, video_id)

        # extract video url

        player_data = self._search_regex(
            self._PLAYER_DATA_PATTERN, webpage_html, 'player_data')
        player_data = self._parse_json(player_data, video_id, js_to_json)

        # extract title

        temp = get_elements_by_class('title', webpage_html)
        series_title = None
        title = None
        for html in temp:
            mobj = re.search(r'<a\s+.*>(.*)</a>', html)
            if mobj:
                href = extract_attributes(mobj.group(0)).get('href')
                if href:
                    mobj1 = re.search(r'/(\d+)\.html', href)
                    if mobj1 and mobj1.group(1) == series_id:
                        series_title = clean_html(mobj.group(0))
                        series_title = re.sub(r'[\s\r\n\t]+', ' ', series_title)
                        title = clean_html(html)
                        title = re.sub(r'[\s\r\n\t]+', ' ', title)
                        break

        data_url = player_data.get('url')
        if not data_url:
            raise ExtractorError('Cannot find url in player_data')
        player_encrypt = player_data.get('encrypt')
        if player_encrypt == 1:
            data_url = urllib.parse.unquote(data_url)
        elif player_encrypt == 2:
            data_url = urllib.parse.unquote(base64.b64decode(data_url).decode('ascii'))
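
        # Worked example (made-up URL) of the `encrypt == 2` branch above:
        # base64.b64decode('aHR0cHMlM0ElMkYlMkZleGFtcGxlLmNvbSUyRmluZGV4Lm0zdTg=')
        # yields b'https%3A%2F%2Fexample.com%2Findex.m3u8', which unquotes to
        # 'https://example.com/index.m3u8'.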

        # if it is an embedded iframe, maybe it's an external source
        headers = {'Referer': webpage_url}
        if player_data.get('from') == 'iframe':
            # use _type url_transparent to retain the meaningful details
            # of the video.
            return {
                '_type': 'url_transparent',
                'url': smuggle_url(data_url, {'referer': webpage_url}),
                'id': video_id,
                'title': title,
                'series': series_title,
                'season_number': int_or_none(season_id),
                'season_id': season_id,
                'episode_number': int_or_none(episode_id),
                'episode_id': episode_id,
            }

        formats = self._extract_m3u8_formats(data_url, video_id, 'mp4', headers=headers)

        return {
            'id': video_id,
            'title': title,
            'series': series_title,
            'season_number': int_or_none(season_id),
            'season_id': season_id,
            'episode_number': int_or_none(episode_id),
            'episode_id': episode_id,
            'formats': formats,
            'http_headers': headers,
        }

class DubokuPlaylistIE(InfoExtractor):
    IE_NAME = 'duboku:list'
    IE_DESC = 'www.duboku.io entire series'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/voddetail/)(?P<id>[0-9]+)\.html.*'
    _TESTS = [{
        'url': 'https://w.duboku.io/voddetail/1575.html',
        'info_dict': {
            'id': 'startswith:1575',
            'title': '白色月光',
        },
        'playlist_count': 12,
    }, {
        'url': 'https://w.duboku.io/voddetail/1554.html',
        'info_dict': {
            'id': 'startswith:1554',
            'title': '以家人之名',
        },
        'playlist_mincount': 30,
    }]

    def _real_extract(self, url):
        mobj = self._match_valid_url(url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        series_id = mobj.group('id')
        fragment = compat_urlparse.urlparse(url).fragment
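        # An optional URL fragment (e.g. `...voddetail/1575.html#playlist1`; the
        # fragment name here is illustrative) selects one of the `playlist<N>`
        # tabs extracted below; without it the first tab is used.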

        webpage_url = 'https://w.duboku.io/voddetail/%s.html' % series_id
        webpage_html = self._download_webpage(webpage_url, series_id)

        # extract title

        title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
        title = unescapeHTML(title.group('content')) if title else None
        if not title:
            title = self._html_search_meta('keywords', webpage_html)
        if not title:
            title = _get_element_by_tag_and_attrib(webpage_html, 'title')
            title = unescapeHTML(title.group('content')) if title else None

        # extract playlists

        playlists = {}
        for div in _get_elements_by_tag_and_attrib(
                webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
            playlist_id = div.group('value')
            playlist = []
            for a in _get_elements_by_tag_and_attrib(
                    div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False):
                playlist.append({
                    'href': unescapeHTML(a.group('value')),
                    'title': unescapeHTML(a.group('content')),
                })
            playlists[playlist_id] = playlist

        # select the specified playlist if url fragment exists
        playlist = None
        playlist_id = None
        if fragment:
            playlist = playlists.get(fragment)
            playlist_id = fragment
        else:
            first = next(iter(playlists.items()), None)
            if first:
                (playlist_id, playlist) = first
        if not playlist:
            raise ExtractorError(
                'Cannot find %s' % fragment if fragment else 'Cannot extract playlist')

        # return url results
        return self.playlist_result([
            self.url_result(
                compat_urlparse.urljoin('https://w.duboku.io', x['href']),
                ie=DubokuIE.ie_key(), video_title=x.get('title'))
            for x in playlist], series_id + '#' + playlist_id, title)