import re

from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
    clean_html,
    extract_attributes,
    ExtractorError,
    get_elements_by_class,
    int_or_none,
    js_to_json,
    smuggle_url,
    unescapeHTML,
)


def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Return a list of re.Match objects for every tag in the passed HTML document that
    matches the given tag name and/or attribute (and, optionally, attribute value);
    each match exposes the named groups 'tag' and 'content', plus 'attribute' and
    'value' when those filters are supplied"""

    if tag is None:
        tag = '[a-zA-Z0-9:._-]+'
    if attribute is None:
        attribute = ''
    else:
        attribute = r'\s+(?P<attribute>%s)' % re.escape(attribute)
    if value is None:
        value = ''
    else:
        value = re.escape(value) if escape_value else value
        value = '=[\'"]?(?P<value>%s)[\'"]?' % value

    retlist = []
    for m in re.finditer(r'''(?xs)
        <(?P<tag>%s)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        %s%s
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (tag, attribute, value), html):
        retlist.append(m)

    return retlist


def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    retval = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
    return retval[0] if retval else None
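# Usage sketch on hypothetical markup (not taken from the real site): for
#   html = '<div id="playlist1"><a href="/vodplay/1-1-1.html">EP 1</a></div>'
# _get_element_by_tag_and_attrib(html, attribute='id', value='playlist\\d+', escape_value=False)
# returns a match whose group('value') is 'playlist1' and whose group('content') is the
# inner '<a ...>EP 1</a>' markup; DubokuPlaylistIE below uses the helpers in the same way.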


class DubokuIE(InfoExtractor):
    IE_NAME = 'duboku'
    IE_DESC = 'www.duboku.io'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
    _TESTS = [{
        'url': 'https://w.duboku.io/vodplay/1575-1-1.html',
        'info_dict': {
            'id': '1575-1-1',
            'ext': 'mp4',
            'series': '白色月光',
            'title': 'contains:白色月光',
            'season_number': 1,
            'episode_number': 1,
            'season': 'Season 1',
            'episode_id': '1',
            'season_id': '1',
            'episode': 'Episode 1',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }, {
        'url': 'https://w.duboku.io/vodplay/1588-1-1.html',
        'info_dict': {
            'id': '1588-1-1',
            'ext': 'mp4',
            'series': '亲爱的自己',
            'title': 'contains:第1集',
            'season_number': 1,
            'episode_number': 1,
            'episode': 'Episode 1',
            'season': 'Season 1',
            'episode_id': '1',
            'season_id': '1',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }]

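    # Matches the inline `player_data = { ... }` object embedded in a <script> block
    # on the play page; the keys are site-defined, and only 'url' (stream or iframe
    # URL) and 'from' (player/source type) are consumed below.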
    _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'

    def _real_extract(self, url):
        video_id = self._match_id(url)
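        # per _VALID_URL and the tests above, the id looks like '1575-1-1',
        # i.e. <series id>-<season number>-<episode number>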
        temp = video_id.split('-')
        series_id = temp[0]
        season_id = temp[1]
        episode_id = temp[2]

        webpage_url = 'https://w.duboku.io/vodplay/%s.html' % video_id
        webpage_html = self._download_webpage(webpage_url, video_id)

        # extract video url

        player_data = self._search_regex(
            self._PLAYER_DATA_PATTERN, webpage_html, 'player_data')
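        # the embedded object is JavaScript rather than strict JSON, hence the
        # js_to_json transform before parsing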
        player_data = self._parse_json(player_data, video_id, js_to_json)

        # extract title

        temp = get_elements_by_class('title', webpage_html)
        series_title = None
        title = None
        for html in temp:
            mobj = re.search(r'<a\s+.*>(.*)</a>', html)
            if mobj:
                href = extract_attributes(mobj.group(0)).get('href')
                if href:
                    mobj1 = re.search(r'/(\d+)\.html', href)
                    if mobj1 and mobj1.group(1) == series_id:
                        series_title = clean_html(mobj.group(0))
                        series_title = re.sub(r'[\s\r\n\t]+', ' ', series_title)
                        title = clean_html(html)
                        title = re.sub(r'[\s\r\n\t]+', ' ', title)
                        break

        data_url = player_data.get('url')
        if not data_url:
            raise ExtractorError('Cannot find url in player_data')
        data_from = player_data.get('from')

        # if it is an embedded iframe, maybe it's an external source
        headers = {'Referer': webpage_url}
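        # (assumption) the stream host seems to reject requests that lack the play
        # page as Referer, so the header is passed to the HLS downloader below and
        # smuggled along with any external iframe URL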
        if data_from == 'iframe':
            # use _type url_transparent to retain the meaningful details
            # of the video.
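            # smuggle_url tucks the Referer header into the URL so that whichever
            # extractor ends up handling data_url can reuse it (assuming it honours
            # smuggled 'http_headers')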
            return {
                '_type': 'url_transparent',
                'url': smuggle_url(data_url, {'http_headers': headers}),
                'id': video_id,
                'title': title,
                'series': series_title,
                'season_number': int_or_none(season_id),
                'season_id': season_id,
                'episode_number': int_or_none(episode_id),
                'episode_id': episode_id,
            }

        formats = self._extract_m3u8_formats(data_url, video_id, 'mp4', headers=headers)

        return {
            'id': video_id,
            'title': title,
            'series': series_title,
            'season_number': int_or_none(season_id),
            'season_id': season_id,
            'episode_number': int_or_none(episode_id),
            'episode_id': episode_id,
            'formats': formats,
            'http_headers': headers
        }


class DubokuPlaylistIE(InfoExtractor):
    IE_NAME = 'duboku:list'
    IE_DESC = 'www.duboku.io entire series'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/voddetail/)(?P<id>[0-9]+)\.html.*'
    _TESTS = [{
        'url': 'https://w.duboku.io/voddetail/1575.html',
        'info_dict': {
            'id': 'startswith:1575',
            'title': '白色月光',
        },
        'playlist_count': 12,
    }, {
        'url': 'https://w.duboku.io/voddetail/1554.html',
        'info_dict': {
            'id': 'startswith:1554',
            'title': '以家人之名',
        },
        'playlist_mincount': 30,
    }]

    def _real_extract(self, url):
        mobj = self._match_valid_url(url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        series_id = mobj.group('id')
        fragment = compat_urlparse.urlparse(url).fragment

        webpage_url = 'https://w.duboku.io/voddetail/%s.html' % series_id
        webpage_html = self._download_webpage(webpage_url, series_id)

        # extract title

        title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
        title = unescapeHTML(title.group('content')) if title else None
        if not title:
            title = self._html_search_meta('keywords', webpage_html)
        if not title:
            title = _get_element_by_tag_and_attrib(webpage_html, 'title')
            title = unescapeHTML(title.group('content')) if title else None

        # extract playlists

        playlists = {}
        for div in _get_elements_by_tag_and_attrib(
                webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
            playlist_id = div.group('value')
            playlist = []
            for a in _get_elements_by_tag_and_attrib(
                    div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False):
                playlist.append({
                    'href': unescapeHTML(a.group('value')),
                    'title': unescapeHTML(a.group('content'))
                })
            playlists[playlist_id] = playlist

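        # `playlists` now maps each tab's DOM id (e.g. 'playlist1') to its list of
        # episode links as {'href': ..., 'title': ...} dicts; a URL fragment such as
        # '#playlist1' (illustrative value) selects one of these tabs below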
        # select the specified playlist if url fragment exists
        playlist = None
        playlist_id = None
        if fragment:
            playlist = playlists.get(fragment)
            playlist_id = fragment
        else:
            first = next(iter(playlists.items()), None)
            if first:
                (playlist_id, playlist) = first
        if not playlist:
            raise ExtractorError(
                'Cannot find %s' % fragment if fragment else 'Cannot extract playlist')

        # return url results
        return self.playlist_result([
            self.url_result(
                compat_urlparse.urljoin('https://w.duboku.io', x['href']),
                ie=DubokuIE.ie_key(), video_title=x.get('title'))
            for x in playlist], series_id + '#' + playlist_id, title)