]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/duboku.py
[spotify] Detect iframe embeds (#3430)
[yt-dlp.git] / yt_dlp / extractor / duboku.py
1 import re
2
3 from .common import InfoExtractor
4 from ..compat import compat_urlparse
5 from ..utils import (
6 clean_html,
7 extract_attributes,
8 ExtractorError,
9 get_elements_by_class,
10 int_or_none,
11 js_to_json,
12 smuggle_url,
13 unescapeHTML,
14 )
15
16
17 def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
18 """Return the content of the tag with the specified attribute in the passed HTML document"""
19
20 if tag is None:
21 tag = '[a-zA-Z0-9:._-]+'
22 if attribute is None:
23 attribute = ''
24 else:
25 attribute = r'\s+(?P<attribute>%s)' % re.escape(attribute)
26 if value is None:
27 value = ''
28 else:
29 value = re.escape(value) if escape_value else value
30 value = '=[\'"]?(?P<value>%s)[\'"]?' % value
31
32 retlist = []
33 for m in re.finditer(r'''(?xs)
34 <(?P<tag>%s)
35 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
36 %s%s
37 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
38 \s*>
39 (?P<content>.*?)
40 </\1>
41 ''' % (tag, attribute, value), html):
42 retlist.append(m)
43
44 return retlist
45
46
def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Return the first match from ``_get_elements_by_tag_and_attrib``, or None if there is none.

    Arguments are forwarded unchanged to ``_get_elements_by_tag_and_attrib``.
    """
    matches = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
    if not matches:
        return None
    return matches[0]
50
51
class DubokuIE(InfoExtractor):
    """Extractor for a single duboku.co episode page (``/vodplay/<id>.html``).

    The video id is a dash-separated string whose parts are read as
    series id, season id and episode id. The page embeds a ``player_data``
    JavaScript object; its ``url`` entry is either passed to the m3u8
    format extractor or, when ``from`` is ``'iframe'``, returned as a
    ``url_transparent`` result so the episode metadata is retained.
    """
    IE_NAME = 'duboku'
    IE_DESC = 'www.duboku.co'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
    _TESTS = [{
        'url': 'https://www.duboku.co/vodplay/1575-1-1.html',
        'info_dict': {
            'id': '1575-1-1',
            'ext': 'ts',
            'series': '白色月光',
            'title': 'contains:白色月光',
            'season_number': 1,
            'episode_number': 1,
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }, {
        'url': 'https://www.duboku.co/vodplay/1588-1-1.html',
        'info_dict': {
            'id': '1588-1-1',
            'ext': 'ts',
            'series': '亲爱的自己',
            'title': 'contains:预告片',
            'season_number': 1,
            'episode_number': 1,
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }]

    _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'

    @staticmethod
    def _extract_titles(webpage_html, series_id):
        """Return ``(title, series_title)`` scraped from the episode page.

        Scans elements with class ``title`` and uses the first one containing
        a link whose href ends in ``/<series_id>.html``. The link text becomes
        the series title and the whole element's text becomes the title.
        Returns ``(None, None)`` when no such element is found.
        """
        for html in get_elements_by_class('title', webpage_html):
            mobj = re.search(r'<a\s+.*>(.*)</a>', html)
            if not mobj:
                continue
            href = extract_attributes(mobj.group(0)).get('href')
            if not href:
                continue
            mobj1 = re.search(r'/(\d+)\.html', href)
            if mobj1 and mobj1.group(1) == series_id:
                # Collapse runs of whitespace left over from the markup
                series_title = re.sub(r'[\s\r\n\t]+', ' ', clean_html(mobj.group(0)))
                title = re.sub(r'[\s\r\n\t]+', ' ', clean_html(html))
                return title, series_title
        return None, None

    def _real_extract(self, url):
        """Extract the info dict for one episode.

        Raises ExtractorError when ``player_data`` carries no ``url``.
        """
        video_id = self._match_id(url)
        # _VALID_URL also accepts ids with fewer than three dash-separated
        # parts (e.g. "1575-1"); pad with None so that such ids do not raise
        # IndexError here. Extra parts beyond the third are ignored.
        series_id, season_id, episode_id = (video_id.split('-') + [None, None])[:3]

        webpage_url = 'https://www.duboku.co/vodplay/%s.html' % video_id
        webpage_html = self._download_webpage(webpage_url, video_id)

        # extract video url

        player_data = self._search_regex(
            self._PLAYER_DATA_PATTERN, webpage_html, 'player_data')
        player_data = self._parse_json(player_data, video_id, js_to_json)

        # extract title

        title, series_title = self._extract_titles(webpage_html, series_id)

        data_url = player_data.get('url')
        if not data_url:
            raise ExtractorError('Cannot find url in player_data')

        # Metadata shared by both kinds of result
        info = {
            'id': video_id,
            'title': title,
            'series': series_title,
            'season_number': int_or_none(season_id),
            'season_id': season_id,
            'episode_number': int_or_none(episode_id),
            'episode_id': episode_id,
        }

        # if it is an embedded iframe, maybe it's an external source
        if player_data.get('from') == 'iframe':
            # use _type url_transparent to retain the meaningful details
            # of the video.
            info.update({
                '_type': 'url_transparent',
                'url': smuggle_url(data_url, {'http_headers': {'Referer': webpage_url}}),
            })
            return info

        info.update({
            'formats': self._extract_m3u8_formats(data_url, video_id, 'mp4'),
            'http_headers': {'Referer': 'https://www.duboku.co/static/player/videojs.html'},
        })
        return info
155
156
class DubokuPlaylistIE(InfoExtractor):
    """Extractor for a duboku.co series page (``/voddetail/<series id>.html``).

    The page is scanned for elements whose ``id`` looks like ``playlist<N>``;
    the links inside each one form a playlist. A URL fragment
    (e.g. ``#playlist2``) selects a playlist by id, otherwise the first one
    found on the page is used.
    """
    IE_NAME = 'duboku:list'
    IE_DESC = 'www.duboku.co entire series'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P<id>[0-9]+)\.html.*'
    _TESTS = [{
        'url': 'https://www.duboku.co/voddetail/1575.html',
        'info_dict': {
            'id': 'startswith:1575',
            'title': '白色月光',
        },
        'playlist_count': 12,
    }, {
        'url': 'https://www.duboku.co/voddetail/1554.html',
        'info_dict': {
            'id': 'startswith:1554',
            'title': '以家人之名',
        },
        'playlist_mincount': 30,
    }, {
        'url': 'https://www.duboku.co/voddetail/1554.html#playlist2',
        'info_dict': {
            'id': '1554#playlist2',
            'title': '以家人之名',
        },
        'playlist_mincount': 27,
    }]

    def _real_extract(self, url):
        """Build a playlist result for the selected playlist of a series.

        Raises ExtractorError if the URL does not match, or if the selected
        playlist is missing or has no entries.
        """
        url_match = self._match_valid_url(url)
        if url_match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        series_id = url_match.group('id')
        fragment = compat_urlparse.urlparse(url).fragment

        webpage_url = 'https://www.duboku.co/voddetail/%s.html' % series_id
        webpage_html = self._download_webpage(webpage_url, series_id)

        # Title: try <h1 class="title">, then the keywords meta tag,
        # then the <title> element.
        heading = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
        title = unescapeHTML(heading.group('content')) if heading else None
        if not title:
            title = self._html_search_meta('keywords', webpage_html)
        if not title:
            title_tag = _get_element_by_tag_and_attrib(webpage_html, 'title')
            title = unescapeHTML(title_tag.group('content')) if title_tag else None

        # Map each playlist id to the list of links found inside its element
        playlists = {
            section.group('value'): [
                {
                    'href': unescapeHTML(link.group('value')),
                    'title': unescapeHTML(link.group('content')),
                }
                for link in _get_elements_by_tag_and_attrib(
                    section.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False)
            ]
            for section in _get_elements_by_tag_and_attrib(
                webpage_html, attribute='id', value='playlist\\d+', escape_value=False)
        }

        # A fragment picks a playlist by id; otherwise take the first one
        if fragment:
            playlist_id, playlist = fragment, playlists.get(fragment)
        else:
            playlist_id, playlist = next(iter(playlists.items()), (None, None))
        if not playlist:
            raise ExtractorError(
                'Cannot find %s' % fragment if fragment else 'Cannot extract playlist')

        # Each link becomes a URL result handled by the episode extractor
        entries = [
            self.url_result(
                compat_urlparse.urljoin('https://www.duboku.co', item['href']),
                ie=DubokuIE.ie_key(), video_title=item.get('title'))
            for item in playlist
        ]
        return self.playlist_result(entries, series_id + '#' + playlist_id, title)