]>
Commit | Line | Data |
---|---|---|
a90641fe | 1 | import itertools |
2 | import re | |
3 | ||
4 | from .common import InfoExtractor | |
5 | ||
6 | ||
class XimalayaBaseIE(InfoExtractor):
    """Shared base class for the Ximalaya extractors in this module."""

    # Country codes inherited by every subclass; presumably consumed by the
    # InfoExtractor geo-restriction handling -- confirm against the base class.
    _GEO_COUNTRIES = ['CN']
9 | ||
10 | ||
class XimalayaIE(XimalayaBaseIE):
    """Extractor for a single Ximalaya sound (audio track) page."""

    IE_NAME = 'ximalaya'
    IE_DESC = '喜马拉雅FM'
    _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/sound/(?P<id>[0-9]+)'
    # Filled with (scheme, numeric uploader id); note the %i conversion.
    _USER_URL_FORMAT = '%s://www.ximalaya.com/zhubo/%i/'
    _TESTS = [
        {
            'url': 'http://www.ximalaya.com/61425525/sound/47740352/',
            'info_dict': {
                'id': '47740352',
                'ext': 'm4a',
                'uploader': '小彬彬爱听书',
                'uploader_id': 61425525,
                'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/',
                'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白',
                'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。",
                'thumbnails': [
                    {
                        'name': 'cover_url',
                        'url': r're:^https?://.*\.jpg$',
                    },
                    {
                        'name': 'cover_url_142',
                        'url': r're:^https?://.*\.jpg$',
                        'width': 180,
                        'height': 180
                    }
                ],
                'categories': ['renwen', '人文'],
                'duration': 93,
                'view_count': int,
                'like_count': int,
            }
        },
        {
            'url': 'http://m.ximalaya.com/61425525/sound/47740352/',
            'info_dict': {
                'id': '47740352',
                'ext': 'm4a',
                'uploader': '小彬彬爱听书',
                'uploader_id': 61425525,
                'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/',
                'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白',
                'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。",
                'thumbnails': [
                    {
                        'name': 'cover_url',
                        'url': r're:^https?://.*\.jpg$',
                    },
                    {
                        'name': 'cover_url_142',
                        'url': r're:^https?://.*\.jpg$',
                        'width': 180,
                        'height': 180
                    }
                ],
                'categories': ['renwen', '人文'],
                'duration': 93,
                'view_count': int,
                'like_count': int,
            }
        },
        {
            'url': 'https://www.ximalaya.com/11045267/sound/15705996/',
            'info_dict': {
                'id': '15705996',
                'ext': 'm4a',
                'uploader': '李延隆老师',
                'uploader_id': 11045267,
                'uploader_url': 'https://www.ximalaya.com/zhubo/11045267/',
                'title': 'Lesson 1 Excuse me!',
                'description': "contains:Listen to the tape then answer\xa0this question. Whose handbag is it?\n"
                               "听录音,然后回答问题,这是谁的手袋?",
                'thumbnails': [
                    {
                        'name': 'cover_url',
                        'url': r're:^https?://.*\.jpg$',
                    },
                    {
                        'name': 'cover_url_142',
                        'url': r're:^https?://.*\.jpg$',
                        'width': 180,
                        'height': 180
                    }
                ],
                'categories': ['train', '外语'],
                'duration': 40,
                'view_count': int,
                'like_count': int,
            }
        },
    ]

    def _real_extract(self, url):
        """Build the info dict for one sound page.

        Downloads the sound page itself plus the ``/tracks/<id>.json``
        metadata file from the mobile host, then assembles formats,
        thumbnails and descriptive fields from them.

        :param url: a URL matching ``_VALID_URL``.
        :return: dict with id, uploader fields, title, thumbnails,
            description, categories, duration, counters and formats.
        :raises KeyError: if the metadata JSON carries no ``title`` key.
        """
        is_m = 'm.ximalaya' in url
        scheme = 'https' if url.startswith('https') else 'http'

        audio_id = self._match_id(url)
        webpage = self._download_webpage(url, audio_id,
                                         note='Download sound page for %s' % audio_id,
                                         errnote='Unable to get sound page')

        audio_info_file = '%s://m.ximalaya.com/tracks/%s.json' % (scheme, audio_id)
        audio_info = self._download_json(audio_info_file, audio_id,
                                         'Downloading info json %s' % audio_info_file,
                                         'Unable to download info file')

        # Only advertise a format when the JSON actually carries its play path.
        formats = [
            {'format_id': bps, 'url': audio_info[k]}
            for bps, k in (('24k', 'play_path_32'), ('64k', 'play_path_64'))
            if audio_info.get(k)
        ]

        thumbnails = []
        for k, cover_url in audio_info.items():
            # Cover picture keys look like 'cover_url', 'cover_url_142'.
            # Skip keys whose value is missing/empty: a thumbnail entry
            # without a URL is useless to downstream consumers.
            if not k.startswith('cover_url') or not cover_url:
                continue
            thumbnail = {'name': k, 'url': cover_url}
            if k == 'cover_url_142':
                thumbnail['width'] = 180
                thumbnail['height'] = 180
            thumbnails.append(thumbnail)

        audio_uploader_id = audio_info.get('uid')

        # The mobile and desktop pages keep the description in different markup.
        if is_m:
            audio_description = self._html_search_regex(r'(?s)<section\s+class=["\']content[^>]+>(.+?)</section>',
                                                        webpage, 'audio_description', fatal=False)
        else:
            audio_description = self._html_search_regex(r'(?s)<div\s+class=["\']rich_intro[^>]*>(.+?</article>)',
                                                        webpage, 'audio_description', fatal=False)

        if not audio_description:
            # Nothing found in the page: fall back to the dedicated
            # rich_intro endpoint (non-fatal, so the result may stay empty).
            audio_description_file = '%s://www.ximalaya.com/sounds/%s/rich_intro' % (scheme, audio_id)
            audio_description = self._download_webpage(audio_description_file, audio_id,
                                                       note='Downloading description file %s' % audio_description_file,
                                                       errnote='Unable to download descrip file',
                                                       fatal=False)
            audio_description = audio_description.strip() if audio_description else None

        return {
            'id': audio_id,
            'uploader': audio_info.get('nickname'),
            'uploader_id': audio_uploader_id,
            'uploader_url': self._USER_URL_FORMAT % (scheme, audio_uploader_id) if audio_uploader_id else None,
            'title': audio_info['title'],
            'thumbnails': thumbnails,
            'description': audio_description,
            'categories': list(filter(None, (audio_info.get('category_name'), audio_info.get('category_title')))),
            'duration': audio_info.get('duration'),
            'view_count': audio_info.get('play_count'),
            'like_count': audio_info.get('favorites_count'),
            'formats': formats,
        }
168 | ||
169 | ||
class XimalayaAlbumIE(XimalayaBaseIE):
    """Extractor for a Ximalaya album page, returned as a playlist of sounds."""

    IE_NAME = 'ximalaya:album'
    IE_DESC = '喜马拉雅FM 专辑'
    _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/album/(?P<id>[0-9]+)'
    # Filled with (scheme, uid, album id).
    _TEMPLATE_URL = '%s://www.ximalaya.com/%s/album/%s/'
    # Filled with (scheme, site-relative path).
    _BASE_URL_TEMPL = '%s://www.ximalaya.com%s'
    # The single %s is replaced with the uploader id before matching.
    _LIST_VIDEO_RE = r'<a[^>]+?href="(?P<url>/%s/sound/(?P<id>\d+)/?)"[^>]+?title="(?P<title>[^>]+)">'
    _TESTS = [{
        'url': 'http://www.ximalaya.com/61425525/album/5534601/',
        'info_dict': {
            'title': '唐诗三百首(含赏析)',
            'id': '5534601',
        },
        'playlist_count': 312,
    }, {
        'url': 'http://m.ximalaya.com/61425525/album/5534601',
        'info_dict': {
            'title': '唐诗三百首(含赏析)',
            'id': '5534601',
        },
        'playlist_count': 312,
    },
    ]

    def _real_extract(self, url):
        """Download the album page and return a playlist result for it.

        The URL scheme is stored on ``self.scheme`` because ``_entries`` and
        ``_process_page`` read it when building absolute URLs.
        """
        self.scheme = scheme = 'https' if url.startswith('https') else 'http'

        mobj = self._match_valid_url(url)
        uid, playlist_id = mobj.group('uid'), mobj.group('id')

        webpage = self._download_webpage(self._TEMPLATE_URL % (scheme, uid, playlist_id), playlist_id,
                                         note='Download album page for %s' % playlist_id,
                                         errnote='Unable to get album info')

        title = self._html_search_regex(r'detailContent_title[^>]*><h1(?:[^>]+)?>([^<]+)</h1>',
                                        webpage, 'title', fatal=False)

        return self.playlist_result(self._entries(webpage, playlist_id, uid), playlist_id, title)

    def _entries(self, page, playlist_id, uid):
        """Yield the entries of every album page, following rel="next" links.

        :param page: HTML of the first album page (already downloaded).
        :param playlist_id: album id, used as the download's video id.
        :param uid: uploader id used to recognise the album's sound links.
        """
        html = page
        while True:
            yield from self._process_page(html, uid)

            next_url = self._search_regex(r'<a\s+href=(["\'])(?P<more>[\S]+)\1[^>]+rel=(["\'])next\3',
                                          html, 'list_next_url', default=None, group='more')
            if not next_url:
                break

            next_full_url = self._BASE_URL_TEMPL % (self.scheme, next_url)
            html = self._download_webpage(next_full_url, playlist_id)

    def _process_page(self, html, uid):
        """Yield a url_result for each sound link found on one album page.

        Matching starts at the ``album_soundlist`` marker when it is present.
        If the marker is missing, the whole page is scanned rather than
        failing with ValueError as ``str.index`` would.
        """
        # str.find returns -1 when the marker is absent; clamp to 0 so the
        # slice below covers the whole page in that case.
        find_from = max(html.find('album_soundlist'), 0)
        for mobj in re.finditer(self._LIST_VIDEO_RE % uid, html[find_from:]):
            yield self.url_result(self._BASE_URL_TEMPL % (self.scheme, mobj.group('url')),
                                  XimalayaIE.ie_key(),
                                  mobj.group('id'),
                                  mobj.group('title'))