]>
Commit | Line | Data |
---|---|---|
a90641fe | 1 | # coding: utf-8 |
2 | ||
3 | from __future__ import unicode_literals | |
4 | ||
5 | import itertools | |
6 | import re | |
7 | ||
8 | from .common import InfoExtractor | |
9 | ||
10 | ||
class XimalayaBaseIE(InfoExtractor):
    """Shared base class for the Ximalaya extractors defined in this module."""

    # Country codes associated with this site; presumably consumed by the
    # geo-restriction handling in InfoExtractor -- confirm in .common.
    _GEO_COUNTRIES = ['CN']
13 | ||
14 | ||
class XimalayaIE(XimalayaBaseIE):
    """Extractor for a single Ximalaya sound (audio track) page.

    Accepts both ``www.`` and ``m.`` URLs; the two page layouts differ only
    in how the description is scraped.
    """

    IE_NAME = 'ximalaya'
    IE_DESC = '喜马拉雅FM'
    _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/sound/(?P<id>[0-9]+)'
    _USER_URL_FORMAT = '%s://www.ximalaya.com/zhubo/%i/'
    _TESTS = [
        {
            'url': 'http://www.ximalaya.com/61425525/sound/47740352/',
            'info_dict': {
                'id': '47740352',
                'ext': 'm4a',
                'uploader': '小彬彬爱听书',
                'uploader_id': 61425525,
                'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/',
                'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白',
                'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。",
                'thumbnails': [
                    {
                        'name': 'cover_url',
                        'url': r're:^https?://.*\.jpg$',
                    },
                    {
                        'name': 'cover_url_142',
                        'url': r're:^https?://.*\.jpg$',
                        'width': 180,
                        'height': 180
                    }
                ],
                'categories': ['renwen', '人文'],
                'duration': 93,
                'view_count': int,
                'like_count': int,
            }
        },
        {
            'url': 'http://m.ximalaya.com/61425525/sound/47740352/',
            'info_dict': {
                'id': '47740352',
                'ext': 'm4a',
                'uploader': '小彬彬爱听书',
                'uploader_id': 61425525,
                'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/',
                'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白',
                'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。",
                'thumbnails': [
                    {
                        'name': 'cover_url',
                        'url': r're:^https?://.*\.jpg$',
                    },
                    {
                        'name': 'cover_url_142',
                        'url': r're:^https?://.*\.jpg$',
                        'width': 180,
                        'height': 180
                    }
                ],
                'categories': ['renwen', '人文'],
                'duration': 93,
                'view_count': int,
                'like_count': int,
            }
        },
        {
            'url': 'https://www.ximalaya.com/11045267/sound/15705996/',
            'info_dict': {
                'id': '15705996',
                'ext': 'm4a',
                'uploader': '李延隆老师',
                'uploader_id': 11045267,
                'uploader_url': 'https://www.ximalaya.com/zhubo/11045267/',
                'title': 'Lesson 1 Excuse me!',
                'description': "contains:Listen to the tape then answer\xa0this question. Whose handbag is it?\n"
                               "听录音,然后回答问题,这是谁的手袋?",
                'thumbnails': [
                    {
                        'name': 'cover_url',
                        'url': r're:^https?://.*\.jpg$',
                    },
                    {
                        'name': 'cover_url_142',
                        'url': r're:^https?://.*\.jpg$',
                        'width': 180,
                        'height': 180
                    }
                ],
                'categories': ['train', '外语'],
                'duration': 40,
                'view_count': int,
                'like_count': int,
            }
        },
    ]

    def _real_extract(self, url):
        """Build the info dict for the sound at ``url``.

        Metadata and stream URLs are read from the ``/tracks/<id>.json``
        endpoint. The sound page itself is only scraped for the description;
        if that fails, the ``/sounds/<id>/rich_intro`` endpoint is tried.
        """
        is_m = 'm.ximalaya' in url
        scheme = 'https' if url.startswith('https') else 'http'

        audio_id = self._match_id(url)
        webpage = self._download_webpage(
            url, audio_id,
            note='Download sound page for %s' % audio_id,
            errnote='Unable to get sound page')

        audio_info_file = '%s://m.ximalaya.com/tracks/%s.json' % (scheme, audio_id)
        audio_info = self._download_json(
            audio_info_file, audio_id,
            'Downloading info json %s' % audio_info_file,
            'Unable to download info file')

        # Only advertise the bitrates whose play path is actually present.
        formats = [
            {
                'format_id': bps,
                'url': audio_info[k],
            }
            for bps, k in (('24k', 'play_path_32'), ('64k', 'play_path_64'))
            if audio_info.get(k)
        ]

        thumbnails = []
        for k, cover_url in audio_info.items():
            # Cover picture keys look like 'cover_url', 'cover_url_142'.
            # Keys whose value is null/empty are skipped so that no thumbnail
            # entry is emitted without a usable URL.
            if not k.startswith('cover_url') or not cover_url:
                continue
            thumbnail = {'name': k, 'url': cover_url}
            if k == 'cover_url_142':
                thumbnail['width'] = 180
                thumbnail['height'] = 180
            thumbnails.append(thumbnail)

        audio_uploader_id = audio_info.get('uid')

        # The mobile and desktop pages wrap the description differently.
        if is_m:
            audio_description = self._html_search_regex(
                r'(?s)<section\s+class=["\']content[^>]+>(.+?)</section>',
                webpage, 'audio_description', fatal=False)
        else:
            audio_description = self._html_search_regex(
                r'(?s)<div\s+class=["\']rich_intro[^>]*>(.+?</article>)',
                webpage, 'audio_description', fatal=False)

        if not audio_description:
            # Fall back to the dedicated description endpoint (best effort).
            audio_description_file = '%s://www.ximalaya.com/sounds/%s/rich_intro' % (scheme, audio_id)
            audio_description = self._download_webpage(
                audio_description_file, audio_id,
                note='Downloading description file %s' % audio_description_file,
                errnote='Unable to download descrip file',
                fatal=False)
            audio_description = audio_description.strip() if audio_description else None

        return {
            'id': audio_id,
            'uploader': audio_info.get('nickname'),
            'uploader_id': audio_uploader_id,
            'uploader_url': self._USER_URL_FORMAT % (scheme, audio_uploader_id) if audio_uploader_id else None,
            'title': audio_info['title'],
            'thumbnails': thumbnails,
            'description': audio_description,
            'categories': list(filter(None, (audio_info.get('category_name'), audio_info.get('category_title')))),
            'duration': audio_info.get('duration'),
            'view_count': audio_info.get('play_count'),
            'like_count': audio_info.get('favorites_count'),
            'formats': formats,
        }
172 | ||
173 | ||
class XimalayaAlbumIE(XimalayaBaseIE):
    """Playlist extractor for a Ximalaya album page.

    Walks the album's paginated track list and yields one ``url_result``
    per sound, delegating the actual extraction to ``XimalayaIE``.
    """

    IE_NAME = 'ximalaya:album'
    IE_DESC = '喜马拉雅FM 专辑'
    _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/album/(?P<id>[0-9]+)'
    _TEMPLATE_URL = '%s://www.ximalaya.com/%s/album/%s/'
    _BASE_URL_TEMPL = '%s://www.ximalaya.com%s'
    _LIST_VIDEO_RE = r'<a[^>]+?href="(?P<url>/%s/sound/(?P<id>\d+)/?)"[^>]+?title="(?P<title>[^>]+)">'
    _TESTS = [{
        'url': 'http://www.ximalaya.com/61425525/album/5534601/',
        'info_dict': {
            'title': '唐诗三百首(含赏析)',
            'id': '5534601',
        },
        'playlist_count': 312,
    }, {
        'url': 'http://m.ximalaya.com/61425525/album/5534601',
        'info_dict': {
            'title': '唐诗三百首(含赏析)',
            'id': '5534601',
        },
        'playlist_count': 312,
    },
    ]

    def _real_extract(self, url):
        """Return a playlist result for the album at ``url``."""
        # Kept on the instance because _entries() and _process_page() read it
        # when building absolute URLs.
        self.scheme = scheme = 'https' if url.startswith('https') else 'http'

        mobj = self._match_valid_url(url)
        uid, playlist_id = mobj.group('uid'), mobj.group('id')

        # The album is always fetched from the www host, whatever host was given.
        webpage = self._download_webpage(
            self._TEMPLATE_URL % (scheme, uid, playlist_id), playlist_id,
            note='Download album page for %s' % playlist_id,
            errnote='Unable to get album info')

        title = self._html_search_regex(
            r'detailContent_title[^>]*><h1(?:[^>]+)?>([^<]+)</h1>',
            webpage, 'title', fatal=False)

        return self.playlist_result(self._entries(webpage, playlist_id, uid), playlist_id, title)

    def _entries(self, page, playlist_id, uid):
        """Yield the entries of every album page, following ``rel="next"`` links.

        ``page`` is the already-downloaded HTML of the first album page.
        """
        html = page
        for page_num in itertools.count(1):
            for entry in self._process_page(html, uid):
                yield entry

            next_url = self._search_regex(
                r'<a\s+href=(["\'])(?P<more>[\S]+)\1[^>]+rel=(["\'])next\3',
                html, 'list_next_url', default=None, group='more')
            if not next_url:
                break

            next_full_url = self._BASE_URL_TEMPL % (self.scheme, next_url)
            html = self._download_webpage(
                next_full_url, playlist_id,
                note='Downloading album page %d' % (page_num + 1))

    def _process_page(self, html, uid):
        """Yield a ``url_result`` for each sound link of ``uid`` found in ``html``.

        Only the part of the page from the ``album_soundlist`` marker onwards
        is scanned. If the marker is missing, a warning is reported and
        nothing is yielded, instead of failing with a bare ``ValueError``.
        """
        find_from = html.find('album_soundlist')
        if find_from == -1:
            self.report_warning('Unable to find the sound list in the album page')
            return
        for mobj in re.finditer(self._LIST_VIDEO_RE % uid, html[find_from:]):
            yield self.url_result(
                self._BASE_URL_TEMPL % (self.scheme, mobj.group('url')),
                XimalayaIE.ie_key(),
                mobj.group('id'),
                mobj.group('title'))