4 from .common
import InfoExtractor
class XimalayaBaseIE(InfoExtractor):
    """Common base class for the Ximalaya extractors defined in this module."""

    # Only 'CN' is listed.
    # NOTE(review): presumably read by InfoExtractor's geo-restriction
    # handling -- confirm against the base class.
    _GEO_COUNTRIES = ['CN']
class XimalayaIE(XimalayaBaseIE):
    """Extractor for individual Ximalaya sound pages."""

    # Sound page URLs on the bare, "www." or "m." host. Captures the numeric
    # uploader id as "uid" and the numeric sound id as "id".
    _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/sound/(?P<id>[0-9]+)'
    # Uploader page template, filled with (scheme, integer uploader id).
    _USER_URL_FORMAT = '%s://www.ximalaya.com/zhubo/%i/'
18 'url': 'http://www.ximalaya.com/61425525/sound/47740352/',
23 'uploader_id': 61425525,
24 'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/',
25 'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白',
26 'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。",
30 'url': r
're:^https?://.*\.jpg$',
33 'name': 'cover_url_142',
34 'url': r
're:^https?://.*\.jpg$',
39 'categories': ['renwen', '人文'],
46 'url': 'http://m.ximalaya.com/61425525/sound/47740352/',
51 'uploader_id': 61425525,
52 'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/',
53 'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白',
54 'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。",
58 'url': r
're:^https?://.*\.jpg$',
61 'name': 'cover_url_142',
62 'url': r
're:^https?://.*\.jpg$',
67 'categories': ['renwen', '人文'],
74 'url': 'https://www.ximalaya.com/11045267/sound/15705996/',
79 'uploader_id': 11045267,
80 'uploader_url': 'https://www.ximalaya.com/zhubo/11045267/',
81 'title': 'Lesson 1 Excuse me!',
82 'description': "contains:Listen to the tape then answer\xa0this question. Whose handbag is it?\n"
87 'url': r
're:^https?://.*\.jpg$',
90 'name': 'cover_url_142',
91 'url': r
're:^https?://.*\.jpg$',
96 'categories': ['train', '外语'],
def _real_extract(self, url):
    """Extract metadata, formats and thumbnails for one sound page.

    Downloads the sound page itself and the track JSON at
    ``<scheme>://m.ximalaya.com/tracks/<id>.json``, then assembles the
    result from both.

    Args:
        url: Sound page URL; the sound id is obtained via ``_match_id``.

    Returns:
        A dict with the keys ``id``, ``uploader``, ``uploader_id``,
        ``uploader_url``, ``title``, ``thumbnails``, ``description``,
        ``categories``, ``duration``, ``view_count``, ``like_count`` and
        ``formats``.

    Raises:
        KeyError: If the track JSON has no ``title`` entry.
    """
    is_m = 'm.ximalaya' in url
    # Reuse the scheme of the incoming URL for every follow-up request.
    scheme = 'https' if url.startswith('https') else 'http'

    audio_id = self._match_id(url)
    webpage = self._download_webpage(
        url, audio_id,
        note='Download sound page for %s' % audio_id,
        errnote='Unable to get sound page')

    audio_info_file = '%s://m.ximalaya.com/tracks/%s.json' % (scheme, audio_id)
    audio_info = self._download_json(
        audio_info_file, audio_id,
        'Downloading info json %s' % audio_info_file,
        'Unable to download info file')

    # One format per play path that is present and non-empty in the JSON.
    # NOTE(review): SOURCE shows only the 'url' entry of this dict;
    # 'format_id' is assumed from the otherwise-unused bps label -- confirm.
    formats = [
        {'format_id': bps, 'url': audio_info[k]}
        for bps, k in (('24k', 'play_path_32'), ('64k', 'play_path_64'))
        if audio_info.get(k)
    ]

    thumbnails = []
    for k, cover_url in audio_info.items():
        # Cover picture keys look like 'cover_url', 'cover_url_142'.
        if not k.startswith('cover_url'):
            continue
        thumbnail = {'name': k, 'url': cover_url}
        if k == 'cover_url_142':
            thumbnail['width'] = 180
            thumbnail['height'] = 180
        thumbnails.append(thumbnail)

    audio_uploader_id = audio_info.get('uid')

    # NOTE(review): is_m is otherwise unused, so it is assumed to choose
    # between the <section class="content..."> and <div class="rich_intro...">
    # markup -- confirm.
    if is_m:
        audio_description = self._html_search_regex(
            r'(?s)<section\s+class=["\']content[^>]+>(.+?)</section>',
            webpage, 'audio_description', fatal=False)
    else:
        audio_description = self._html_search_regex(
            r'(?s)<div\s+class=["\']rich_intro[^>]*>(.+?</article>)',
            webpage, 'audio_description', fatal=False)

    if not audio_description:
        # Nothing usable in the page: fall back to the rich_intro endpoint.
        audio_description_file = '%s://www.ximalaya.com/sounds/%s/rich_intro' % (scheme, audio_id)
        # NOTE(review): fatal=False is assumed because the result is checked
        # for falsiness right below -- confirm.
        audio_description = self._download_webpage(
            audio_description_file, audio_id,
            note='Downloading description file %s' % audio_description_file,
            errnote='Unable to download descrip file',
            fatal=False)
        audio_description = audio_description.strip() if audio_description else None

    # NOTE(review): the 'id' and 'formats' entries are assumed; neither
    # appears in the visible fragment of the result dict -- confirm.
    return {
        'id': audio_id,
        'uploader': audio_info.get('nickname'),
        'uploader_id': audio_uploader_id,
        'uploader_url': self._USER_URL_FORMAT % (scheme, audio_uploader_id) if audio_uploader_id else None,
        'title': audio_info['title'],
        'thumbnails': thumbnails,
        'description': audio_description,
        'categories': list(filter(None, (audio_info.get('category_name'), audio_info.get('category_title')))),
        'duration': audio_info.get('duration'),
        'view_count': audio_info.get('play_count'),
        'like_count': audio_info.get('favorites_count'),
        'formats': formats,
    }
class XimalayaAlbumIE(XimalayaBaseIE):
    """Extractor for Ximalaya album pages; the result is a playlist."""

    IE_NAME = 'ximalaya:album'
    IE_DESC = '喜马拉雅FM 专辑'
    # Album page URLs on the bare, "www." or "m." host. Captures the numeric
    # uploader id as "uid" and the numeric album id as "id".
    _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/album/(?P<id>[0-9]+)'
    # Album page on the www host, filled with (scheme, uid, album id).
    _TEMPLATE_URL = '%s://www.ximalaya.com/%s/album/%s/'
    # Turns a site-relative path into a full URL, filled with (scheme, path).
    _BASE_URL_TEMPL = '%s://www.ximalaya.com%s'
    # Anchor tag of one sound link; the single %s is filled with the uploader
    # id. Captures the relative "url", the sound "id" and the "title".
    _LIST_VIDEO_RE = r'<a[^>]+?href="(?P<url>/%s/sound/(?P<id>\d+)/?)"[^>]+?title="(?P<title>[^>]+)">'
178 'url': 'http://www.ximalaya.com/61425525/album/5534601/',
180 'title': '唐诗三百首(含赏析)',
183 'playlist_count': 312,
185 'url': 'http://m.ximalaya.com/61425525/album/5534601',
187 'title': '唐诗三百首(含赏析)',
190 'playlist_count': 312,
def _real_extract(self, url):
    """Build the playlist result for an album URL.

    The scheme of *url* is stored on ``self.scheme`` as well as used
    locally. The album page is always requested through
    ``_TEMPLATE_URL``, regardless of which host *url* pointed at.

    Args:
        url: Album page URL matching ``_VALID_URL``.

    Returns:
        Whatever ``playlist_result`` returns for the entries produced by
        ``_entries``, the album id and the (possibly missing) title.
    """
    if url.startswith('https'):
        protocol = 'https'
    else:
        protocol = 'http'
    self.scheme = protocol

    uid, playlist_id = self._match_valid_url(url).group('uid', 'id')

    album_url = self._TEMPLATE_URL % (protocol, uid, playlist_id)
    webpage = self._download_webpage(
        album_url, playlist_id,
        note='Download album page for %s' % playlist_id,
        errnote='Unable to get album info')

    # The title lookup is non-fatal, so a missing heading does not abort.
    title = self._html_search_regex(
        r'detailContent_title[^>]*><h1(?:[^>]+)?>([^<]+)</h1>',
        webpage, 'title', fatal=False)

    entries = self._entries(webpage, playlist_id, uid)
    return self.playlist_result(entries, playlist_id, title)
def _entries(self, page, playlist_id, uid):
    """Yield the entries of an album, following its "next" page links.

    Args:
        page: HTML of the first album page.
        playlist_id: Album id, passed to ``_download_webpage`` for each
            follow-up page.
        uid: Uploader id, forwarded to ``_process_page``.

    Yields:
        Every entry ``_process_page`` produces, page by page, until a
        page has no ``rel="next"`` link.
    """
    # NOTE(review): the initial assignment, the yield and the loop exit are
    # not in the visible fragment; they are reconstructed from how `html`
    # and `next_url` are used -- confirm.
    html = page
    while True:
        for entry in self._process_page(html, uid):
            yield entry

        # Group 1 is the quote around href and group 3 the quote around rel;
        # each back-reference requires its matching closing quote.
        next_url = self._search_regex(
            r'<a\s+href=(["\'])(?P<more>[\S]+)\1[^>]+rel=(["\'])next\3',
            html, 'list_next_url', default=None, group='more')
        if not next_url:
            break

        next_full_url = self._BASE_URL_TEMPL % (self.scheme, next_url)
        html = self._download_webpage(next_full_url, playlist_id)
223 def _process_page(self, html, uid):
224 find_from = html.index('album_soundlist')
225 for mobj in re.finditer(self._LIST_VIDEO_RE % uid, html[find_from:]):
226 yield self.url_result(self._BASE_URL_TEMPL % (self.scheme, mobj.group('url')),