]>
Commit | Line | Data |
---|---|---|
1 | # encoding: utf-8 | |
2 | from __future__ import unicode_literals | |
3 | ||
4 | import re | |
5 | import itertools | |
6 | ||
7 | from .common import InfoExtractor | |
8 | from ..compat import ( | |
9 | compat_str, | |
10 | compat_urlparse, | |
11 | compat_urllib_parse, | |
12 | ) | |
13 | from ..utils import ( | |
14 | ExtractorError, | |
15 | int_or_none, | |
16 | unified_strdate, | |
17 | ) | |
18 | ||
19 | ||
class SoundcloudIE(InfoExtractor):
    """Information extractor for single soundcloud.com tracks.

    Three URL shapes are accepted (see ``_VALID_URL``):

    * a regular track page ``soundcloud.com/<uploader>/<title>[/<token>]``,
      which is turned into track metadata through the ``resolve.json``
      API endpoint;
    * a direct API URL ``api.soundcloud.com/tracks/<id>[?secret_token=...]``;
    * an embedded player URL carrying the real track URL in its ``url``
      query parameter, which is simply redirected to that URL.

    The stream URLs themselves are fetched from the ``i1/tracks/<id>/streams``
    API endpoint.
    """

    _VALID_URL = r'''(?x)^(?:https?://)?
                    (?:(?:(?:www\.|m\.)?soundcloud\.com/
                            (?P<uploader>[\w\d-]+)/
                            (?!(?:tracks|sets(?:/[^/?#]+)?|reposts|likes|spotlight)/?(?:$|[?#]))
                            (?P<title>[\w\d-]+)/?
                            (?P<token>[^?]+?)?(?:[?].*)?$)
                       |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)
                          (?:/?\?secret_token=(?P<secret_token>[^&]+))?)
                       |(?P<player>(?:w|player|p.)\.soundcloud\.com/player/?.*?url=.*)
                    )
                    '''
    IE_NAME = 'soundcloud'
    _TESTS = [
        {
            'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
            'md5': 'ebef0a451b909710ed1d7787dddbf0d7',
            'info_dict': {
                'id': '62986583',
                'ext': 'mp3',
                'upload_date': '20121011',
                'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d',
                'uploader': 'E.T. ExTerrestrial Music',
                'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1',
                'duration': 143,
            }
        },
        # not streamable song
        {
            'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
            'info_dict': {
                'id': '47127627',
                'ext': 'mp3',
                'title': 'Goldrushed',
                'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
                'uploader': 'The Royal Concept',
                'upload_date': '20120521',
                'duration': 227,
            },
            'params': {
                # rtmp
                'skip_download': True,
            },
        },
        # private link
        {
            'url': 'https://soundcloud.com/jaimemf/youtube-dl-test-video-a-y-baw/s-8Pjrp',
            'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
            'info_dict': {
                'id': '123998367',
                'ext': 'mp3',
                'title': 'Youtube - Dl Test Video \'\' Ä↭',
                'uploader': 'jaimeMF',
                'description': 'test chars: \"\'/\\ä↭',
                'upload_date': '20131209',
                'duration': 9,
            },
        },
        # private link (alt format)
        {
            'url': 'https://api.soundcloud.com/tracks/123998367?secret_token=s-8Pjrp',
            'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
            'info_dict': {
                'id': '123998367',
                'ext': 'mp3',
                'title': 'Youtube - Dl Test Video \'\' Ä↭',
                'uploader': 'jaimeMF',
                'description': 'test chars: \"\'/\\ä↭',
                'upload_date': '20131209',
                'duration': 9,
            },
        },
        # downloadable song
        {
            'url': 'https://soundcloud.com/oddsamples/bus-brakes',
            'md5': '7624f2351f8a3b2e7cd51522496e7631',
            'info_dict': {
                'id': '128590877',
                'ext': 'mp3',
                'title': 'Bus Brakes',
                'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66',
                'uploader': 'oddsamples',
                'upload_date': '20140109',
                'duration': 17,
            },
        },
    ]

    # API keys: the regular one is used for metadata/download URLs, the
    # "iPhone" one only for the streams endpoint.
    _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'
    _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'

    def report_resolve(self, video_id):
        """Print a progress line saying that ``video_id`` is being resolved."""
        self.to_screen('%s: Resolving id' % video_id)

    @classmethod
    def _resolv_url(cls, url):
        """Return the ``resolve.json`` API URL that looks up ``url``.

        NOTE(review): ``url`` is appended to the query string as-is, without
        percent-encoding.
        """
        return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID

    def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None):
        """Turn a track JSON object into an info dict with its formats.

        :param info: track metadata dict as returned by the API; must contain
            ``id``, ``user.username``, ``created_at`` and ``title``.
        :param full_title: label used in progress messages; defaults to the
            track id.
        :param quiet: when true, the "extracting" progress line is skipped.
        :param secret_token: token for private tracks, forwarded to the
            streams endpoint when given.
        :returns: dict with the track metadata and a ``formats`` list.
        """
        track_id = compat_str(info['id'])
        name = full_title or track_id
        # The original condition was inverted and only reported when quiet.
        if not quiet:
            self.report_extraction(name)

        # Artwork is optional; ask for the 500x500 rendition when present.
        thumbnail = info.get('artwork_url')
        if thumbnail:
            thumbnail = thumbnail.replace('-large', '-t500x500')
        ext = 'mp3'
        result = {
            'id': track_id,
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title': info['title'],
            'description': info.get('description'),
            'thumbnail': thumbnail,
            # second argument is a scale divisor (presumably ms -> s)
            'duration': int_or_none(info.get('duration'), 1000),
            'webpage_url': info.get('permalink_url'),
        }
        formats = []
        if info.get('downloadable', False):
            # We can build a direct link to the song
            format_url = (
                'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(
                    track_id, self._CLIENT_ID))
            formats.append({
                'format_id': 'download',
                'ext': info.get('original_format', 'mp3'),
                'url': format_url,
                'vcodec': 'none',
                'preference': 10,
            })

        # We have to retrieve the url
        streams_url = (
            'http://api.soundcloud.com/i1/tracks/{0}/streams?'
            'client_id={1}'.format(track_id, self._IPHONE_CLIENT_ID))
        # Only send the token when there is one; otherwise the literal
        # string "None" would end up in the query.
        if secret_token:
            streams_url += '&secret_token={0}'.format(secret_token)
        format_dict = self._download_json(
            streams_url,
            track_id, 'Downloading track url')

        for key, stream_url in format_dict.items():
            if key.startswith('http'):
                formats.append({
                    'format_id': key,
                    'ext': ext,
                    'url': stream_url,
                    'vcodec': 'none',
                    'protocol': 'http',
                })
            elif key.startswith('rtmp'):
                # The url doesn't have an rtmp app, we have to extract the playpath
                rtmp_url, path = stream_url.split('mp3:', 1)
                formats.append({
                    'format_id': key,
                    'url': rtmp_url,
                    'play_path': 'mp3:' + path,
                    'ext': 'flv',
                    'vcodec': 'none',
                    'protocol': 'rtmp',
                })

        if not formats:
            # We fallback to the stream_url in the original info, this
            # cannot be always used, sometimes it can give an HTTP 404 error
            formats.append({
                'format_id': 'fallback',
                'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID,
                'ext': ext,
                'vcodec': 'none',
            })

        self._check_formats(formats, track_id)
        self._sort_formats(formats)
        result['formats'] = formats

        return result

    def _real_extract(self, url):
        """Extract one track from a page, API or player URL.

        :raises ExtractorError: if ``url`` does not match ``_VALID_URL``.
        :returns: the track info dict, or a URL redirect result for player
            URLs.
        """
        mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        track_id = mobj.group('track_id')
        token = None
        if track_id is not None:
            # Direct API URL: the track id is already known.
            info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID
            full_title = track_id
            token = mobj.group('secret_token')
            if token:
                info_json_url += "&secret_token=" + token
        elif mobj.group('player'):
            # Embedded player: hand the wrapped track URL back to the caller.
            query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
            real_url = query['url'][0]
            # If the token is in the query of the original url we have to
            # manually add it
            if 'secret_token' in query:
                real_url += '?secret_token=' + query['secret_token'][0]
            return self.url_result(real_url)
        else:
            # extract uploader (which is in the url)
            uploader = mobj.group('uploader')
            # extract simple title (uploader + slug of song title)
            slug_title = mobj.group('title')
            token = mobj.group('token')
            full_title = resolve_title = '%s/%s' % (uploader, slug_title)
            if token:
                resolve_title += '/%s' % token

            self.report_resolve(full_title)

            # Rebuild a canonical page URL and let the API resolve it.
            canonical_url = 'http://soundcloud.com/%s' % resolve_title
            info_json_url = self._resolv_url(canonical_url)
        info = self._download_json(info_json_url, full_title, 'Downloading info JSON')

        return self._extract_info_dict(info, full_title, secret_token=token)
247 | ||
248 | ||
class SoundcloudSetIE(SoundcloudIE):
    """Extractor for soundcloud.com set (playlist) pages.

    The set page URL is resolved through the API and every track of the
    returned set is emitted as a URL entry handled by the 'Soundcloud'
    extractor.
    """

    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?'
    IE_NAME = 'soundcloud:set'
    _TESTS = [{
        'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep',
        'info_dict': {
            'id': '2284613',
            'title': 'The Royal Concept EP',
        },
        'playlist_mincount': 6,
    }]

    def _real_extract(self, url):
        """Resolve a set URL and return it as a playlist result.

        :raises ExtractorError: if the API response carries an ``errors`` list.
        """
        match = re.match(self._VALID_URL, url)
        uploader, slug_title, token = match.group('uploader', 'slug_title', 'token')

        # "<uploader>/sets/<slug>[/<token>]" is both the progress label and
        # the path of the canonical page URL.
        full_title = '%s/sets/%s' % (uploader, slug_title)
        set_url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        if token:
            suffix = '/' + token
            full_title += suffix
            set_url += suffix

        self.report_resolve(full_title)

        info = self._download_json(self._resolv_url(set_url), full_title)

        if 'errors' in info:
            messages = [compat_str(error['error_message']) for error in info['errors']]
            raise ExtractorError('unable to download video webpage: %s' % ','.join(messages))

        entries = []
        for track in info['tracks']:
            entries.append(self.url_result(track['permalink_url'], 'Soundcloud'))

        return {
            '_type': 'playlist',
            'entries': entries,
            'id': '%s' % info['id'],
            'title': info['title'],
        }
293 | ||
294 | ||
class SoundcloudUserIE(SoundcloudIE):
    """Extractor for soundcloud.com user pages and their sub-sections.

    Supports the bare profile URL (treated as "all") as well as the
    ``tracks``, ``sets``, ``reposts``, ``likes`` and ``spotlight`` sections,
    and returns the items found as a playlist.
    """

    _VALID_URL = r'''(?x)
                        https?://
                            (?:(?:www|m)\.)?soundcloud\.com/
                            (?P<user>[^/]+)
                            (?:/
                                (?P<rsrc>tracks|sets|reposts|likes|spotlight)
                            )?
                            /?(?:[?#].*)?$
                    '''
    IE_NAME = 'soundcloud:user'
    _TESTS = [{
        'url': 'https://soundcloud.com/the-akashic-chronicler',
        'info_dict': {
            'id': '114582580',
            'title': 'The Akashic Chronicler (All)',
        },
        'playlist_mincount': 112,
    }, {
        'url': 'https://soundcloud.com/the-akashic-chronicler/tracks',
        'info_dict': {
            'id': '114582580',
            'title': 'The Akashic Chronicler (Tracks)',
        },
        'playlist_mincount': 50,
    }, {
        'url': 'https://soundcloud.com/the-akashic-chronicler/sets',
        'info_dict': {
            'id': '114582580',
            'title': 'The Akashic Chronicler (Playlists)',
        },
        'playlist_mincount': 3,
    }, {
        'url': 'https://soundcloud.com/the-akashic-chronicler/reposts',
        'info_dict': {
            'id': '114582580',
            'title': 'The Akashic Chronicler (Reposts)',
        },
        'playlist_mincount': 9,
    }, {
        'url': 'https://soundcloud.com/the-akashic-chronicler/likes',
        'info_dict': {
            'id': '114582580',
            'title': 'The Akashic Chronicler (Likes)',
        },
        'playlist_mincount': 333,
    }, {
        'url': 'https://soundcloud.com/grynpyret/spotlight',
        'info_dict': {
            'id': '7098329',
            'title': 'Grynpyret (Spotlight)',
        },
        'playlist_mincount': 1,
    }]

    _API_BASE = 'https://api.soundcloud.com'
    _API_V2_BASE = 'https://api-v2.soundcloud.com'

    # Section name -> listing endpoint; the remaining "%s" takes the user id.
    _BASE_URL_MAP = {
        'all': '%s/profile/soundcloud:users:%%s' % _API_V2_BASE,
        'tracks': '%s/users/%%s/tracks' % _API_BASE,
        'sets': '%s/users/%%s/playlists' % _API_V2_BASE,
        'reposts': '%s/profile/soundcloud:users:%%s/reposts' % _API_V2_BASE,
        'likes': '%s/users/%%s/likes' % _API_V2_BASE,
        'spotlight': '%s/users/%%s/spotlight' % _API_V2_BASE,
    }

    # Section name -> suffix shown in the playlist title.
    _TITLE_MAP = {
        'all': 'All',
        'tracks': 'Tracks',
        'sets': 'Playlists',
        'reposts': 'Reposts',
        'likes': 'Likes',
        'spotlight': 'Spotlight',
    }

    def _real_extract(self, url):
        """Collect every item of a user section into a playlist result.

        Pages of 50 items are requested until an empty collection or an
        empty ``next_href`` is received.
        """
        match = re.match(self._VALID_URL, url)
        uploader = match.group('user')

        profile_url = 'http://soundcloud.com/%s/' % uploader
        user = self._download_json(
            self._resolv_url(profile_url), uploader, 'Downloading user info')

        resource = match.group('rsrc') or 'all'
        base_url = self._BASE_URL_MAP[resource] % user['id']

        def find_permalink(item):
            # The link may sit on the item itself or on a nested
            # track/playlist object; the first usable http(s) one wins.
            for candidate in (item, item.get('track'), item.get('playlist')):
                if not isinstance(candidate, dict):
                    continue
                link = candidate.get('permalink_url')
                if link and link.startswith('http'):
                    return link
            return None

        entries = []
        page_url = None
        for page_index in itertools.count():
            if not page_url:
                # No continuation link available: page by explicit offset.
                query = compat_urllib_parse.urlencode({
                    'offset': page_index * 50,
                    'limit': 50,
                    'client_id': self._CLIENT_ID,
                    'linked_partitioning': '1',
                    'representation': 'speedy',
                })
                page_url = base_url + '?' + query

            response = self._download_json(
                page_url, uploader, 'Downloading track page %s' % (page_index + 1))

            collection = response['collection']
            if not collection:
                self.to_screen('%s: End page received' % uploader)
                break

            for item in collection:
                link = find_permalink(item)
                if link:
                    entries.append(self.url_result(link))

            if 'next_href' not in response:
                # Fall back to offset paging on the next round.
                page_url = None
                continue
            page_url = response['next_href']
            if not page_url:
                break

        return {
            '_type': 'playlist',
            'id': compat_str(user['id']),
            'title': '%s (%s)' % (user['username'], self._TITLE_MAP[resource]),
            'entries': entries,
        }
431 | ||
432 | ||
class SoundcloudPlaylistIE(SoundcloudIE):
    """Extractor for ``api.soundcloud.com/playlists/<id>`` URLs.

    Fetches the playlist JSON from the API and emits each of its tracks as
    a URL entry handled by the 'Soundcloud' extractor.
    """

    _VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$'
    IE_NAME = 'soundcloud:playlist'
    _TESTS = [{
        'url': 'http://api.soundcloud.com/playlists/4110309',
        'info_dict': {
            'id': '4110309',
            'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]',
            'description': 're:.*?TILT Brass - Bowery Poetry Club',
        },
        'playlist_count': 6,
    }]

    def _real_extract(self, url):
        """Download the playlist JSON and return it as a playlist result."""
        match = re.match(self._VALID_URL, url)
        playlist_id, token = match.group('id', 'token')

        # http_scheme() supplies the "<scheme>:" prefix (inherited helper).
        endpoint = '%s//api.soundcloud.com/playlists/%s.json?' % (self.http_scheme(), playlist_id)

        query = {
            'client_id': self._CLIENT_ID,
        }
        if token:
            # Private playlists need their secret token forwarded.
            query['secret_token'] = token

        playlist = self._download_json(
            endpoint + compat_urllib_parse.urlencode(query),
            playlist_id, 'Downloading playlist')

        entries = []
        for track in playlist['tracks']:
            entries.append(self.url_result(track['permalink_url'], 'Soundcloud'))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': playlist.get('title'),
            'description': playlist.get('description'),
            'entries': entries,
        }