]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/soundcloud.py
[ie/tiktok] Fix subtitles extraction (#9961)
[yt-dlp.git] / yt_dlp / extractor / soundcloud.py
CommitLineData
92790f4e 1import itertools
2ab47fa3 2import json
97362712 3import re
aad0d6d5 4
97362712 5from .common import InfoExtractor, SearchInfoExtractor
3d2623a8 6from ..compat import compat_str
97362712 7from ..networking import HEADRequest
3d2623a8 8from ..networking.exceptions import HTTPError
1cc79574 9from ..utils import (
97362712 10 KNOWN_EXTENSIONS,
aad0d6d5 11 ExtractorError,
97362712 12 error_to_compat_str,
e09965d5 13 float_or_none,
eb920777 14 int_or_none,
e09965d5 15 mimetype2ext,
aa6c2530 16 parse_qs,
e09965d5 17 str_or_none,
97362712 18 try_call,
f516f440 19 unified_timestamp,
d7c7100e 20 update_url_query,
7c5307f4 21 url_or_none,
a6c5859d 22 urlhandle_detect_ext,
d7c7100e 23)
246571ae 24from ..utils.traversal import traverse_obj
aad0d6d5
PH
25
26
class SoundcloudEmbedIE(InfoExtractor):
    """Unwrap SoundCloud player-widget URLs into the API URL they embed."""

    _VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?\burl=(?P<id>.+)'
    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1']
    _TEST = {
        # from https://www.soundi.fi/uutiset/ennakkokuuntelussa-timo-kaukolammen-station-to-station-to-station-julkaisua-juhlitaan-tanaan-g-livelabissa/
        'url': 'https://w.soundcloud.com/player/?visual=true&url=https%3A%2F%2Fapi.soundcloud.com%2Fplaylists%2F922213810&show_artwork=true&maxwidth=640&maxheight=960&dnt=1&secret_token=s-ziYey',
        'only_matching': True,
    }

    def _real_extract(self, url):
        """Hand the embedded ``url`` parameter on, keeping any ``secret_token``."""
        params = parse_qs(url)
        target = params['url'][0]
        tokens = params.get('secret_token')
        if not tokens:
            return self.url_result(target)
        # The token is a sibling parameter of the widget URL; move it onto
        # the wrapped URL so the next extractor receives it.
        return self.url_result(
            update_url_query(target, {'secret_token': tokens[0]}))
548c3957
RA
43
44
63ccf4ff 45class SoundcloudBaseIE(InfoExtractor):
08d30158 46 _NETRC_MACHINE = 'soundcloud'
47
63ccf4ff 48 _API_V2_BASE = 'https://api-v2.soundcloud.com/'
49 _BASE_URL = 'https://soundcloud.com/'
08d30158 50 _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
51 _API_AUTH_QUERY_TEMPLATE = '?client_id=%s'
52 _API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s'
53 _API_VERIFY_AUTH_TOKEN = 'https://api-auth.soundcloud.com/connect/session%s'
08d30158 54 _HEADERS = {}
63ccf4ff 55
c04cc2e2
KAW
56 _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg'
57
58 _ARTWORK_MAP = {
59 'mini': 16,
60 'tiny': 20,
61 'small': 32,
62 'badge': 47,
63 't67x67': 67,
64 'large': 100,
65 't300x300': 300,
66 'crop': 400,
67 't500x500': 500,
68 'original': 0,
69 }
70
def _store_client_id(self, client_id):
    """Write *client_id* to the extractor cache under ('soundcloud', 'client_id')."""
    cache = self.cache
    cache.store('soundcloud', 'client_id', client_id)
63ccf4ff 73
def _update_client_id(self):
    """Scrape a fresh client id from the scripts linked on the homepage.

    Scripts are tried last-to-first; the first one containing a 32-character
    ``client_id:"..."`` literal wins. The id is set on ``self._CLIENT_ID``
    and written to the cache.

    Raises:
        ExtractorError: if no linked script yields a client id.
    """
    homepage = self._download_webpage('https://soundcloud.com/', None)
    script_urls = re.findall(r'<script[^>]+src="([^"]+)"', homepage)
    for script_url in script_urls[::-1]:
        script = self._download_webpage(script_url, None, fatal=False)
        if not script:
            continue
        client_id = self._search_regex(
            r'client_id\s*:\s*"([0-9a-zA-Z]{32})"',
            script, 'client id', default=None)
        if not client_id:
            continue
        self._CLIENT_ID = client_id
        self._store_client_id(client_id)
        return
    raise ExtractorError('Unable to extract client id')
87
def _download_json(self, *args, **kwargs):
    """Download JSON with the current client id, refreshing the id once.

    Accepts the same arguments as the parent ``_download_json``. The
    ``client_id`` query parameter is always added. If the first attempt
    fails with HTTP 401/403, the cached client id is cleared, a new one is
    scraped via ``_update_client_id`` and the request is retried once.

    Returns:
        The parsed JSON, or ``False`` when ``fatal=False`` was passed and
        the request failed (a warning is reported in that case).

    Raises:
        ExtractorError: for any failure when the call is fatal, including
            a 401/403 that persists after the client id was refreshed.
            Errors raised by ``_update_client_id`` itself also propagate.
    """
    non_fatal = kwargs.get('fatal') is False
    if non_fatal:
        # Always let the parent raise so the HTTP status can be inspected
        # here; non-fatal semantics are re-applied below.
        del kwargs['fatal']
    # Copy so the caller's dict is never mutated; tolerate query=None.
    query = dict(kwargs.get('query') or {})
    for attempt in range(2):
        query['client_id'] = self._CLIENT_ID
        kwargs['query'] = query
        try:
            return super()._download_json(*args, **kwargs)
        except ExtractorError as e:
            auth_failure = isinstance(e.cause, HTTPError) and e.cause.status in (401, 403)
            if auth_failure and attempt == 0:
                self._store_client_id(None)
                self._update_client_id()
                continue
            # Either an unrelated error, or the refreshed client id was
            # rejected as well. Previously the second case fell out of the
            # loop and silently returned None even for fatal calls.
            if non_fatal:
                self.report_warning(error_to_compat_str(e))
                return False
            raise
107
def _initialize_pre_login(self):
    """Set ``self._CLIENT_ID`` from the cache, or the built-in default id."""
    cached_id = self.cache.load('soundcloud', 'client_id')
    self._CLIENT_ID = cached_id if cached_id else 'a3e059563d7fd3372b49b37f00a00bcf'
63ccf4ff 110
def _verify_oauth_token(self, token):
    """Check *token* against the session endpoint and adopt it if accepted.

    On success the ``Authorization`` entry of ``self._HEADERS`` is set and
    the login is reported; otherwise a warning is reported and nothing is
    stored.
    """
    verify_url = self._API_VERIFY_AUTH_TOKEN % (
        self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID)
    payload = json.dumps({'session': {'access_token': token}}).encode()
    accepted = self._request_webpage(
        verify_url, None, note='Verifying login token...', fatal=False,
        data=payload)
    if not accepted:
        self.report_warning('Provided authorization token is invalid. Continuing as guest')
        return
    self._HEADERS['Authorization'] = f'OAuth {token}'
    self.report_login()
120
def _real_initialize(self):
    """Adopt an ``oauth_token`` cookie as login unless headers are already set."""
    if not self._HEADERS:
        # try_call is given a lambda so a missing cookie yields a falsy
        # result here instead of an exception.
        cookie_token = try_call(
            lambda: self._get_cookies(self._BASE_URL)['oauth_token'].value)
        if cookie_token:
            self._verify_oauth_token(cookie_token)
126
def _perform_login(self, username, password):
    """Log in with an OAuth token passed as the password.

    Only the literal username ``oauth`` is accepted; *password* is then
    verified as the token unless ``self._HEADERS`` is already populated.

    Raises:
        ExtractorError: (expected) for any other username.
    """
    if username != 'oauth':
        raise ExtractorError(
            'Login using username and password is not currently supported. '
            'Use "--username oauth --password <oauth_token>" to login using an oauth token, '
            f'or else {self._login_hint(method="cookies")}', expected=True)
    if not self._HEADERS:
        self._verify_oauth_token(password)

    # Disabled username/password flow, kept for reference as an inert
    # string literal (it is never executed).
    r'''
    def genDevId():
        def genNumBlock():
            return ''.join([str(random.randrange(10)) for i in range(6)])
        return '-'.join([genNumBlock() for i in range(4)])

    payload = {
        'client_id': self._CLIENT_ID,
        'recaptcha_pubkey': 'null',
        'recaptcha_response': 'null',
        'credentials': {
            'identifier': username,
            'password': password
        },
        'signature': self.sign(username, password, self._CLIENT_ID),
        'device_id': genDevId(),
        'user_agent': self._USER_AGENT
    }

    response = self._download_json(
        self._API_AUTH_URL_PW % (self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID),
        None, note='Verifying login token...', fatal=False,
        data=json.dumps(payload).encode())

    if token := traverse_obj(response, ('session', 'access_token', {str})):
        self._HEADERS['Authorization'] = f'OAuth {token}'
        self.report_login()
        return

    raise ExtractorError('Unable to get access token, login may have failed', expected=True)
    '''
168
169 # signature generation
def sign(self, user, pw, clid):
    """Compute the login signature string for *user* and *clid*.

    The result has the form ``<rev>:<fields>:<hex checksum>:<c>``. The
    checksum is a 24-bit rolling value over key + rev + fields + user agent
    + user + client id + fields + key. *pw* is accepted but does not
    take part in the computation.
    """
    key = '0763ed7314c69015fd4a0dc16bbf4b90'  # _KEY
    rev = '8'  # _REV
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'  # _USER_AGENT
    trailer = 5  # only appended to the output, not part of the checksum

    fields = '-'.join(map(str, (33, 1, 440123, 117, 1800000, 1042, 37, 37)))
    message = ''.join((key, rev, fields, user_agent, user, clid, fields, key))

    checksum = 8011470
    for char in message:
        # Rotate the 24-bit state right by one bit, then add the code point
        # and truncate back to 24 bits.
        checksum = (checksum >> 1) + ((checksum & 1) << 23)
        checksum = (checksum + ord(char)) & 16777215

    return f'{rev}:{fields}:{checksum:x}:{trailer}'
202
c04cc2e2
KAW
def _extract_info_dict(self, info, full_title=None, secret_token=None, extract_flat=False):
    """Turn a track's API JSON into an info dict.

    Args:
        info: track object from the API; ``id`` and ``title`` are required.
        full_title: accepted for the callers' convenience; not used here.
        secret_token: added to the format API queries when given.
        extract_flat: when true no format request is made and ``formats``
            is ``None`` in the result.

    Returns:
        dict with id, uploader data, counters, thumbnails and formats.
    """
    track_id = compat_str(info['id'])
    title = info['title']

    seen_urls = set()
    formats = []
    api_query = {'client_id': self._CLIENT_ID}
    if secret_token:
        api_query['secret_token'] = secret_token

    # Uploader-provided original file, when the track offers downloads.
    if not extract_flat and info.get('downloadable') and info.get('has_downloads_left'):
        download_api_url = update_url_query(
            f'{self._API_V2_BASE}tracks/{track_id}/download', api_query)
        download_meta = self._download_json(download_api_url, track_id, fatal=False) or {}
        redirect_url = download_meta.get('redirectUri')
        # A HEAD request is enough: only the final URL and headers are read.
        head = redirect_url and self._request_webpage(
            HEADRequest(redirect_url), track_id, fatal=False)
        if head:
            final_url = head.url
            seen_urls.add(final_url)
            formats.append({
                'format_id': 'download',
                'ext': urlhandle_detect_ext(head) or 'mp3',
                'filesize': int_or_none(head.headers.get('Content-Length')),
                'url': final_url,
                'quality': 10,
                'format_note': 'Original',
            })

    def add_format(fmt, protocol, is_preview=False):
        # Fill in abr/ext from a ".<abr>.<ext>" segment of the stream URL,
        # but only where the caller did not supply a value.
        url_match = re.search(r'\.(?P<abr>\d+)\.(?P<ext>[0-9a-z]{3,4})(?=[/?])', fmt['url'])
        if url_match:
            for field, value in url_match.groupdict().items():
                if not fmt.get(field):
                    fmt[field] = value
        id_parts = [protocol] if protocol else []
        ext = fmt.get('ext')
        if ext == 'aac':
            fmt.update({
                'abr': 256,
                'quality': 5,
                'format_note': 'Premium',
            })
        id_parts.extend(filter(None, (str_or_none(fmt.get(field)) for field in ('ext', 'abr'))))
        preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', fmt['url'])
        if preview:
            id_parts.append('preview')
        if fmt.get('abr'):
            fmt['abr'] = int(fmt['abr'])
        if protocol == 'hls':
            download_protocol = 'm3u8' if ext == 'aac' else 'm3u8_native'
        else:
            download_protocol = 'http'
        fmt.update({
            'format_id': '_'.join(id_parts),
            'protocol': download_protocol,
            'preference': -10 if preview else None,
        })
        formats.append(fmt)

    # New API
    for transcoding in traverse_obj(info, ('media', 'transcodings', lambda _, v: url_or_none(v['url']))):
        if extract_flat:
            break
        api_url = transcoding['url']
        stream = None

        for retry in self.RetryManager(fatal=False):
            try:
                stream = self._download_json(api_url, track_id, query=api_query, headers=self._HEADERS)
            except ExtractorError as e:
                rate_limited = isinstance(e.cause, HTTPError) and e.cause.status == 429
                if not rate_limited:
                    self.report_warning(e.msg)
                else:
                    self.report_warning(
                        'You have reached the API rate limit, which is ~600 requests per '
                        '10 minutes. Use the --extractor-retries and --retry-sleep options '
                        'to configure an appropriate retry count and wait time', only_once=True)
                    # Only rate limiting is handed to the retry manager.
                    retry.error = e.cause

        if not isinstance(stream, dict):
            continue
        stream_url = url_or_none(stream.get('url'))
        if not stream_url or stream_url in seen_urls:
            continue
        seen_urls.add(stream_url)

        stream_format = transcoding.get('format') or {}
        protocol = stream_format.get('protocol')
        if protocol != 'hls' and '/hls' in api_url:
            protocol = 'hls'
        preset = str_or_none(transcoding.get('preset'))
        ext = preset.split('_')[0] if preset else None
        if ext not in KNOWN_EXTENSIONS:
            ext = mimetype2ext(stream_format.get('mime_type'))
        add_format(
            {'url': stream_url, 'ext': ext},
            'http' if protocol == 'progressive' else protocol,
            transcoding.get('snipped') or '/preview/' in api_url)

    for fmt in formats:
        fmt['vcodec'] = 'none'

    if not formats and info.get('policy') == 'BLOCK':
        self.raise_geo_restricted(metadata_available=True)

    user = info.get('user') or {}

    # Thumbnails: artwork if present, otherwise the uploader's avatar.
    artwork_url = info.get('artwork_url')
    thumbnail = artwork_url or user.get('avatar_url')
    thumbnails = []
    if isinstance(thumbnail, compat_str):
        if not re.search(self._IMAGE_REPL_RE, thumbnail):
            thumbnails = [{'url': thumbnail}]
        else:
            for image_id, size in self._ARTWORK_MAP.items():
                entry = {
                    'id': image_id,
                    'url': re.sub(self._IMAGE_REPL_RE, f'-{image_id}.jpg', thumbnail),
                }
                if image_id == 'tiny' and not artwork_url:
                    # The avatar fallback uses 18 for 'tiny' instead of the mapped size.
                    size = 18
                elif image_id == 'original':
                    entry['preference'] = 10
                if size:
                    entry.update(width=size, height=size)
                thumbnails.append(entry)

    def count_of(key):
        return int_or_none(info.get(f'{key}_count'))

    return {
        'id': track_id,
        'uploader': user.get('username'),
        'uploader_id': str_or_none(user.get('id')) or user.get('permalink'),
        'uploader_url': user.get('permalink_url'),
        'timestamp': unified_timestamp(info.get('created_at')),
        'title': title,
        'description': info.get('description'),
        'thumbnails': thumbnails,
        'duration': float_or_none(info.get('duration'), 1000),
        'webpage_url': info.get('permalink_url'),
        'license': info.get('license'),
        'view_count': count_of('playback'),
        'like_count': count_of('favoritings') or count_of('likes'),
        'comment_count': count_of('comment'),
        'repost_count': count_of('reposts'),
        'genres': traverse_obj(info, ('genre', {str}, {lambda x: x or None}, all)),
        'formats': None if extract_flat else formats,
    }
367
@classmethod
def _resolv_url(cls, url):
    """Return the API v2 ``resolve`` endpoint URL for *url* (appended as-is)."""
    return ''.join((cls._API_V2_BASE, 'resolve?url=', url))
371
372
373class SoundcloudIE(SoundcloudBaseIE):
aad0d6d5
PH
374 """Information extractor for soundcloud.com
375 To access the media, the uid of the song and a stream token
376 must be extracted from the page source and the script must make
377 a request to media.soundcloud.com/crossdomain.xml. Then
378 the media can be grabbed by requesting from an url composed
379 of the stream token and uid
380 """
381
20991253 382 _VALID_URL = r'''(?x)^(?:https?://)?
71507a11 383 (?:(?:(?:www\.|m\.)?soundcloud\.com/
836ef264 384 (?!stations/track)
4ff50ef8 385 (?P<uploader>[\w\d-]+)/
3ef2da2d 386 (?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#]))
c2dedf12
L
387 (?P<title>[\w\d-]+)
388 (?:/(?P<token>(?!(?:albums|sets|recommended))[^?]+?))?
389 (?:[?].*)?$)
548c3957 390 |(?:api(?:-v2)?\.soundcloud\.com/tracks/(?P<track_id>\d+)
0403b069 391 (?:/?\?secret_token=(?P<secret_token>[^&]+))?)
eb6a41ba
JMF
392 )
393 '''
fbcd7b5f 394 IE_NAME = 'soundcloud'
12c167c8
JMF
395 _TESTS = [
396 {
fbcd7b5f 397 'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
231c2eac 398 'md5': 'de9bac153e7427a7333b4b0c1b6a18d2',
fbcd7b5f 399 'info_dict': {
0eb9fb9f 400 'id': '62986583',
231c2eac 401 'ext': 'opus',
f516f440 402 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1',
0eb9fb9f
JMF
403 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d',
404 'uploader': 'E.T. ExTerrestrial Music',
548c3957 405 'uploader_id': '1571244',
f516f440
S
406 'timestamp': 1349920598,
407 'upload_date': '20121011',
e09965d5 408 'duration': 143.216,
4bfd294e 409 'license': 'all-rights-reserved',
f516f440
S
410 'view_count': int,
411 'like_count': int,
412 'comment_count': int,
413 'repost_count': int,
231c2eac 414 'thumbnail': 'https://i1.sndcdn.com/artworks-000031955188-rwb18x-original.jpg',
415 'uploader_url': 'https://soundcloud.com/ethmusic',
416 'genres': [],
12c167c8
JMF
417 }
418 },
a6c5859d 419 # geo-restricted
12c167c8 420 {
fbcd7b5f
PH
421 'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
422 'info_dict': {
423 'id': '47127627',
231c2eac 424 'ext': 'opus',
fbcd7b5f 425 'title': 'Goldrushed',
63ad0315 426 'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
fbcd7b5f 427 'uploader': 'The Royal Concept',
548c3957 428 'uploader_id': '9615865',
f516f440 429 'timestamp': 1337635207,
fbcd7b5f 430 'upload_date': '20120521',
a6c5859d 431 'duration': 227.155,
4bfd294e 432 'license': 'all-rights-reserved',
f516f440
S
433 'view_count': int,
434 'like_count': int,
435 'comment_count': int,
436 'repost_count': int,
231c2eac 437 'uploader_url': 'https://soundcloud.com/the-concept-band',
438 'thumbnail': 'https://i1.sndcdn.com/artworks-v8bFHhXm7Au6-0-original.jpg',
439 'genres': ['Alternative'],
12c167c8 440 },
12c167c8 441 },
de2dd4c5
JMF
442 # private link
443 {
7a5c1cfe 444 'url': 'https://soundcloud.com/jaimemf/youtube-dl-test-video-a-y-baw/s-8Pjrp',
fbcd7b5f
PH
445 'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
446 'info_dict': {
447 'id': '123998367',
448 'ext': 'mp3',
449 'title': 'Youtube - Dl Test Video \'\' Ä↭',
fbcd7b5f 450 'description': 'test chars: \"\'/\\ä↭',
f516f440 451 'uploader': 'jaimeMF',
548c3957 452 'uploader_id': '69767071',
f516f440 453 'timestamp': 1386604920,
fbcd7b5f 454 'upload_date': '20131209',
e09965d5 455 'duration': 9.927,
4bfd294e 456 'license': 'all-rights-reserved',
f516f440
S
457 'view_count': int,
458 'like_count': int,
459 'comment_count': int,
460 'repost_count': int,
231c2eac 461 'uploader_url': 'https://soundcloud.com/jaimemf',
462 'thumbnail': 'https://a1.sndcdn.com/images/default_avatar_large.png',
463 'genres': ['youtubedl'],
de2dd4c5
JMF
464 },
465 },
9296738f 466 # private link (alt format)
467 {
468 'url': 'https://api.soundcloud.com/tracks/123998367?secret_token=s-8Pjrp',
469 'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
470 'info_dict': {
471 'id': '123998367',
472 'ext': 'mp3',
473 'title': 'Youtube - Dl Test Video \'\' Ä↭',
9296738f 474 'description': 'test chars: \"\'/\\ä↭',
f516f440 475 'uploader': 'jaimeMF',
548c3957 476 'uploader_id': '69767071',
f516f440 477 'timestamp': 1386604920,
9296738f 478 'upload_date': '20131209',
e09965d5 479 'duration': 9.927,
4bfd294e 480 'license': 'all-rights-reserved',
f516f440
S
481 'view_count': int,
482 'like_count': int,
483 'comment_count': int,
484 'repost_count': int,
231c2eac 485 'uploader_url': 'https://soundcloud.com/jaimemf',
486 'thumbnail': 'https://a1.sndcdn.com/images/default_avatar_large.png',
487 'genres': ['youtubedl'],
9296738f 488 },
489 },
f67ca84d
JMF
490 # downloadable song
491 {
be05d5cf
TOH
492 'url': 'https://soundcloud.com/the80m/the-following',
493 'md5': '9ffcddb08c87d74fb5808a3c183a1d04',
fbcd7b5f 494 'info_dict': {
be05d5cf
TOH
495 'id': '343609555',
496 'ext': 'wav',
231c2eac 497 'title': 'The Following',
498 'description': '',
499 'uploader': '80M',
500 'uploader_id': '312384765',
501 'uploader_url': 'https://soundcloud.com/the80m',
502 'upload_date': '20170922',
503 'timestamp': 1506120436,
504 'duration': 397.228,
505 'thumbnail': 'https://i1.sndcdn.com/artworks-000243916348-ktoo7d-original.jpg',
506 'license': 'all-rights-reserved',
507 'like_count': int,
508 'comment_count': int,
509 'repost_count': int,
510 'view_count': int,
511 'genres': ['Dance & EDM'],
f67ca84d
JMF
512 },
513 },
d7c7100e
S
514 # private link, downloadable format
515 {
516 'url': 'https://soundcloud.com/oriuplift/uponly-238-no-talking-wav/s-AyZUd',
517 'md5': '64a60b16e617d41d0bef032b7f55441e',
518 'info_dict': {
519 'id': '340344461',
520 'ext': 'wav',
521 'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]',
522 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366',
523 'uploader': 'Ori Uplift Music',
548c3957 524 'uploader_id': '12563093',
f516f440 525 'timestamp': 1504206263,
d7c7100e 526 'upload_date': '20170831',
e09965d5 527 'duration': 7449.096,
d7c7100e 528 'license': 'all-rights-reserved',
f516f440
S
529 'view_count': int,
530 'like_count': int,
531 'comment_count': int,
532 'repost_count': int,
231c2eac 533 'thumbnail': 'https://i1.sndcdn.com/artworks-000240712245-kedn4p-original.jpg',
534 'uploader_url': 'https://soundcloud.com/oriuplift',
535 'genres': ['Trance'],
d7c7100e
S
536 },
537 },
0b0870f9
PV
538 # no album art, use avatar pic for thumbnail
539 {
540 'url': 'https://soundcloud.com/garyvee/sideways-prod-mad-real',
541 'md5': '59c7872bc44e5d99b7211891664760c2',
542 'info_dict': {
543 'id': '309699954',
544 'ext': 'mp3',
545 'title': 'Sideways (Prod. Mad Real)',
546 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
547 'uploader': 'garyvee',
548c3957 548 'uploader_id': '2366352',
f516f440 549 'timestamp': 1488152409,
0b0870f9 550 'upload_date': '20170226',
e09965d5 551 'duration': 207.012,
0b0870f9
PV
552 'thumbnail': r're:https?://.*\.jpg',
553 'license': 'all-rights-reserved',
f516f440
S
554 'view_count': int,
555 'like_count': int,
556 'comment_count': int,
557 'repost_count': int,
231c2eac 558 'uploader_url': 'https://soundcloud.com/garyvee',
559 'genres': [],
0b0870f9
PV
560 },
561 'params': {
562 'skip_download': True,
563 },
564 },
e09965d5
S
565 {
566 'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer',
231c2eac 567 'md5': '8227c3473a4264df6b02ad7e5b7527ac',
e09965d5
S
568 'info_dict': {
569 'id': '583011102',
231c2eac 570 'ext': 'opus',
e09965d5 571 'title': 'Mezzo Valzer',
231c2eac 572 'description': 'md5:f4d5f39d52e0ccc2b4f665326428901a',
573 'uploader': 'Giovanni Sarani',
548c3957 574 'uploader_id': '3352531',
e09965d5
S
575 'timestamp': 1551394171,
576 'upload_date': '20190228',
577 'duration': 180.157,
578 'thumbnail': r're:https?://.*\.jpg',
579 'license': 'all-rights-reserved',
580 'view_count': int,
581 'like_count': int,
582 'comment_count': int,
583 'repost_count': int,
231c2eac 584 'genres': ['Piano'],
585 'uploader_url': 'https://soundcloud.com/giovannisarani',
e09965d5 586 },
75294a5e
S
587 },
588 {
bc842c27 589 # AAC HQ format available (account with active subscription needed)
75294a5e
S
590 'url': 'https://soundcloud.com/wandw/the-chainsmokers-ft-daya-dont-let-me-down-ww-remix-1',
591 'only_matching': True,
592 },
bc842c27
U
593 {
594 # Go+ (account with active subscription needed)
595 'url': 'https://soundcloud.com/taylorswiftofficial/look-what-you-made-me-do',
596 'only_matching': True,
597 },
12c167c8 598 ]
aad0d6d5 599
def _real_extract(self, url):
    """Fetch a track's JSON by numeric id or by uploader/title and extract it."""
    match = self._match_valid_url(url)
    track_id = match.group('track_id')

    query = {}
    if not track_id:
        # Page URL: the API has to resolve "uploader/title[/token]" first.
        full_title = '{}/{}'.format(*match.group('uploader', 'title'))
        token = match.group('token')
        resolve_path = f'{full_title}/{token}' if token else full_title
        info_json_url = self._resolv_url(self._BASE_URL + resolve_path)
    else:
        # API URL: the track can be requested directly by id.
        full_title = track_id
        token = match.group('secret_token')
        if token:
            query['secret_token'] = token
        info_json_url = f'{self._API_V2_BASE}tracks/{track_id}'

    info = self._download_json(
        info_json_url, full_title, 'Downloading info JSON',
        query=query, headers=self._HEADERS)

    return self._extract_info_dict(info, full_title, token)
aad0d6d5 623
20991253 624
class SoundcloudPlaylistBaseIE(SoundcloudBaseIE):
    """Shared playlist-to-entries conversion for set/playlist extractors."""

    def _extract_set(self, playlist, token=None):
        """Build a playlist result from a playlist API object.

        When some tracks lack ``permalink_url`` and a *token* is available,
        the track list is re-requested from the ``tracks`` endpoint with the
        playlist's secret token. Tracks with neither a permalink nor an id
        are skipped.
        """
        playlist_id = compat_str(playlist['id'])
        tracks = playlist.get('tracks') or []

        without_permalink = [t for t in tracks if not t.get('permalink_url')]
        if without_permalink and token:
            track_ids = ','.join([compat_str(t['id']) for t in tracks])
            tracks = self._download_json(
                self._API_V2_BASE + 'tracks', playlist_id,
                'Downloading tracks', query={
                    'ids': track_ids,
                    'playlistId': playlist_id,
                    'playlistSecretToken': token,
                }, headers=self._HEADERS)

        entries = []
        for track in tracks:
            track_id = str_or_none(track.get('id'))
            track_url = track.get('permalink_url')
            if not track_url:
                if not track_id:
                    continue
                # Fall back to the API track URL, carrying the token along.
                track_url = f'{self._API_V2_BASE}tracks/{track_id}'
                if token:
                    track_url = f'{track_url}?secret_token={token}'
            entries.append(self.url_result(
                track_url, SoundcloudIE.ie_key(), track_id))

        return self.playlist_result(
            entries, playlist_id,
            playlist.get('title'),
            playlist.get('description'))
8e45e1cc
S
653
654
class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
    """Extractor for ``soundcloud.com/<uploader>/sets/<slug>[/<token>]`` pages."""

    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[:\w\d-]+)(?:/(?P<token>[^?/]+))?'
    IE_NAME = 'soundcloud:set'
    _TESTS = [{
        'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep',
        'info_dict': {
            'id': '2284613',
            'title': 'The Royal Concept EP',
            'description': 'md5:71d07087c7a449e8941a70a29e34671e',
        },
        'playlist_mincount': 5,
    }, {
        'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token',
        'only_matching': True,
    }, {
        'url': 'https://soundcloud.com/discover/sets/weekly::flacmatic',
        'only_matching': True,
    }, {
        'url': 'https://soundcloud.com/discover/sets/charts-top:all-music:de',
        'only_matching': True,
    }, {
        'url': 'https://soundcloud.com/discover/sets/charts-top:hiphoprap:kr',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve the set page through the API and convert it to a playlist.

        Raises:
            ExtractorError: if the API response carries an ``errors`` key.
        """
        match = self._match_valid_url(url)

        full_title = '{}/sets/{}'.format(*match.group('uploader', 'slug_title'))
        token = match.group('token')
        if token:
            full_title = f'{full_title}/{token}'

        resolve_url = self._resolv_url(self._BASE_URL + full_title)
        info = self._download_json(resolve_url, full_title, headers=self._HEADERS)

        if 'errors' in info:
            messages = ','.join(
                compat_str(err['error_message']) for err in info['errors'])
            raise ExtractorError('unable to download video webpage: %s' % messages)

        return self._extract_set(info, token)
92790f4e
JMF
696
697
class SoundcloudPagedPlaylistBaseIE(SoundcloudBaseIE):
    """Base for extractors whose entries come from a paged API collection."""

    def _extract_playlist(self, base_url, playlist_id, playlist_title):
        """Return a playlist dict whose entries are produced by ``_entries``."""
        return dict(
            _type='playlist',
            id=playlist_id,
            title=playlist_title,
            entries=self._entries(base_url, playlist_id),
        )

    def _entries(self, url, playlist_id):
        """Yield a url result per collection item, following ``next_href`` pages.

        An item with no usable ``permalink_url`` (on itself, its ``track`` or
        its ``playlist``) is yielded as ``None``.
        """
        # Per the SoundCloud documentation, the maximum limit for a linked partitioning query is 200.
        # https://developers.soundcloud.com/blog/offset-pagination-deprecated
        query = {'limit': 200, 'linked_partitioning': '1', 'offset': 0}

        def resolve_entry(*candidates):
            # First dict candidate carrying a valid permalink wins.
            for candidate in candidates:
                if not isinstance(candidate, dict):
                    continue
                permalink_url = url_or_none(candidate.get('permalink_url'))
                if not permalink_url:
                    continue
                ie_key = SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None
                return self.url_result(
                    permalink_url, ie_key,
                    str_or_none(candidate.get('id')), candidate.get('title'))

        for page_num in itertools.count(1):
            for retry in self.RetryManager():
                try:
                    response = self._download_json(
                        url, playlist_id, query=query, headers=self._HEADERS,
                        note=f'Downloading track page {page_num}')
                    break
                except ExtractorError as e:
                    # Downloading page may result in intermittent 502 HTTP error
                    # See https://github.com/yt-dlp/yt-dlp/issues/872
                    if isinstance(e.cause, HTTPError) and e.cause.status == 502:
                        retry.error = e
                        continue
                    raise

            for item in response['collection'] or []:
                yield resolve_entry(item, item.get('track'), item.get('playlist'))

            url = response.get('next_href')
            if not url:
                break
            # 'offset' is only sent with the first request; it is dropped
            # once a next_href is being followed.
            query.pop('offset', None)
836ef264 749
836ef264 750
class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
    """Extractor for a user's page or one of its resource tabs."""

    _VALID_URL = r'''(?x)
                        https?://
                            (?:(?:www|m)\.)?soundcloud\.com/
                            (?P<user>[^/]+)
                            (?:/
                                (?P<rsrc>tracks|albums|sets|reposts|likes|spotlight)
                            )?
                            /?(?:[?#].*)?$
                    '''
    IE_NAME = 'soundcloud:user'
    _TESTS = [{
        'url': 'https://soundcloud.com/soft-cell-official',
        'info_dict': {
            'id': '207965082',
            'title': 'Soft Cell (All)',
        },
        'playlist_mincount': 28,
    }, {
        'url': 'https://soundcloud.com/soft-cell-official/tracks',
        'info_dict': {
            'id': '207965082',
            'title': 'Soft Cell (Tracks)',
        },
        'playlist_mincount': 27,
    }, {
        'url': 'https://soundcloud.com/soft-cell-official/albums',
        'info_dict': {
            'id': '207965082',
            'title': 'Soft Cell (Albums)',
        },
        'playlist_mincount': 1,
    }, {
        'url': 'https://soundcloud.com/jcv246/sets',
        'info_dict': {
            'id': '12982173',
            'title': 'Jordi / cv (Sets)',
        },
        'playlist_mincount': 2,
    }, {
        'url': 'https://soundcloud.com/jcv246/reposts',
        'info_dict': {
            'id': '12982173',
            'title': 'Jordi / cv (Reposts)',
        },
        'playlist_mincount': 6,
    }, {
        'url': 'https://soundcloud.com/clalberg/likes',
        'info_dict': {
            'id': '11817582',
            'title': 'clalberg (Likes)',
        },
        'playlist_mincount': 5,
    }, {
        'url': 'https://soundcloud.com/grynpyret/spotlight',
        'info_dict': {
            'id': '7098329',
            'title': 'Grynpyret (Spotlight)',
        },
        'playlist_mincount': 1,
    }]

    # API path template per resource tab; 'all' is used when the URL names none.
    _BASE_URL_MAP = {
        'all': 'stream/users/%s',
        'tracks': 'users/%s/tracks',
        'albums': 'users/%s/albums',
        'sets': 'users/%s/playlists',
        'reposts': 'stream/users/%s/reposts',
        'likes': 'users/%s/likes',
        'spotlight': 'users/%s/spotlight',
    }

    def _real_extract(self, url):
        """Resolve the user, then page through the requested resource."""
        match = self._match_valid_url(url)
        uploader = match.group('user')

        user = self._download_json(
            self._resolv_url(self._BASE_URL + uploader),
            uploader, 'Downloading user info', headers=self._HEADERS)

        resource = match.group('rsrc') or 'all'
        api_path = self._BASE_URL_MAP[resource] % user['id']
        playlist_id = str_or_none(user.get('id'))
        playlist_title = f'{user["username"]} ({resource.capitalize()})'

        return self._extract_playlist(
            self._API_V2_BASE + api_path, playlist_id, playlist_title)
97afd99a 837
92790f4e 838
class SoundcloudUserPermalinkIE(SoundcloudPagedPlaylistBaseIE):
    """Extractor for ``api.soundcloud.com/users/<id>`` permalinks."""

    _VALID_URL = r'https?://api\.soundcloud\.com/users/(?P<id>\d+)'
    IE_NAME = 'soundcloud:user:permalink'
    _TESTS = [{
        'url': 'https://api.soundcloud.com/users/30909869',
        'info_dict': {
            'id': '30909869',
            'title': 'neilcic',
        },
        'playlist_mincount': 23,
    }]

    def _real_extract(self, url):
        """Resolve the user and return the user's stream as a playlist."""
        user_id = self._match_id(url)
        user = self._download_json(
            self._resolv_url(url), user_id, 'Downloading user info', headers=self._HEADERS)

        stream_url = f'{self._API_V2_BASE}stream/users/{user["id"]}'
        return self._extract_playlist(stream_url, str(user['id']), user.get('username'))
858
859
836ef264
S
class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
    """Extractor for ``soundcloud.com/stations/track/...`` pages."""

    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P<id>[^/?#&]+)'
    IE_NAME = 'soundcloud:trackstation'
    _TESTS = [{
        'url': 'https://soundcloud.com/stations/track/officialsundial/your-text',
        'info_dict': {
            'id': '286017854',
            'title': 'Track station: your text',
        },
        'playlist_mincount': 47,
    }]

    def _real_extract(self, url):
        """Resolve the station and return its tracks as a playlist."""
        track_name = self._match_id(url)

        station = self._download_json(
            self._resolv_url(url), track_name, headers=self._HEADERS)
        # The playlist id is the numeric tail of the station id; the API
        # path below uses the full station id.
        track_id = self._search_regex(
            r'soundcloud:track-stations:(\d+)', station['id'], 'track id')

        tracks_url = f'{self._API_V2_BASE}stations/{station["id"]}/tracks'
        return self._extract_playlist(
            tracks_url, track_id, f'Track station: {station["title"]}')
20991253
PH
882
883
c2dedf12
L
class SoundcloudRelatedIE(SoundcloudPagedPlaylistBaseIE):
    """Extractor for a track's ``albums``, ``sets`` and ``recommended`` tabs."""

    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<slug>[\w\d-]+/[\w\d-]+)/(?P<relation>albums|sets|recommended)'
    IE_NAME = 'soundcloud:related'
    _TESTS = [{
        'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/recommended',
        'info_dict': {
            'id': '1084577272',
            'title': 'Sexapil - Pingers 5 (Recommended)',
        },
        'playlist_mincount': 50,
    }, {
        'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/albums',
        'info_dict': {
            'id': '1084577272',
            'title': 'Sexapil - Pingers 5 (Albums)',
        },
        'playlist_mincount': 1,
    }, {
        'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/sets',
        'info_dict': {
            'id': '1084577272',
            'title': 'Sexapil - Pingers 5 (Sets)',
        },
        'playlist_mincount': 4,
    }]

    # API path template per relation tab.
    _BASE_URL_MAP = {
        'albums': 'tracks/%s/albums',
        'sets': 'tracks/%s/playlists_without_albums',
        'recommended': 'tracks/%s/related',
    }

    def _real_extract(self, url):
        """Resolve the track, then page through the requested relation.

        Raises:
            ExtractorError: (expected) if the resolve response lists errors.
        """
        slug, relation = self._match_valid_url(url).group('slug', 'relation')

        track = self._download_json(
            self._resolv_url(self._BASE_URL + slug),
            slug, 'Downloading track info', headers=self._HEADERS)

        if track.get('errors'):
            messages = ','.join(
                str(err['error_message']) for err in track['errors'])
            raise ExtractorError(f'{self.IE_NAME} said: %s' % messages, expected=True)

        api_url = self._API_V2_BASE + self._BASE_URL_MAP[relation] % track['id']
        playlist_id = str(track['id'])
        playlist_title = f'{track.get("title") or slug} ({relation.capitalize()})'
        return self._extract_playlist(api_url, playlist_id, playlist_title)
930
931
class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
    # Extractor for api(-v2).soundcloud.com/playlists/<id> URLs, with an optional secret_token query.
    _VALID_URL = r'https?://api(?:-v2)?\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$'
    IE_NAME = 'soundcloud:playlist'
    _TESTS = [{
        'url': 'https://api.soundcloud.com/playlists/4110309',
        'info_dict': {
            'id': '4110309',
            'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]',
            'description': 're:.*?TILT Brass - Bowery Poetry Club',
        },
        'playlist_count': 6,
    }]

    def _real_extract(self, url):
        """Download the playlist JSON and hand it, with the secret token if any, to _extract_set."""
        playlist_id, token = self._match_valid_url(url).group('id', 'token')

        # Only send secret_token when the URL actually carried one.
        query = {'secret_token': token} if token else {}

        data = self._download_json(
            f'{self._API_V2_BASE}playlists/{playlist_id}',
            playlist_id, 'Downloading playlist', query=query, headers=self._HEADERS)

        return self._extract_set(data, token)
2abf7cab 959
960
class SoundcloudSearchIE(SoundcloudBaseIE, SearchInfoExtractor):
    # Search extractor registered under the 'scsearch' search key.
    IE_NAME = 'soundcloud:search'
    IE_DESC = 'Soundcloud search'
    _SEARCH_KEY = 'scsearch'
    _TESTS = [{
        'url': 'scsearch15:post-avant jazzcore',
        'info_dict': {
            'id': 'post-avant jazzcore',
            'title': 'post-avant jazzcore',
        },
        'playlist_count': 15,
    }]

    # Cap and fallback for the 'limit' query parameter sent with each page request.
    _MAX_RESULTS_PER_PAGE = 200
    _DEFAULT_RESULTS_PER_PAGE = 50

    def _get_collection(self, endpoint, collection_id, **query):
        """Yield a url_result for every truthy item of a paginated API collection.

        Pages are followed through each response's 'next_href' until it is missing or empty.
        Extra keyword arguments are sent as query parameters of the first request.
        """
        requested = query.get('limit', self._DEFAULT_RESULTS_PER_PAGE)
        query['limit'] = min(requested, self._MAX_RESULTS_PER_PAGE)
        query['linked_partitioning'] = 1
        query['offset'] = 0
        page_url = update_url_query(self._API_V2_BASE + endpoint, query)

        page_num = 0
        while True:
            page_num += 1
            page = self._download_json(
                page_url, collection_id, f'Downloading page {page_num}',
                'Unable to download API page', headers=self._HEADERS)

            # filter(None, ...) drops falsy entries from the collection.
            for entry in filter(None, page.get('collection') or []):
                yield self.url_result(
                    entry['uri'], SoundcloudIE.ie_key(), **self._extract_info_dict(entry, extract_flat=True))

            page_url = page.get('next_href')
            if not page_url:
                return

    def _get_n_results(self, query, n):
        """Return a playlist of at most n track results for query (unbounded when n is infinity)."""
        results = self._get_collection('search/tracks', query, limit=n, q=query)
        # islice cannot take float('inf') as a stop value; None means "no upper bound".
        stop = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, stop), query, query)