[yt-dlp.git] / youtube_dl / extractor / soundcloud.py

# encoding: utf-8
from __future__ import unicode_literals

import json
import re
import itertools

from .common import InfoExtractor
from ..utils import (
    compat_str,
    compat_urlparse,
    compat_urllib_parse,

    ExtractorError,
    unified_strdate,
)


class SoundcloudIE(InfoExtractor):
    """Information extractor for single soundcloud.com tracks.

    Accepts track pages (optionally followed by the secret token of a
    private link), api.soundcloud.com/tracks/<id> URLs and embedded
    player URLs.  Track pages are turned into their metadata through the
    resolve.json API endpoint; the playable URLs are then read from the
    i1 streams endpoint, and a direct download link is added when the
    track is flagged as downloadable.
    """

    # Written in verbose syntax: it must always be matched with re.VERBOSE.
    # Three alternatives: a track page (groups uploader/title/token), an
    # API track URL (group track_id) or an embedded player (group player).
    _VALID_URL = r'''^(?:https?://)?
                    (?:(?:(?:www\.|m\.)?soundcloud\.com/
                            (?P<uploader>[\w\d-]+)/
                            (?!sets/)(?P<title>[\w\d-]+)/?
                            (?P<token>[^?]+?)?(?:[?].*)?$)
                       |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+))
                       |(?P<player>(?:w|player|p.)\.soundcloud\.com/player/?.*?url=.*)
                    )
                    '''
    IE_NAME = 'soundcloud'
    _TESTS = [
        {
            'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
            'file': '62986583.mp3',
            'md5': 'ebef0a451b909710ed1d7787dddbf0d7',
            'info_dict': {
                "upload_date": "20121011",
                "description": "No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",
                "uploader": "E.T. ExTerrestrial Music",
                "title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1"
            }
        },
        # not streamable song
        {
            'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
            'info_dict': {
                'id': '47127627',
                'ext': 'mp3',
                'title': 'Goldrushed',
                'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
                'uploader': 'The Royal Concept',
                'upload_date': '20120521',
            },
            'params': {
                # rtmp
                'skip_download': True,
            },
        },
        # private link
        {
            'url': 'https://soundcloud.com/jaimemf/youtube-dl-test-video-a-y-baw/s-8Pjrp',
            'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
            'info_dict': {
                'id': '123998367',
                'ext': 'mp3',
                'title': 'Youtube - Dl Test Video \'\' Ä↭',
                'uploader': 'jaimeMF',
                'description': 'test chars:  \"\'/\\ä↭',
                'upload_date': '20131209',
            },
        },
        # downloadable song
        {
            'url': 'https://soundcloud.com/simgretina/just-your-problem-baby-1',
            'md5': '56a8b69568acaa967b4c49f9d1d52d19',
            'info_dict': {
                'id': '105614606',
                'ext': 'wav',
                'title': 'Just Your Problem Baby (Acapella)',
                'description': 'Vocals',
                'uploader': 'Sim Gretina',
                'upload_date': '20130815',
            },
        },
    ]

    # Client id sent with the resolve, track-info and download requests.
    _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'
    # Client id sent to the i1 streams endpoint.
    _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'

    @classmethod
    def suitable(cls, url):
        """Return True if url matches this extractor's _VALID_URL.

        The match is done with re.VERBOSE because SoundcloudIE._VALID_URL
        is written in verbose syntax.
        """
        return re.match(cls._VALID_URL, url, flags=re.VERBOSE) is not None

    def report_resolve(self, video_id):
        """Print a status line saying that video_id is being resolved."""
        self.to_screen('%s: Resolving id' % video_id)

    @classmethod
    def _resolv_url(cls, url):
        """Return the resolve.json API URL that looks up a soundcloud page url."""
        return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID

    def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None):
        """Build the result dictionary for one track.

        info is the decoded track JSON; 'id', 'user', 'created_at' and
        'title' are required, 'artwork_url' and 'description' may be absent.
        full_title is the name shown in the progress message (the track id
        is used when it is not given).
        quiet suppresses the progress message.
        secret_token is the token of a private link, if any; it is passed
        on to the streams endpoint.

        Returns a dict with the track metadata and its 'formats' list.
        """
        track_id = compat_str(info['id'])
        name = full_title or track_id
        # Only announce the extraction when the caller did not ask for quiet
        if not quiet:
            self.report_extraction(name)

        thumbnail = info.get('artwork_url')
        if thumbnail is not None:
            # Swap the '-large' artwork variant for the '-t500x500' one
            thumbnail = thumbnail.replace('-large', '-t500x500')
        ext = 'mp3'
        result = {
            'id': track_id,
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title': info['title'],
            'description': info.get('description'),
            'thumbnail': thumbnail,
        }
        formats = []
        if info.get('downloadable', False):
            # We can build a direct link to the song
            format_url = (
                'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(
                    track_id, self._CLIENT_ID))
            formats.append({
                'format_id': 'download',
                'ext': info.get('original_format', 'mp3'),
                'url': format_url,
                'vcodec': 'none',
                # presumably ranks the original file above the streams when
                # the formats are sorted -- confirm against _sort_formats
                'preference': 10,
            })

        # We have to retrieve the url
        streams_url = (
            'http://api.soundcloud.com/i1/tracks/{0}/streams?'
            'client_id={1}'.format(track_id, self._IPHONE_CLIENT_ID))
        if secret_token:
            # Only private links carry a token; never send the text "None"
            streams_url += '&secret_token=' + secret_token
        stream_json = self._download_webpage(
            streams_url,
            track_id, 'Downloading track url')

        format_dict = json.loads(stream_json)
        for key, stream_url in format_dict.items():
            if key.startswith('http'):
                formats.append({
                    'format_id': key,
                    'ext': ext,
                    'url': stream_url,
                    'vcodec': 'none',
                    'protocol': 'http',
                })
            elif key.startswith('rtmp'):
                # The url doesn't have an rtmp app, we have to extract the playpath
                url, path = stream_url.split('mp3:', 1)
                formats.append({
                    'format_id': key,
                    'url': url,
                    'play_path': 'mp3:' + path,
                    'ext': ext,
                    'vcodec': 'none',
                    'protocol': 'rtmp',
                })

        # Everything below must run once, after the loop: the streams
        # response may be empty, in which case the loop body never executes.
        if not formats:
            # We fallback to the stream_url in the original info, this
            # cannot be always used, sometimes it can give an HTTP 404 error
            formats.append({
                'format_id': 'fallback',
                'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID,
                'ext': ext,
                'vcodec': 'none',
            })

        self._sort_formats(formats)
        result['formats'] = formats

        return result

    def _real_extract(self, url):
        """Extract a single track from any URL shape accepted by _VALID_URL.

        API track URLs are fetched directly, player URLs are redirected to
        the URL found in their 'url' query parameter, and track pages are
        looked up through the resolve endpoint first.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        track_id = mobj.group('track_id')
        token = None
        if track_id is not None:
            info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID
            full_title = track_id
        elif mobj.group('player'):
            # The real track URL is carried in the player's 'url' parameter
            query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
            return self.url_result(query['url'][0], ie='Soundcloud')
        else:
            # extract uploader (which is in the url)
            uploader = mobj.group('uploader')
            # extract simple title (uploader + slug of song title)
            slug_title = mobj.group('title')
            token = mobj.group('token')
            # The token is part of the page to resolve but is kept out of
            # the name shown to the user.
            full_title = resolve_title = '%s/%s' % (uploader, slug_title)
            if token:
                resolve_title += '/%s' % token

            self.report_resolve(full_title)

            url = 'http://soundcloud.com/%s' % resolve_title
            info_json_url = self._resolv_url(url)
        info_json = self._download_webpage(info_json_url, full_title, 'Downloading info JSON')

        info = json.loads(info_json)
        return self._extract_info_dict(info, full_title, secret_token=token)

class SoundcloudSetIE(SoundcloudIE):
    """Information extractor for soundcloud.com sets.

    The set page is looked up through the resolve endpoint and every
    track listed in the answer becomes one playlist entry.
    """

    _VALID_URL = r'https?://(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = 'soundcloud:set'
    # it's in tests/test_playlists.py
    _TESTS = []

    def _real_extract(self, url):
        """Return a playlist dict for the set at url.

        Raises ExtractorError if url does not match _VALID_URL.  When the
        API answer contains 'errors', each one is reported through the
        downloader and None is returned.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the set title (also in the url)
        slug_title = mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = self._resolv_url(url)
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error('unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        entries = [self._extract_info_dict(track) for track in info['tracks']]
        return {
            '_type': 'playlist',
            'entries': entries,
            # Converted to a string, like the ids of tracks and of user
            # playlists built elsewhere in this module.
            'id': compat_str(info['id']),
            'title': info['title'],
        }


class SoundcloudUserIE(SoundcloudIE):
    """Information extractor for all the tracks of a soundcloud.com user.

    The user page is looked up through the resolve endpoint, then the
    user's tracks are downloaded page by page and returned as a playlist.
    """

    _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$'
    IE_NAME = 'soundcloud:user'

    # it's in tests/test_playlists.py
    _TESTS = []

    def _real_extract(self, url):
        """Return a playlist dict with every track of the user at url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        uploader = mobj.group('user')

        url = 'http://soundcloud.com/%s/' % uploader
        resolv_url = self._resolv_url(url)
        user_json = self._download_webpage(
            resolv_url, uploader, 'Downloading user info')
        user = json.loads(user_json)

        # Offset step between requests; a shorter page ends the listing.
        # NOTE(review): assumes the API returns 50 tracks per page by
        # default, since no explicit limit is sent -- confirm.
        page_size = 50

        tracks = []
        for i in itertools.count():
            data = compat_urllib_parse.urlencode({
                'offset': i * page_size,
                'client_id': self._CLIENT_ID,
            })
            tracks_url = 'http://api.soundcloud.com/users/%s/tracks.json?' % user['id'] + data
            response = self._download_webpage(
                tracks_url, uploader, 'Downloading tracks page %s' % (i + 1))
            new_tracks = json.loads(response)
            tracks.extend(self._extract_info_dict(track, quiet=True) for track in new_tracks)
            if len(new_tracks) < page_size:
                break

        return {
            '_type': 'playlist',
            'id': compat_str(user['id']),
            'title': user['username'],
            'entries': tracks,
        }
Commit	Line	Data
de2dd4c5	1	# encoding: utf-8
fbcd7b5f PH	2	from __future__ import unicode_literals
fbcd7b5f PH	3
aad0d6d5 PH	4	import json
aad0d6d5 PH	5	import re
92790f4e	6	import itertools
aad0d6d5 PH	7
	8	from .common import InfoExtractor
	9	from ..utils import (
	10	compat_str,
668de34c	11	compat_urlparse,
92790f4e	12	compat_urllib_parse,
aad0d6d5 PH	13
	14	ExtractorError,
	15	unified_strdate,
	16	)
	17
	18
	19	class SoundcloudIE(InfoExtractor):
	20	"""Information extractor for soundcloud.com
	21	To access the media, the uid of the song and a stream token
	22	must be extracted from the page source and the script must make
	23	a request to media.soundcloud.com/crossdomain.xml. Then
	24	the media can be grabbed by requesting from an url composed
	25	of the stream token and uid
	26	"""
	27
eb6a41ba	28	_VALID_URL = r'''^(?:https?://)?
71507a11	29	(?:(?:(?:www\.|m\.)?soundcloud\.com/
4ff50ef8 PH	30	(?P<uploader>[\w\d-]+)/
4ff50ef8 PH	31	(?!sets/)(?P<title>[\w\d-]+)/?
de2dd4c5	32	(?P<token>[^?]+?)?(?:[?].*)?$)
eb6a41ba	33	|(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+))
31c1cf5a	34	|(?P<player>(?:w|player|p.)\.soundcloud\.com/player/?.*?url=.*)
eb6a41ba JMF	35	)
eb6a41ba JMF	36	'''
fbcd7b5f	37	IE_NAME = 'soundcloud'
12c167c8 JMF	38	_TESTS = [
12c167c8 JMF	39	{
fbcd7b5f PH	40	'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
	41	'file': '62986583.mp3',
	42	'md5': 'ebef0a451b909710ed1d7787dddbf0d7',
	43	'info_dict': {
	44	"upload_date": "20121011",
	45	"description": "No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",
	46	"uploader": "E.T. ExTerrestrial Music",
	47	"title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1"
12c167c8 JMF	48	}
	49	},
	50	# not streamable song
	51	{
fbcd7b5f PH	52	'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
	53	'info_dict': {
	54	'id': '47127627',
	55	'ext': 'mp3',
	56	'title': 'Goldrushed',
63ad0315	57	'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
fbcd7b5f PH	58	'uploader': 'The Royal Concept',
fbcd7b5f PH	59	'upload_date': '20120521',
12c167c8	60	},
fbcd7b5f	61	'params': {
12c167c8	62	# rtmp
fbcd7b5f	63	'skip_download': True,
12c167c8 JMF	64	},
12c167c8 JMF	65	},
de2dd4c5 JMF	66	# private link
de2dd4c5 JMF	67	{
fbcd7b5f PH	68	'url': 'https://soundcloud.com/jaimemf/youtube-dl-test-video-a-y-baw/s-8Pjrp',
	69	'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
	70	'info_dict': {
	71	'id': '123998367',
	72	'ext': 'mp3',
	73	'title': 'Youtube - Dl Test Video \'\' Ä↭',
	74	'uploader': 'jaimeMF',
	75	'description': 'test chars: \"\'/\\ä↭',
	76	'upload_date': '20131209',
de2dd4c5 JMF	77	},
de2dd4c5 JMF	78	},
f67ca84d JMF	79	# downloadable song
f67ca84d JMF	80	{
fbcd7b5f PH	81	'url': 'https://soundcloud.com/simgretina/just-your-problem-baby-1',
	82	'md5': '56a8b69568acaa967b4c49f9d1d52d19',
	83	'info_dict': {
	84	'id': '105614606',
	85	'ext': 'wav',
	86	'title': 'Just Your Problem Baby (Acapella)',
	87	'description': 'Vocals',
	88	'uploader': 'Sim Gretina',
	89	'upload_date': '20130815',
f67ca84d JMF	90	},
f67ca84d JMF	91	},
12c167c8	92	]
aad0d6d5	93
7d239269	94	_CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'
64bb5187	95	_IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'
7d239269	96
eb6a41ba JMF	97	@classmethod
	98	def suitable(cls, url):
	99	return re.match(cls._VALID_URL, url, flags=re.VERBOSE) is not None
	100
aad0d6d5 PH	101	def report_resolve(self, video_id):
aad0d6d5 PH	102	"""Report information extraction."""
83622b6d	103	self.to_screen('%s: Resolving id' % video_id)
aad0d6d5	104
7d239269 JMF	105	@classmethod
	106	def _resolv_url(cls, url):
	107	return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID
	108
de2dd4c5	109	def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None):
12c167c8 JMF	110	track_id = compat_str(info['id'])
12c167c8 JMF	111	name = full_title or track_id
2a15e706	112	if quiet:
92790f4e	113	self.report_extraction(name)
7d239269 JMF	114
	115	thumbnail = info['artwork_url']
	116	if thumbnail is not None:
	117	thumbnail = thumbnail.replace('-large', '-t500x500')
fbcd7b5f	118	ext = 'mp3'
12c167c8	119	result = {
2a15e706	120	'id': track_id,
7d239269 JMF	121	'uploader': info['user']['username'],
7d239269 JMF	122	'upload_date': unified_strdate(info['created_at']),
2a15e706	123	'title': info['title'],
7d239269 JMF	124	'description': info['description'],
	125	'thumbnail': thumbnail,
	126	}
5e114e4b	127	formats = []
12c167c8	128	if info.get('downloadable', False):
64bb5187	129	# We can build a direct link to the song
2a15e706	130	format_url = (
fbcd7b5f	131	'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(
2a15e706	132	track_id, self._CLIENT_ID))
5e114e4b	133	formats.append({
2a15e706	134	'format_id': 'download',
fbcd7b5f	135	'ext': info.get('original_format', 'mp3'),
2a15e706	136	'url': format_url,
fb04e403	137	'vcodec': 'none',
5e114e4b PH	138	'preference': 10,
	139	})
	140
	141	# We have to retrieve the url
	142	streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?'
	143	'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token))
	144	stream_json = self._download_webpage(
	145	streams_url,
	146	track_id, 'Downloading track url')
	147
	148	format_dict = json.loads(stream_json)
	149	for key, stream_url in format_dict.items():
	150	if key.startswith('http'):
	151	formats.append({
	152	'format_id': key,
	153	'ext': ext,
	154	'url': stream_url,
	155	'vcodec': 'none',
	156	})
	157	elif key.startswith('rtmp'):
	158	# The url doesn't have an rtmp app, we have to extract the playpath
	159	url, path = stream_url.split('mp3:', 1)
	160	formats.append({
	161	'format_id': key,
	162	'url': url,
	163	'play_path': 'mp3:' + path,
	164	'ext': ext,
	165	'vcodec': 'none',
	166	})
2a15e706 PH	167
2a15e706 PH	168	if not formats:
64bb5187 JMF	169	# We fallback to the stream_url in the original info, this
64bb5187 JMF	170	# cannot be always used, sometimes it can give an HTTP 404 error
2a15e706	171	formats.append({
fbcd7b5f	172	'format_id': 'fallback',
2a15e706 PH	173	'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID,
2a15e706 PH	174	'ext': ext,
fb04e403	175	'vcodec': 'none',
2a15e706 PH	176	})
2a15e706 PH	177
fbcd7b5f	178	for f in formats:
2a15e706	179	if f['format_id'].startswith('http'):
fbcd7b5f	180	f['protocol'] = 'http'
2a15e706	181	if f['format_id'].startswith('rtmp'):
fbcd7b5f	182	f['protocol'] = 'rtmp'
2a15e706	183
fbcd7b5f	184	self._sort_formats(formats)
2a15e706	185	result['formats'] = formats
64bb5187	186
12c167c8	187	return result
7d239269	188
aad0d6d5	189	def _real_extract(self, url):
eb6a41ba	190	mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
aad0d6d5	191	if mobj is None:
83622b6d	192	raise ExtractorError('Invalid URL: %s' % url)
aad0d6d5	193
eb6a41ba	194	track_id = mobj.group('track_id')
de2dd4c5	195	token = None
eb6a41ba JMF	196	if track_id is not None:
	197	info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID
	198	full_title = track_id
31c1cf5a	199	elif mobj.group('player'):
668de34c JMF	200	query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
668de34c JMF	201	return self.url_result(query['url'][0], ie='Soundcloud')
eb6a41ba JMF	202	else:
eb6a41ba JMF	203	# extract uploader (which is in the url)
de2dd4c5	204	uploader = mobj.group('uploader')
eb6a41ba	205	# extract simple title (uploader + slug of song title)
de2dd4c5 JMF	206	slug_title = mobj.group('title')
	207	token = mobj.group('token')
	208	full_title = resolve_title = '%s/%s' % (uploader, slug_title)
	209	if token:
	210	resolve_title += '/%s' % token
eb6a41ba JMF	211
	212	self.report_resolve(full_title)
	213
de2dd4c5	214	url = 'http://soundcloud.com/%s' % resolve_title
eb6a41ba	215	info_json_url = self._resolv_url(url)
fbcd7b5f	216	info_json = self._download_webpage(info_json_url, full_title, 'Downloading info JSON')
aad0d6d5 PH	217
aad0d6d5 PH	218	info = json.loads(info_json)
de2dd4c5	219	return self._extract_info_dict(info, full_title, secret_token=token)
aad0d6d5	220
7d239269	221	class SoundcloudSetIE(SoundcloudIE):
9d3f7781	222	_VALID_URL = r'https?://(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
fbcd7b5f	223	IE_NAME = 'soundcloud:set'
12c167c8 JMF	224	# it's in tests/test_playlists.py
12c167c8 JMF	225	_TESTS = []
aad0d6d5	226
aad0d6d5 PH	227	def _real_extract(self, url):
	228	mobj = re.match(self._VALID_URL, url)
	229	if mobj is None:
83622b6d	230	raise ExtractorError('Invalid URL: %s' % url)
aad0d6d5 PH	231
	232	# extract uploader (which is in the url)
	233	uploader = mobj.group(1)
	234	# extract simple title (uploader + slug of song title)
	235	slug_title = mobj.group(2)
	236	full_title = '%s/sets/%s' % (uploader, slug_title)
	237
	238	self.report_resolve(full_title)
	239
	240	url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
7d239269	241	resolv_url = self._resolv_url(url)
aad0d6d5 PH	242	info_json = self._download_webpage(resolv_url, full_title)
aad0d6d5 PH	243
aad0d6d5 PH	244	info = json.loads(info_json)
	245	if 'errors' in info:
	246	for err in info['errors']:
83622b6d	247	self._downloader.report_error('unable to download video webpage: %s' % compat_str(err['error_message']))
aad0d6d5 PH	248	return
	249
	250	self.report_extraction(full_title)
7d239269 JMF	251	return {'_type': 'playlist',
	252	'entries': [self._extract_info_dict(track) for track in info['tracks']],
	253	'id': info['id'],
	254	'title': info['title'],
	255	}
92790f4e JMF	256
	257
	258	class SoundcloudUserIE(SoundcloudIE):
c0ade33e	259	_VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$'
fbcd7b5f	260	IE_NAME = 'soundcloud:user'
92790f4e JMF	261
92790f4e JMF	262	# it's in tests/test_playlists.py
12c167c8	263	_TESTS = []
92790f4e JMF	264
	265	def _real_extract(self, url):
	266	mobj = re.match(self._VALID_URL, url)
	267	uploader = mobj.group('user')
	268
	269	url = 'http://soundcloud.com/%s/' % uploader
	270	resolv_url = self._resolv_url(url)
	271	user_json = self._download_webpage(resolv_url, uploader,
fbcd7b5f	272	'Downloading user info')
92790f4e JMF	273	user = json.loads(user_json)
	274
	275	tracks = []
	276	for i in itertools.count():
	277	data = compat_urllib_parse.urlencode({'offset': i*50,
	278	'client_id': self._CLIENT_ID,
	279	})
	280	tracks_url = 'http://api.soundcloud.com/users/%s/tracks.json?' % user['id'] + data
	281	response = self._download_webpage(tracks_url, uploader,
fbcd7b5f	282	'Downloading tracks page %s' % (i+1))
92790f4e JMF	283	new_tracks = json.loads(response)
	284	tracks.extend(self._extract_info_dict(track, quiet=True) for track in new_tracks)
	285	if len(new_tracks) < 50:
	286	break
	287
	288	return {
	289	'_type': 'playlist',
	290	'id': compat_str(user['id']),
	291	'title': user['username'],
	292	'entries': tracks,
	293	}