[yt-dlp.git] / youtube_dl / extractor / soundcloud.py

# encoding: utf-8
from __future__ import unicode_literals

import json
import re
import itertools

from .common import InfoExtractor
from ..utils import (
    compat_str,
    compat_urlparse,
    compat_urllib_parse,

    ExtractorError,
    unified_strdate,
)


class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

       Three kinds of URL are accepted (see _VALID_URL):
         * track pages (soundcloud.com/<uploader>/<title>[/<token>]),
           which are turned into track metadata through the
           resolve.json API endpoint;
         * API track URLs (api.soundcloud.com/tracks/<id>), whose
           metadata is requested directly;
         * player URLs, which are redirected to the URL carried in
           their ``url`` query parameter.
       The media URLs are then either the API download link (for
       downloadable tracks) or the entries of the "streams" endpoint.
     """

    # NOTE: this pattern is written in verbose mode, it must always be
    # matched with flags=re.VERBOSE (see suitable() and _real_extract()).
    _VALID_URL = r'''^(?:https?://)?
                    (?:(?:(?:www\.|m\.)?soundcloud\.com/
                            (?P<uploader>[\w\d-]+)/
                            (?!sets/)(?P<title>[\w\d-]+)/?
                            (?P<token>[^?]+?)?(?:[?].*)?$)
                       |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+))
                       |(?P<player>(?:w|player|p.)\.soundcloud\.com/player/?.*?url=.*)
                    )
                    '''
    IE_NAME = 'soundcloud'
    _TESTS = [
        {
            'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
            'file': '62986583.mp3',
            'md5': 'ebef0a451b909710ed1d7787dddbf0d7',
            'info_dict': {
                "upload_date": "20121011",
                "description": "No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",
                "uploader": "E.T. ExTerrestrial Music",
                "title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1"
            }
        },
        # not streamable song
        {
            'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
            'info_dict': {
                'id': '47127627',
                'ext': 'mp3',
                'title': 'Goldrushed',
                'uploader': 'The Royal Concept',
                'upload_date': '20120521',
            },
            'params': {
                # rtmp
                'skip_download': True,
            },
        },
        # private link
        {
            'url': 'https://soundcloud.com/jaimemf/youtube-dl-test-video-a-y-baw/s-8Pjrp',
            'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
            'info_dict': {
                'id': '123998367',
                'ext': 'mp3',
                'title': 'Youtube - Dl Test Video \'\' Ä↭',
                'uploader': 'jaimeMF',
                'description': 'test chars:  \"\'/\\ä↭',
                'upload_date': '20131209',
            },
        },
        # downloadable song
        {
            'url': 'https://soundcloud.com/simgretina/just-your-problem-baby-1',
            'md5': '56a8b69568acaa967b4c49f9d1d52d19',
            'info_dict': {
                'id': '105614606',
                'ext': 'wav',
                'title': 'Just Your Problem Baby (Acapella)',
                'description': 'Vocals',
                'uploader': 'Sim Gretina',
                'upload_date': '20130815',
            },
        },
    ]

    # Client id used for the resolve, track info and download endpoints
    _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'
    # Client id used for the "i1" streams endpoint
    _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'

    @classmethod
    def suitable(cls, url):
        """Return True if url matches _VALID_URL (matched in verbose mode)."""
        return re.match(cls._VALID_URL, url, flags=re.VERBOSE) is not None

    def report_resolve(self, video_id):
        """Report that the id of video_id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    @classmethod
    def _resolv_url(cls, url):
        """Return the resolve.json API URL that looks up a soundcloud.com url."""
        return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID

    def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None):
        """Build the info dictionary of a single track.

        info is the decoded JSON metadata of the track, full_title is
        the name used in the progress message (the track id is used
        when it is not given), quiet disables the "extracting" progress
        message and secret_token is the token of a private link, if any.
        Returns a dictionary with the track metadata and its 'formats'.
        """
        track_id = compat_str(info['id'])
        name = full_title or track_id
        # quiet=True has to *silence* the message (the user extractor
        # passes it for every track); the test used to be inverted.
        if not quiet:
            self.report_extraction(name)

        # A missing or null artwork simply means no thumbnail
        thumbnail = info.get('artwork_url')
        if thumbnail is not None:
            thumbnail = thumbnail.replace('-large', '-t500x500')
        ext = 'mp3'
        result = {
            'id': track_id,
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title': info['title'],
            'description': info['description'],
            'thumbnail': thumbnail,
        }
        if info.get('downloadable', False):
            # We can build a direct link to the song
            format_url = (
                'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(
                    track_id, self._CLIENT_ID))
            result['formats'] = [{
                'format_id': 'download',
                'ext': info.get('original_format', 'mp3'),
                'url': format_url,
                'vcodec': 'none',
            }]
        else:
            # We have to retrieve the url
            streams_url = (
                'http://api.soundcloud.com/i1/tracks/{0}/streams?'
                'client_id={1}'.format(track_id, self._IPHONE_CLIENT_ID))
            # Only private links carry a token; do not send the literal
            # string "None" for public tracks.
            if secret_token:
                streams_url += '&secret_token={0}'.format(secret_token)
            stream_json = self._download_webpage(
                streams_url,
                track_id, 'Downloading track url')

            formats = []
            format_dict = json.loads(stream_json)
            for key, stream_url in format_dict.items():
                if key.startswith(u'http'):
                    formats.append({
                        'format_id': key,
                        'ext': ext,
                        'url': stream_url,
                        'vcodec': 'none',
                        'protocol': 'http',
                    })
                elif key.startswith(u'rtmp'):
                    # The url doesn't have an rtmp app, we have to extract the playpath
                    url, path = stream_url.split('mp3:', 1)
                    formats.append({
                        'format_id': key,
                        'url': url,
                        'play_path': 'mp3:' + path,
                        'ext': ext,
                        'vcodec': 'none',
                        'protocol': 'rtmp',
                    })

            if not formats:
                # We fallback to the stream_url in the original info, this
                # cannot be always used, sometimes it can give an HTTP 404 error
                formats.append({
                    'format_id': 'fallback',
                    'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID,
                    'ext': ext,
                    'vcodec': 'none',
                })

            self._sort_formats(formats)
            result['formats'] = formats

        return result

    def _real_extract(self, url):
        """Extract the track designated by url.

        Player URLs are delegated to the URL found in their ``url``
        query parameter; every other URL yields the info dictionary of
        the track. Raises ExtractorError if url does not match
        _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        track_id = mobj.group('track_id')
        token = None
        if track_id is not None:
            # API url: the metadata can be requested directly
            info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID
            full_title = track_id
        elif mobj.group('player'):
            # Widget url: hand over the embedded url to this extractor
            query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
            return self.url_result(query['url'][0], ie='Soundcloud')
        else:
            # extract uploader (which is in the url)
            uploader = mobj.group('uploader')
            # extract simple title (uploader + slug of song title)
            slug_title = mobj.group('title')
            # the token is only present in private links
            token = mobj.group('token')
            full_title = resolve_title = '%s/%s' % (uploader, slug_title)
            if token:
                resolve_title += '/%s' % token

            self.report_resolve(full_title)

            url = 'http://soundcloud.com/%s' % resolve_title
            info_json_url = self._resolv_url(url)
        info_json = self._download_webpage(info_json_url, full_title, 'Downloading info JSON')

        info = json.loads(info_json)
        return self._extract_info_dict(info, full_title, secret_token=token)

class SoundcloudSetIE(SoundcloudIE):
    """Information extractor for soundcloud.com sets

       The set page is resolved through the resolve.json API endpoint
       and every track listed in the answer becomes a playlist entry.
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$'
    IE_NAME = 'soundcloud:set'
    # it's in tests/test_playlists.py
    _TESTS = []

    def _real_extract(self, url):
        """Extract the set designated by url as a playlist.

        Returns a playlist dictionary, or None after reporting the
        errors contained in the API answer. Raises ExtractorError if
        url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the set title (also in the url)
        slug_title = mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = self._resolv_url(url)
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        return {
            '_type': 'playlist',
            'entries': [self._extract_info_dict(track) for track in info['tracks']],
            # ids are strings everywhere else in this file (tracks, users)
            'id': compat_str(info['id']),
            'title': info['title'],
        }


class SoundcloudUserIE(SoundcloudIE):
    """Information extractor for the tracks of a soundcloud.com user

       The user page is resolved through the resolve.json API endpoint,
       then the track list is downloaded page by page and returned as
       a playlist.
     """

    _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$'
    IE_NAME = 'soundcloud:user'

    # it's in tests/test_playlists.py
    _TESTS = []

    def _real_extract(self, url):
        """Return a playlist with all the tracks of the user in url."""
        uploader = re.match(self._VALID_URL, url).group('user')

        # Look up the user record of the profile page
        profile_url = 'http://soundcloud.com/%s/' % uploader
        user_json = self._download_webpage(
            self._resolv_url(profile_url), uploader, 'Downloading user info')
        user = json.loads(user_json)

        # Tracks are requested in steps of 50; a page holding fewer
        # tracks than that is treated as the last one.
        page_size = 50
        entries = []
        page = 0
        while True:
            query = compat_urllib_parse.urlencode({
                'offset': page * page_size,
                'client_id': self._CLIENT_ID,
            })
            page_url = 'http://api.soundcloud.com/users/%s/tracks.json?' % user['id'] + query
            page_json = self._download_webpage(
                page_url, uploader, 'Downloading tracks page %s' % (page + 1))
            page_tracks = json.loads(page_json)
            for track in page_tracks:
                entries.append(self._extract_info_dict(track, quiet=True))
            if len(page_tracks) < page_size:
                break
            page += 1

        playlist = {
            '_type': 'playlist',
            'id': compat_str(user['id']),
            'title': user['username'],
            'entries': entries,
        }
        return playlist
Commit	Line	Data
de2dd4c5	1	# encoding: utf-8
fbcd7b5f PH	2	from __future__ import unicode_literals
fbcd7b5f PH	3
aad0d6d5 PH	4	import json
aad0d6d5 PH	5	import re
92790f4e	6	import itertools
aad0d6d5 PH	7
	8	from .common import InfoExtractor
	9	from ..utils import (
	10	compat_str,
668de34c	11	compat_urlparse,
92790f4e	12	compat_urllib_parse,
aad0d6d5 PH	13
	14	ExtractorError,
	15	unified_strdate,
	16	)
	17
	18
	19	class SoundcloudIE(InfoExtractor):
	20	"""Information extractor for soundcloud.com
	21	To access the media, the uid of the song and a stream token
	22	must be extracted from the page source and the script must make
	23	a request to media.soundcloud.com/crossdomain.xml. Then
	24	the media can be grabbed by requesting from an url composed
	25	of the stream token and uid
	26	"""
	27
eb6a41ba	28	_VALID_URL = r'''^(?:https?://)?
71507a11	29	(?:(?:(?:www\.\|m\.)?soundcloud\.com/
4ff50ef8 PH	30	(?P<uploader>[\w\d-]+)/
4ff50ef8 PH	31	(?!sets/)(?P<title>[\w\d-]+)/?
de2dd4c5	32	(?P<token>[^?]+?)?(?:[?].*)?$)
eb6a41ba	33	\|(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+))
31c1cf5a	34	\|(?P<player>(?:w\|player\|p.)\.soundcloud\.com/player/?.?url=.)
eb6a41ba JMF	35	)
eb6a41ba JMF	36	'''
fbcd7b5f	37	IE_NAME = 'soundcloud'
12c167c8 JMF	38	_TESTS = [
12c167c8 JMF	39	{
fbcd7b5f PH	40	'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
	41	'file': '62986583.mp3',
	42	'md5': 'ebef0a451b909710ed1d7787dddbf0d7',
	43	'info_dict': {
	44	"upload_date": "20121011",
	45	"description": "No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",
	46	"uploader": "E.T. ExTerrestrial Music",
	47	"title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1"
12c167c8 JMF	48	}
	49	},
	50	# not streamable song
	51	{
fbcd7b5f PH	52	'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
	53	'info_dict': {
	54	'id': '47127627',
	55	'ext': 'mp3',
	56	'title': 'Goldrushed',
	57	'uploader': 'The Royal Concept',
	58	'upload_date': '20120521',
12c167c8	59	},
fbcd7b5f	60	'params': {
12c167c8	61	# rtmp
fbcd7b5f	62	'skip_download': True,
12c167c8 JMF	63	},
12c167c8 JMF	64	},
de2dd4c5 JMF	65	# private link
de2dd4c5 JMF	66	{
fbcd7b5f PH	67	'url': 'https://soundcloud.com/jaimemf/youtube-dl-test-video-a-y-baw/s-8Pjrp',
	68	'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
	69	'info_dict': {
	70	'id': '123998367',
	71	'ext': 'mp3',
	72	'title': 'Youtube - Dl Test Video \'\' Ä↭',
	73	'uploader': 'jaimeMF',
	74	'description': 'test chars: \"\'/\\ä↭',
	75	'upload_date': '20131209',
de2dd4c5 JMF	76	},
de2dd4c5 JMF	77	},
f67ca84d JMF	78	# downloadable song
f67ca84d JMF	79	{
fbcd7b5f PH	80	'url': 'https://soundcloud.com/simgretina/just-your-problem-baby-1',
	81	'md5': '56a8b69568acaa967b4c49f9d1d52d19',
	82	'info_dict': {
	83	'id': '105614606',
	84	'ext': 'wav',
	85	'title': 'Just Your Problem Baby (Acapella)',
	86	'description': 'Vocals',
	87	'uploader': 'Sim Gretina',
	88	'upload_date': '20130815',
f67ca84d JMF	89	},
f67ca84d JMF	90	},
12c167c8	91	]
aad0d6d5	92
7d239269	93	_CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'
64bb5187	94	_IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'
7d239269	95
eb6a41ba JMF	96	@classmethod
	97	def suitable(cls, url):
	98	return re.match(cls._VALID_URL, url, flags=re.VERBOSE) is not None
	99
aad0d6d5 PH	100	def report_resolve(self, video_id):
	101	"""Report information extraction."""
	102	self.to_screen(u'%s: Resolving id' % video_id)
	103
7d239269 JMF	104	@classmethod
	105	def _resolv_url(cls, url):
	106	return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID
	107
de2dd4c5	108	def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None):
12c167c8 JMF	109	track_id = compat_str(info['id'])
12c167c8 JMF	110	name = full_title or track_id
2a15e706	111	if quiet:
92790f4e	112	self.report_extraction(name)
7d239269 JMF	113
	114	thumbnail = info['artwork_url']
	115	if thumbnail is not None:
	116	thumbnail = thumbnail.replace('-large', '-t500x500')
fbcd7b5f	117	ext = 'mp3'
12c167c8	118	result = {
2a15e706	119	'id': track_id,
7d239269 JMF	120	'uploader': info['user']['username'],
7d239269 JMF	121	'upload_date': unified_strdate(info['created_at']),
2a15e706	122	'title': info['title'],
7d239269 JMF	123	'description': info['description'],
	124	'thumbnail': thumbnail,
	125	}
12c167c8	126	if info.get('downloadable', False):
64bb5187	127	# We can build a direct link to the song
2a15e706	128	format_url = (
fbcd7b5f	129	'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(
2a15e706 PH	130	track_id, self._CLIENT_ID))
	131	result['formats'] = [{
	132	'format_id': 'download',
fbcd7b5f	133	'ext': info.get('original_format', 'mp3'),
2a15e706	134	'url': format_url,
fb04e403	135	'vcodec': 'none',
2a15e706	136	}]
64bb5187 JMF	137	else:
64bb5187 JMF	138	# We have to retrieve the url
de2dd4c5 JMF	139	streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?'
de2dd4c5 JMF	140	'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token))
12c167c8	141	stream_json = self._download_webpage(
de2dd4c5	142	streams_url,
fbcd7b5f	143	track_id, 'Downloading track url')
2a15e706 PH	144
	145	formats = []
	146	format_dict = json.loads(stream_json)
	147	for key, stream_url in format_dict.items():
	148	if key.startswith(u'http'):
	149	formats.append({
	150	'format_id': key,
	151	'ext': ext,
	152	'url': stream_url,
fb04e403	153	'vcodec': 'none',
2a15e706 PH	154	})
	155	elif key.startswith(u'rtmp'):
	156	# The url doesn't have an rtmp app, we have to extract the playpath
	157	url, path = stream_url.split('mp3:', 1)
	158	formats.append({
	159	'format_id': key,
	160	'url': url,
	161	'play_path': 'mp3:' + path,
	162	'ext': ext,
fb04e403	163	'vcodec': 'none',
2a15e706 PH	164	})
	165
	166	if not formats:
64bb5187 JMF	167	# We fallback to the stream_url in the original info, this
64bb5187 JMF	168	# cannot be always used, sometimes it can give an HTTP 404 error
2a15e706	169	formats.append({
fbcd7b5f	170	'format_id': 'fallback',
2a15e706 PH	171	'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID,
2a15e706 PH	172	'ext': ext,
fb04e403	173	'vcodec': 'none',
2a15e706 PH	174	})
2a15e706 PH	175
fbcd7b5f	176	for f in formats:
2a15e706	177	if f['format_id'].startswith('http'):
fbcd7b5f	178	f['protocol'] = 'http'
2a15e706	179	if f['format_id'].startswith('rtmp'):
fbcd7b5f	180	f['protocol'] = 'rtmp'
2a15e706	181
fbcd7b5f	182	self._sort_formats(formats)
2a15e706	183	result['formats'] = formats
64bb5187	184
12c167c8	185	return result
7d239269	186
aad0d6d5	187	def _real_extract(self, url):
eb6a41ba	188	mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
aad0d6d5 PH	189	if mobj is None:
	190	raise ExtractorError(u'Invalid URL: %s' % url)
	191
eb6a41ba	192	track_id = mobj.group('track_id')
de2dd4c5	193	token = None
eb6a41ba JMF	194	if track_id is not None:
	195	info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID
	196	full_title = track_id
31c1cf5a	197	elif mobj.group('player'):
668de34c JMF	198	query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
668de34c JMF	199	return self.url_result(query['url'][0], ie='Soundcloud')
eb6a41ba JMF	200	else:
eb6a41ba JMF	201	# extract uploader (which is in the url)
de2dd4c5	202	uploader = mobj.group('uploader')
eb6a41ba	203	# extract simple title (uploader + slug of song title)
de2dd4c5 JMF	204	slug_title = mobj.group('title')
	205	token = mobj.group('token')
	206	full_title = resolve_title = '%s/%s' % (uploader, slug_title)
	207	if token:
	208	resolve_title += '/%s' % token
eb6a41ba JMF	209
	210	self.report_resolve(full_title)
	211
de2dd4c5	212	url = 'http://soundcloud.com/%s' % resolve_title
eb6a41ba	213	info_json_url = self._resolv_url(url)
fbcd7b5f	214	info_json = self._download_webpage(info_json_url, full_title, 'Downloading info JSON')
aad0d6d5 PH	215
aad0d6d5 PH	216	info = json.loads(info_json)
de2dd4c5	217	return self._extract_info_dict(info, full_title, secret_token=token)
aad0d6d5	218
7d239269	219	class SoundcloudSetIE(SoundcloudIE):
20db33e2	220	_VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$'
fbcd7b5f	221	IE_NAME = 'soundcloud:set'
12c167c8 JMF	222	# it's in tests/test_playlists.py
12c167c8 JMF	223	_TESTS = []
aad0d6d5	224
aad0d6d5 PH	225	def _real_extract(self, url):
	226	mobj = re.match(self._VALID_URL, url)
	227	if mobj is None:
	228	raise ExtractorError(u'Invalid URL: %s' % url)
	229
	230	# extract uploader (which is in the url)
	231	uploader = mobj.group(1)
	232	# extract simple title (uploader + slug of song title)
	233	slug_title = mobj.group(2)
	234	full_title = '%s/sets/%s' % (uploader, slug_title)
	235
	236	self.report_resolve(full_title)
	237
	238	url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
7d239269	239	resolv_url = self._resolv_url(url)
aad0d6d5 PH	240	info_json = self._download_webpage(resolv_url, full_title)
aad0d6d5 PH	241
aad0d6d5 PH	242	info = json.loads(info_json)
	243	if 'errors' in info:
	244	for err in info['errors']:
	245	self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
	246	return
	247
	248	self.report_extraction(full_title)
7d239269 JMF	249	return {'_type': 'playlist',
	250	'entries': [self._extract_info_dict(track) for track in info['tracks']],
	251	'id': info['id'],
	252	'title': info['title'],
	253	}
92790f4e JMF	254
	255
	256	class SoundcloudUserIE(SoundcloudIE):
c0ade33e	257	_VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$'
fbcd7b5f	258	IE_NAME = 'soundcloud:user'
92790f4e JMF	259
92790f4e JMF	260	# it's in tests/test_playlists.py
12c167c8	261	_TESTS = []
92790f4e JMF	262
	263	def _real_extract(self, url):
	264	mobj = re.match(self._VALID_URL, url)
	265	uploader = mobj.group('user')
	266
	267	url = 'http://soundcloud.com/%s/' % uploader
	268	resolv_url = self._resolv_url(url)
	269	user_json = self._download_webpage(resolv_url, uploader,
fbcd7b5f	270	'Downloading user info')
92790f4e JMF	271	user = json.loads(user_json)
	272
	273	tracks = []
	274	for i in itertools.count():
	275	data = compat_urllib_parse.urlencode({'offset': i*50,
	276	'client_id': self._CLIENT_ID,
	277	})
	278	tracks_url = 'http://api.soundcloud.com/users/%s/tracks.json?' % user['id'] + data
	279	response = self._download_webpage(tracks_url, uploader,
fbcd7b5f	280	'Downloading tracks page %s' % (i+1))
92790f4e JMF	281	new_tracks = json.loads(response)
	282	tracks.extend(self._extract_info_dict(track, quiet=True) for track in new_tracks)
	283	if len(new_tracks) < 50:
	284	break
	285
	286	return {
	287	'_type': 'playlist',
	288	'id': compat_str(user['id']),
	289	'title': user['username'],
	290	'entries': tracks,
	291	}