]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/mixcloud.py
[packtpub] add support for authentication(closes #12622)
[yt-dlp.git] / youtube_dl / extractor / mixcloud.py
CommitLineData
d0390a0c
PH
1from __future__ import unicode_literals
2
dd91dfcd 3import base64
9c250931 4import functools
e6da9240 5import itertools
80cbb6dd 6import re
80cbb6dd
PH
7
8from .common import InfoExtractor
c96eca42 9from ..compat import (
dd91dfcd
YCH
10 compat_chr,
11 compat_ord,
c96eca42 12 compat_urllib_parse_unquote,
9c250931 13 compat_urlparse,
c96eca42 14)
1cc79574 15from ..utils import (
9c250931 16 clean_html,
baa7b197 17 ExtractorError,
9c250931 18 OnDemandPagedList,
b80505a4 19 str_to_int,
80cbb6dd
PH
20)
21
22
class MixcloudIE(InfoExtractor):
    """Extractor for a single Mixcloud cloudcast page.

    The stream URL is read from the obfuscated JSON stored in the page's
    ``m-play-info`` attribute; the remaining metadata is scraped from other
    attributes and meta tags of the same page.
    """

    _VALID_URL = r'https?://(?:(?:www|beta|m)\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)'
    IE_NAME = 'mixcloud'

    _TESTS = [{
        'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/',
        'info_dict': {
            'id': 'dholbach-cryptkeeper',
            'ext': 'm4a',
            'title': 'Cryptkeeper',
            'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
            'uploader': 'Daniel Holbach',
            'uploader_id': 'dholbach',
            'thumbnail': r're:https?://.*\.jpg',
            'view_count': int,
        },
    }, {
        'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',
        'info_dict': {
            'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat',
            'ext': 'mp3',
            'title': 'Caribou 7 inch Vinyl Mix & Chat',
            'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
            'uploader': 'Gilles Peterson Worldwide',
            'uploader_id': 'gillespeterson',
            'thumbnail': 're:https?://.*',
            'view_count': int,
        },
    }, {
        'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/',
        'only_matching': True,
    }]

    # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js
    @staticmethod
    def _decrypt_play_info(play_info):
        """Decode the value of the ``m-play-info`` attribute into text.

        The value is base64-decoded and each resulting byte is XORed with
        the matching character of a fixed key, which repeats cyclically.
        """
        KEY = 'pleasedontdownloadourmusictheartistswontgetpaid'

        play_info = base64.b64decode(play_info.encode('ascii'))

        return ''.join([
            compat_chr(compat_ord(ch) ^ compat_ord(KEY[idx % len(KEY)]))
            for idx, ch in enumerate(play_info)])

    def _real_extract(self, url):
        """Download the cloudcast page and build its info dict.

        Raises ExtractorError carrying the notice shown on the page when
        the cloudcast is disabled and no stream URL is available, and a
        generic ExtractorError when the stream URL is missing without any
        such notice.
        """
        mobj = re.match(self._VALID_URL, url)
        uploader = mobj.group(1)
        cloudcast_name = mobj.group(2)
        track_id = compat_urllib_parse_unquote('-'.join((uploader, cloudcast_name)))

        webpage = self._download_webpage(url, track_id)

        message = self._html_search_regex(
            r'(?s)<div[^>]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)',
            webpage, 'error message', default=None)

        # When the page shows a notice, a missing play info attribute must
        # not abort the extraction here: the notice is the more useful error
        # to report. Without a notice the search keeps its default behaviour.
        search_kwargs = {'default': None} if message else {}
        encrypted_play_info = self._search_regex(
            r'm-play-info="([^"]+)"', webpage, 'play info', **search_kwargs)

        play_info = {}
        if encrypted_play_info:
            play_info = self._parse_json(
                self._decrypt_play_info(encrypted_play_info), track_id)

        if 'stream_url' not in play_info:
            if message:
                raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
            # Report a proper extraction error instead of a bare KeyError.
            raise ExtractorError('Unable to extract stream URL')

        song_url = play_info['stream_url']

        title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title')
        thumbnail = self._proto_relative_url(self._html_search_regex(
            r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False))
        uploader = self._html_search_regex(
            r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False)
        uploader_id = self._search_regex(
            r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
        description = self._og_search_description(webpage)
        # Several page layouts expose the play count; try each in turn.
        view_count = str_to_int(self._search_regex(
            [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
             r'/listeners/?">([0-9,.]+)</a>',
             r'(?:m|data)-tooltip=["\']([\d,.]+) plays'],
            webpage, 'play count', default=None))

        return {
            'id': track_id,
            'title': title,
            'url': song_url,
            'description': description,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'view_count': view_count,
        }
c96eca42
PH
113
114
9c250931
YCH
class MixcloudPlaylistBaseIE(InfoExtractor):
    """Shared helpers for the Mixcloud extractors that return playlists."""

    _PAGE_SIZE = 24

    def _find_urls_in_page(self, page):
        """Yield a url_result for every play button URL found in ``page``."""
        for relative_url in re.findall(r'm-play-button m-url="(?P<url>[^"]+)"', page):
            absolute_url = compat_urlparse.urljoin(
                'https://www.mixcloud.com', clean_html(relative_url))
            yield self.url_result(absolute_url, MixcloudIE.ie_key())

    def _fetch_tracks_page(self, path, video_id, page_name, current_page, real_page_number=None):
        """Download one AJAX listing page below ``path`` and return it.

        ``current_page`` is zero-based and is used for the progress note;
        ``real_page_number`` overrides the page number that is requested
        and defaults to ``current_page + 1``.
        """
        if not real_page_number:
            real_page_number = current_page + 1
        shown_page = current_page + 1
        ajax_query = {'page': real_page_number, 'list': 'main', '_ajax': '1'}
        return self._download_webpage(
            'https://www.mixcloud.com/%s/' % path, video_id,
            note='Download %s (page %d)' % (page_name, shown_page),
            errnote='Unable to download %s' % page_name,
            query=ajax_query,
            headers={'X-Requested-With': 'XMLHttpRequest'})

    def _tracks_page_func(self, page, video_id, page_name, current_page):
        """Yield the entries of one listing page.

        Despite its name, ``page`` is the site path handed on to
        ``_fetch_tracks_page``.
        """
        listing = self._fetch_tracks_page(page, video_id, page_name, current_page)

        for entry in self._find_urls_in_page(listing):
            yield entry

    def _get_user_description(self, page_content):
        """Return the text of the profile bio block, searched non-fatally."""
        bio_pattern = r'<div[^>]+class="profile-bio"[^>]*>(.+?)</div>'
        return self._html_search_regex(
            bio_pattern, page_content, 'user description', fatal=False)
143
144
class MixcloudUserIE(MixcloudPlaylistBaseIE):
    """Extractor for a user's uploads, favorites or listens listing."""

    _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/(?P<type>uploads|favorites|listens)?/?$'
    IE_NAME = 'mixcloud:user'

    _TESTS = [{
        'url': 'http://www.mixcloud.com/dholbach/',
        'info_dict': {
            'id': 'dholbach_uploads',
            'title': 'Daniel Holbach (uploads)',
            'description': 'md5:def36060ac8747b3aabca54924897e47',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'http://www.mixcloud.com/dholbach/uploads/',
        'info_dict': {
            'id': 'dholbach_uploads',
            'title': 'Daniel Holbach (uploads)',
            'description': 'md5:def36060ac8747b3aabca54924897e47',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'http://www.mixcloud.com/dholbach/favorites/',
        'info_dict': {
            'id': 'dholbach_favorites',
            'title': 'Daniel Holbach (favorites)',
            'description': 'md5:def36060ac8747b3aabca54924897e47',
        },
        'params': {
            'playlist_items': '1-100',
        },
        'playlist_mincount': 100,
    }, {
        'url': 'http://www.mixcloud.com/dholbach/listens/',
        'info_dict': {
            'id': 'dholbach_listens',
            'title': 'Daniel Holbach (listens)',
            'description': 'md5:def36060ac8747b3aabca54924897e47',
        },
        'params': {
            'playlist_items': '1-100',
        },
        'playlist_mincount': 100,
    }]

    def _real_extract(self, url):
        """Return a lazily paged playlist for the requested user listing."""
        user_id, list_type = re.match(self._VALID_URL, url).group('user', 'type')

        # A bare profile URL names no listing; fall back to the uploads.
        list_type = 'uploads' if list_type is None else list_type

        video_id = '%s_%s' % (user_id, list_type)

        profile = self._download_webpage(
            'https://www.mixcloud.com/%s/' % user_id, video_id,
            note='Downloading user profile',
            errnote='Unable to download user profile')

        username = self._og_search_title(profile)
        description = self._get_user_description(profile)

        # Bind everything but the page index; the paged list supplies it.
        fetch_page = functools.partial(
            self._tracks_page_func,
            '%s/%s' % (user_id, list_type), video_id, 'list of %s' % list_type)
        entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE, use_cache=True)

        playlist_title = '%s (%s)' % (username, list_type)
        return self.playlist_result(entries, video_id, playlist_title, description)
c96eca42 216
c96eca42 217
class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
    """Extractor for a user-curated Mixcloud playlist."""

    _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/playlists/(?P<playlist>[^/]+)/?$'
    IE_NAME = 'mixcloud:playlist'

    _TESTS = [{
        'url': 'https://www.mixcloud.com/RedBullThre3style/playlists/tokyo-finalists-2015/',
        'info_dict': {
            'id': 'RedBullThre3style_tokyo-finalists-2015',
            'title': 'National Champions 2015',
            'description': 'md5:6ff5fb01ac76a31abc9b3939c16243a3',
        },
        'playlist_mincount': 16,
    }, {
        'url': 'https://www.mixcloud.com/maxvibes/playlists/jazzcat-on-ness-radio/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a lazily paged playlist for the playlist at ``url``."""
        user_id, playlist_id = re.match(self._VALID_URL, url).group('user', 'playlist')
        video_id = '%s_%s' % (user_id, playlist_id)

        webpage = self._download_webpage(
            url, user_id,
            note='Downloading playlist page',
            errnote='Unable to download playlist page')

        # Prefer the title of the active navigation entry; fall back to the
        # Open Graph title when that markup is not found.
        title = self._html_search_regex(
            r'<a[^>]+class="parent active"[^>]*><b>\d+</b><span[^>]*>([^<]+)',
            webpage, 'playlist title',
            default=None)
        if not title:
            title = self._og_search_title(webpage, fatal=False)
        description = self._get_user_description(webpage)

        # Bind everything but the page index; the paged list supplies it.
        fetch_page = functools.partial(
            self._tracks_page_func,
            '%s/playlists/%s' % (user_id, playlist_id), video_id, 'tracklist')
        entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)

        return self.playlist_result(entries, video_id, title, description)
e6da9240
YCH
259
260
class MixcloudStreamIE(MixcloudPlaylistBaseIE):
    """Extractor for a user's Mixcloud stream page."""

    _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/stream/?$'
    IE_NAME = 'mixcloud:stream'

    _TEST = {
        'url': 'https://www.mixcloud.com/FirstEar/stream/',
        'info_dict': {
            'id': 'FirstEar',
            'title': 'First Ear',
            'description': 'Curators of good music\nfirstearmusic.com',
        },
        'playlist_mincount': 192,
    }

    def _real_extract(self, url):
        """Follow the stream's next-page links and return all entries."""
        user_id = self._match_id(url)

        webpage = self._download_webpage(url, user_id)

        entries = []

        def _collect(page):
            # Store this page's entries and hand back the link to the next
            # page, or None when the page does not advertise one.
            entries.extend(self._find_urls_in_page(page))
            return self._search_regex(
                r'm-next-page-url="([^"]+)"', page,
                'next page URL', default=None)

        visited_url = None
        pending_url = _collect(webpage)
        request_index = 0

        # Stop when no further page is advertised or when a page points
        # back at the URL that was just fetched.
        while pending_url and pending_url != visited_url:
            visited_url = pending_url
            page_number = int(self._search_regex(
                r'\?page=(\d+)', pending_url, 'next page number'))

            pending_url = _collect(self._fetch_tracks_page(
                '%s/stream' % user_id, user_id, 'stream', request_index,
                real_page_number=page_number))
            request_index += 1

        username = self._og_search_title(webpage)
        description = self._get_user_description(webpage)

        return self.playlist_result(entries, user_id, username, description)