]>
Commit | Line | Data |
---|---|---|
d0390a0c PH |
1 | from __future__ import unicode_literals |
2 | ||
9c250931 | 3 | import functools |
e6da9240 | 4 | import itertools |
80cbb6dd | 5 | import re |
80cbb6dd PH |
6 | |
7 | from .common import InfoExtractor | |
c96eca42 | 8 | from ..compat import ( |
5d7d805c | 9 | compat_b64decode, |
dd91dfcd YCH |
10 | compat_chr, |
11 | compat_ord, | |
095774e5 | 12 | compat_str, |
c96eca42 | 13 | compat_urllib_parse_unquote, |
9c250931 | 14 | compat_urlparse, |
2384f5a6 | 15 | compat_zip |
c96eca42 | 16 | ) |
1cc79574 | 17 | from ..utils import ( |
9c250931 | 18 | clean_html, |
baa7b197 | 19 | ExtractorError, |
095774e5 | 20 | int_or_none, |
9c250931 | 21 | OnDemandPagedList, |
b80505a4 | 22 | str_to_int, |
095774e5 S |
23 | try_get, |
24 | urljoin, | |
25 | ) | |
80cbb6dd PH |
26 | |
27 | ||
class MixcloudIE(InfoExtractor):
    """Extractor for a single Mixcloud cloudcast page (``/<uploader>/<name>/``).

    URLs whose second path component is one of stream, uploads, favorites,
    listens or playlists are excluded by ``_VALID_URL``.
    """

    _VALID_URL = r'https?://(?:(?:www|beta|m)\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)'
    IE_NAME = 'mixcloud'

    _TESTS = [{
        'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/',
        'info_dict': {
            'id': 'dholbach-cryptkeeper',
            'ext': 'm4a',
            'title': 'Cryptkeeper',
            'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
            'uploader': 'Daniel Holbach',
            'uploader_id': 'dholbach',
            'thumbnail': r're:https?://.*\.jpg',
            'view_count': int,
        },
    }, {
        'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',
        'info_dict': {
            'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat',
            'ext': 'mp3',
            'title': 'Caribou 7 inch Vinyl Mix & Chat',
            'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
            'uploader': 'Gilles Peterson Worldwide',
            'uploader_id': 'gillespeterson',
            'thumbnail': 're:https?://.*',
            'view_count': int,
        },
    }, {
        'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/',
        'only_matching': True,
    }]

    @staticmethod
    def _decrypt_xor_cipher(key, ciphertext):
        """Encrypt/Decrypt XOR cipher. Both ways are possible because it's XOR.

        ``key`` is repeated cyclically over ``ciphertext``; the result has the
        length of ``ciphertext`` (or is empty when ``key`` is empty, because
        the zip stops immediately).
        """
        return ''.join([
            compat_chr(compat_ord(ch) ^ compat_ord(k))
            for ch, k in compat_zip(ciphertext, itertools.cycle(key))])

    def _real_extract(self, url):
        """Extract metadata and formats for the cloudcast at ``url``.

        Two page layouts are handled:

        * legacy: an ``m-play-info`` attribute holding base64 data that,
          once XOR-decrypted, is JSON with a ``stream_url`` entry;
        * new: a ``relay-data`` script holding JSON whose
          ``streamInfo`` URLs are base64 + XOR encrypted.

        In both cases the XOR key is recovered with a known-plaintext
        attack and then looked up as a quoted string in the site's JS.

        Raises ExtractorError when no stream info or no key can be found,
        or when the legacy page shows a disabled notice and provides no
        ``stream_url``.
        """
        mobj = re.match(self._VALID_URL, url)
        uploader = mobj.group(1)
        cloudcast_name = mobj.group(2)
        track_id = compat_urllib_parse_unquote('-'.join((uploader, cloudcast_name)))

        webpage = self._download_webpage(url, track_id)

        # Legacy path
        encrypted_play_info = self._search_regex(
            r'm-play-info="([^"]+)"', webpage, 'play info', default=None)

        # Decide the path once, from the presence of the attribute. Testing
        # the decoded bytes for truthiness later on would send an attribute
        # that decodes to nothing down the new path, where info_json was
        # never assigned.
        is_legacy = encrypted_play_info is not None

        if is_legacy:
            # Decode
            encrypted_play_info = compat_b64decode(encrypted_play_info)
        else:
            # New path
            full_info_json = self._parse_json(self._html_search_regex(
                r'<script id="relay-data" type="text/x-mixcloud">([^<]+)</script>',
                webpage, 'play info'), 'play info')
            for item in full_info_json:
                item_data = try_get(
                    item, lambda x: x['cloudcast']['data']['cloudcastLookup'],
                    dict)
                # Use the first entry that actually carries a stream URL
                if try_get(item_data, lambda x: x['streamInfo']['url']):
                    info_json = item_data
                    break
            else:
                raise ExtractorError('Failed to extract matching stream info')

        message = self._html_search_regex(
            r'(?s)<div[^>]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)',
            webpage, 'error message', default=None)

        # Accept either quote style around the src attribute value
        js_url = self._search_regex(
            r'<script[^>]+\bsrc=["\'](https://(?:www\.)?mixcloud\.com/media/(?:js2/www_js_4|js/www)\.[^>]+\.js)',
            webpage, 'js url')
        js = self._download_webpage(js_url, track_id, 'Downloading JS')

        # Known plaintext attack: XORing the ciphertext with the expected
        # start of the plaintext yields the start of the key, which is then
        # searched for as the beginning of a quoted string in the JS.
        if is_legacy:
            kps = ['{"stream_url":']
            kpa_target = encrypted_play_info
        else:
            kps = ['https://', 'http://']
            kpa_target = compat_b64decode(info_json['streamInfo']['url'])
        for kp in kps:
            partial_key = self._decrypt_xor_cipher(kpa_target, kp)
            for quote in ["'", '"']:
                key = self._search_regex(
                    r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)),
                    js, 'encryption key', default=None)
                if key is not None:
                    break
            else:
                # Neither quote style matched; try the next known plaintext
                continue
            break
        else:
            raise ExtractorError('Failed to extract encryption key')

        if is_legacy:
            play_info = self._parse_json(
                self._decrypt_xor_cipher(key, encrypted_play_info), 'play info')
            if message and 'stream_url' not in play_info:
                raise ExtractorError(
                    '%s said: %s' % (self.IE_NAME, message), expected=True)
            song_url = play_info['stream_url']
            formats = [{
                'format_id': 'normal',
                'url': song_url
            }]

            title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title')
            thumbnail = self._proto_relative_url(self._html_search_regex(
                r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False))
            uploader = self._html_search_regex(
                r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False)
            uploader_id = self._search_regex(
                r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
            description = self._og_search_description(webpage)
            view_count = str_to_int(self._search_regex(
                [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
                 r'/listeners/?">([0-9,.]+)</a>',
                 r'(?:m|data)-tooltip=["\']([\d,.]+) plays'],
                webpage, 'play count', default=None))

        else:
            title = info_json['name']
            thumbnail = urljoin(
                'https://thumbnailer.mixcloud.com/unsafe/600x600/',
                try_get(info_json, lambda x: x['picture']['urlRoot'], compat_str))
            uploader = try_get(info_json, lambda x: x['owner']['displayName'])
            uploader_id = try_get(info_json, lambda x: x['owner']['username'])
            description = try_get(info_json, lambda x: x['description'])
            view_count = int_or_none(try_get(info_json, lambda x: x['plays']))

            stream_info = info_json['streamInfo']
            formats = []

            for url_key in ('url', 'hlsUrl', 'dashUrl'):
                format_url = stream_info.get(url_key)
                if not format_url:
                    continue
                decrypted = self._decrypt_xor_cipher(key, compat_b64decode(format_url))
                if not decrypted:
                    continue
                if url_key == 'hlsUrl':
                    formats.extend(self._extract_m3u8_formats(
                        decrypted, track_id, 'mp4', entry_protocol='m3u8_native',
                        m3u8_id='hls', fatal=False))
                elif url_key == 'dashUrl':
                    formats.extend(self._extract_mpd_formats(
                        decrypted, track_id, mpd_id='dash', fatal=False))
                else:
                    formats.append({
                        'format_id': 'http',
                        'url': decrypted,
                        'downloader_options': {
                            # Mixcloud starts throttling at >~5M
                            'http_chunk_size': 5242880,
                        },
                    })
            self._sort_formats(formats)

        return {
            'id': track_id,
            'title': title,
            'formats': formats,
            'description': description,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'view_count': view_count,
        }
c96eca42 PH |
199 | |
200 | ||
9c250931 YCH |
class MixcloudPlaylistBaseIE(InfoExtractor):
    """Shared helpers for the Mixcloud list extractors (user, playlist, stream)."""

    # Passed as the page size to OnDemandPagedList by the subclasses
    _PAGE_SIZE = 24

    def _find_urls_in_page(self, page):
        """Yield a url_result for every ``m-play-button m-url`` found in ``page``.

        Each matched URL is cleaned with clean_html, joined onto
        https://www.mixcloud.com and delegated to MixcloudIE.
        """
        for url in re.findall(r'm-play-button m-url="(?P<url>[^"]+)"', page):
            yield self.url_result(
                compat_urlparse.urljoin('https://www.mixcloud.com', clean_html(url)),
                MixcloudIE.ie_key())

    def _fetch_tracks_page(self, path, video_id, page_name, current_page, real_page_number=None):
        """Download one AJAX list page for ``path`` and return its content.

        ``current_page`` is zero-based and is used for the progress note.
        ``real_page_number`` is the value sent as the ``page`` query
        parameter; when it is falsy, ``current_page + 1`` is sent instead.
        """
        real_page_number = real_page_number or current_page + 1
        return self._download_webpage(
            'https://www.mixcloud.com/%s/' % path, video_id,
            note='Download %s (page %d)' % (page_name, current_page + 1),
            errnote='Unable to download %s' % page_name,
            query={'page': real_page_number, 'list': 'main', '_ajax': '1'},
            headers={'X-Requested-With': 'XMLHttpRequest'})

    def _tracks_page_func(self, page, video_id, page_name, current_page):
        """Page function for OnDemandPagedList: yield the entries of one page.

        Note that ``page`` is forwarded as the ``path`` argument of
        _fetch_tracks_page (the subclasses pass values such as
        '<user>/<list type>'), not page content.
        """
        resp = self._fetch_tracks_page(page, video_id, page_name, current_page)

        for item in self._find_urls_in_page(resp):
            yield item

    def _get_user_description(self, page_content):
        """Return the text of the ``profile-bio`` div in ``page_content``.

        The search is non-fatal (``fatal=False``).
        """
        return self._html_search_regex(
            r'<div[^>]+class="profile-bio"[^>]*>(.+?)</div>',
            page_content, 'user description', fatal=False)
229 | ||
230 | ||
class MixcloudUserIE(MixcloudPlaylistBaseIE):
    """Extractor for a user's uploads, favorites or listens list.

    A bare profile URL (no list type) is treated as the uploads list.
    """

    _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/(?P<type>uploads|favorites|listens)?/?$'
    IE_NAME = 'mixcloud:user'

    _TESTS = [{
        'url': 'http://www.mixcloud.com/dholbach/',
        'info_dict': {
            'id': 'dholbach_uploads',
            'title': 'Daniel Holbach (uploads)',
            'description': 'md5:def36060ac8747b3aabca54924897e47',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'http://www.mixcloud.com/dholbach/uploads/',
        'info_dict': {
            'id': 'dholbach_uploads',
            'title': 'Daniel Holbach (uploads)',
            'description': 'md5:def36060ac8747b3aabca54924897e47',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'http://www.mixcloud.com/dholbach/favorites/',
        'info_dict': {
            'id': 'dholbach_favorites',
            'title': 'Daniel Holbach (favorites)',
            'description': 'md5:def36060ac8747b3aabca54924897e47',
        },
        'params': {
            'playlist_items': '1-100',
        },
        'playlist_mincount': 100,
    }, {
        'url': 'http://www.mixcloud.com/dholbach/listens/',
        'info_dict': {
            'id': 'dholbach_listens',
            'title': 'Daniel Holbach (listens)',
            'description': 'md5:def36060ac8747b3aabca54924897e47',
        },
        'params': {
            'playlist_items': '1-100',
        },
        'playlist_mincount': 100,
    }]

    def _real_extract(self, url):
        """Return a playlist result for the list addressed by ``url``.

        The playlist id is '<user>_<list type>' and the title is
        '<og:title of the profile page> (<list type>)'. Entries are
        produced by an OnDemandPagedList over _tracks_page_func.
        """
        mobj = re.match(self._VALID_URL, url)
        user_id = mobj.group('user')
        list_type = mobj.group('type')

        # if only a profile URL was supplied, default to download all uploads
        if list_type is None:
            list_type = 'uploads'

        video_id = '%s_%s' % (user_id, list_type)

        profile = self._download_webpage(
            'https://www.mixcloud.com/%s/' % user_id, video_id,
            note='Downloading user profile',
            errnote='Unable to download user profile')

        username = self._og_search_title(profile)
        description = self._get_user_description(profile)

        entries = OnDemandPagedList(
            functools.partial(
                self._tracks_page_func,
                '%s/%s' % (user_id, list_type), video_id, 'list of %s' % list_type),
            self._PAGE_SIZE)

        return self.playlist_result(
            entries, video_id, '%s (%s)' % (username, list_type), description)
c96eca42 | 302 | |
c96eca42 | 303 | |
class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
    """Extractor for a user playlist (``/<user>/playlists/<playlist>/``)."""

    _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/playlists/(?P<playlist>[^/]+)/?$'
    IE_NAME = 'mixcloud:playlist'

    _TESTS = [{
        'url': 'https://www.mixcloud.com/RedBullThre3style/playlists/tokyo-finalists-2015/',
        'info_dict': {
            'id': 'RedBullThre3style_tokyo-finalists-2015',
            'title': 'National Champions 2015',
            'description': 'md5:6ff5fb01ac76a31abc9b3939c16243a3',
        },
        'playlist_mincount': 16,
    }, {
        'url': 'https://www.mixcloud.com/maxvibes/playlists/jazzcat-on-ness-radio/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a playlist result for the playlist addressed by ``url``.

        The playlist id is '<user>_<playlist>'. The title is taken from the
        active parent link on the page, falling back to the page's og:title
        (non-fatal). Entries are produced by an OnDemandPagedList over
        _tracks_page_func.
        """
        mobj = re.match(self._VALID_URL, url)
        user_id = mobj.group('user')
        playlist_id = mobj.group('playlist')
        video_id = '%s_%s' % (user_id, playlist_id)

        # Use the playlist id here, as MixcloudUserIE does for its list id,
        # rather than the bare user id.
        webpage = self._download_webpage(
            url, video_id,
            note='Downloading playlist page',
            errnote='Unable to download playlist page')

        title = self._html_search_regex(
            r'<a[^>]+class="parent active"[^>]*><b>\d+</b><span[^>]*>([^<]+)',
            webpage, 'playlist title',
            default=None) or self._og_search_title(webpage, fatal=False)
        description = self._get_user_description(webpage)

        entries = OnDemandPagedList(
            functools.partial(
                self._tracks_page_func,
                '%s/playlists/%s' % (user_id, playlist_id), video_id, 'tracklist'),
            self._PAGE_SIZE)

        return self.playlist_result(entries, video_id, title, description)
e6da9240 YCH |
345 | |
346 | ||
class MixcloudStreamIE(MixcloudPlaylistBaseIE):
    """Extractor for a user's stream page (``/<user>/stream/``)."""

    _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/stream/?$'
    IE_NAME = 'mixcloud:stream'

    _TEST = {
        'url': 'https://www.mixcloud.com/FirstEar/stream/',
        'info_dict': {
            'id': 'FirstEar',
            'title': 'First Ear',
            'description': 'Curators of good music\nfirstearmusic.com',
        },
        'playlist_mincount': 192,
    }

    def _real_extract(self, url):
        """Return a playlist result with every entry of the user's stream.

        Unlike the paged list extractors, all pages are fetched eagerly:
        each page advertises the next one through ``m-next-page-url`` and
        the loop follows those links until none is found or the link stops
        changing.
        """
        user_id = self._match_id(url)

        webpage = self._download_webpage(url, user_id)

        entries = []
        prev_page_url = None

        def _handle_page(page):
            # Collect this page's entries and return the URL of the next
            # page, or None when the page does not advertise one.
            entries.extend(self._find_urls_in_page(page))
            return self._search_regex(
                r'm-next-page-url="([^"]+)"', page,
                'next page URL', default=None)

        next_page_url = _handle_page(webpage)

        for idx in itertools.count(0):
            # Stop at the end of the stream, or if the site keeps pointing
            # at the page that was just fetched.
            if not next_page_url or prev_page_url == next_page_url:
                break

            prev_page_url = next_page_url
            # The page number to request is read from the next-page URL;
            # idx is only passed on as current_page.
            current_page = int(self._search_regex(
                r'\?page=(\d+)', next_page_url, 'next page number'))

            next_page_url = _handle_page(self._fetch_tracks_page(
                '%s/stream' % user_id, user_id, 'stream', idx,
                real_page_number=current_page))

        username = self._og_search_title(webpage)
        description = self._get_user_description(webpage)

        return self.playlist_result(entries, user_id, username, description)
391 | ||
392 | return self.playlist_result(entries, user_id, username, description) |