]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/mixcloud.py
[ie/orf:on] Improve extraction (#9677)
[yt-dlp.git] / yt_dlp / extractor / mixcloud.py
1 import itertools
2
3 from .common import InfoExtractor
4 from ..compat import (
5 compat_b64decode,
6 compat_ord,
7 compat_str,
8 compat_urllib_parse_unquote,
9 )
10 from ..utils import (
11 ExtractorError,
12 int_or_none,
13 parse_iso8601,
14 strip_or_none,
15 try_get,
16 )
17
18
class MixcloudBaseIE(InfoExtractor):
    """Common base for the Mixcloud extractors: a thin wrapper over the GraphQL endpoint."""

    def _call_api(self, object_type, object_fields, display_id, username, slug=None):
        """Query ``<object_type>Lookup`` on the Mixcloud GraphQL endpoint.

        object_type   -- lookup prefix; 'Lookup' is appended to form the field name
        object_fields -- GraphQL selection set requested on the looked-up object
        display_id    -- identifier handed to _download_json for this request
        username      -- value of the lookup's `username` argument
        slug          -- optional value of the lookup's `slug` argument;
                         the argument is omitted entirely when slug is falsy

        Returns whatever the JSON response holds under data.<object_type>Lookup.
        """
        lookup_key = f'{object_type}Lookup'
        slug_clause = ', slug: "%s"' % slug if slug else ''
        # NOTE: username and slug are interpolated verbatim (no escaping).
        graphql_query = '''{
%s(lookup: {username: "%s"%s}) {
%s
}
}''' % (lookup_key, username, slug_clause, object_fields)
        response = self._download_json(
            'https://app.mixcloud.com/graphql', display_id,
            query={'query': graphql_query})
        return response['data'][lookup_key]
30
31
class MixcloudIE(MixcloudBaseIE):
    """Extractor for a single Mixcloud upload page (``/<username>/<slug>/``)."""

    # Two positional groups: username and slug. The negative lookahead keeps
    # the listing sections (stream, uploads, ...) from matching as a slug.
    _VALID_URL = r'https?://(?:(?:www|beta|m)\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)'
    IE_NAME = 'mixcloud'

    _TESTS = [{
        'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/',
        'info_dict': {
            'id': 'dholbach_cryptkeeper',
            'ext': 'm4a',
            'title': 'Cryptkeeper',
            'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
            'uploader': 'Daniel Holbach',
            'uploader_id': 'dholbach',
            'thumbnail': r're:https?://.*\.jpg',
            'view_count': int,
            'timestamp': 1321359578,
            'upload_date': '20111115',
            'uploader_url': 'https://www.mixcloud.com/dholbach/',
            'artist': 'Submorphics & Chino , Telekinesis, Porter Robinson, Enei, Breakage ft Jess Mills',
            'duration': 3723,
            # Only the type is pinned: tag names are now actually extracted
            # and their exact values are site data.
            'tags': list,
            'comment_count': int,
            'repost_count': int,
            'like_count': int,
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',
        'info_dict': {
            'id': 'gillespeterson_caribou-7-inch-vinyl-mix-chat',
            'ext': 'mp3',
            'title': 'Caribou 7 inch Vinyl Mix & Chat',
            'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
            'uploader': 'Gilles Peterson Worldwide',
            'uploader_id': 'gillespeterson',
            'thumbnail': 're:https?://.*',
            'view_count': int,
            'timestamp': 1422987057,
            'upload_date': '20150203',
            'uploader_url': 'https://www.mixcloud.com/gillespeterson/',
            'duration': 2992,
            'tags': list,
            'comment_count': int,
            'repost_count': int,
            'like_count': int,
        },
        'params': {'skip_download': '404 playback error on site'},
    }, {
        'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/',
        'only_matching': True,
    }]
    # Key the base64-decoded stream URLs are XOR-ed against (see _real_extract).
    _DECRYPTION_KEY = 'IFYOUWANTTHEARTISTSTOGETPAIDDONOTDOWNLOADFROMMIXCLOUD'

    @staticmethod
    def _decrypt_xor_cipher(key, ciphertext):
        """Encrypt/Decrypt XOR cipher. Both ways are possible because it's XOR.

        The key is repeated cyclically over the whole ciphertext; the result
        is returned as a str, one character per input element.
        """
        return ''.join([
            chr(compat_ord(ch) ^ compat_ord(k))
            for ch, k in zip(ciphertext, itertools.cycle(key))])

    def _real_extract(self, url):
        """Look up the cloudcast named by *url* and build its info dict.

        Raises ExtractorError (expected) when the lookup returns nothing or
        when the API reports a `restrictedReason` for the track.
        """
        username, slug = self._match_valid_url(url).groups()
        username, slug = compat_urllib_parse_unquote(username), compat_urllib_parse_unquote(slug)
        track_id = '%s_%s' % (username, slug)

        # The selection set is kept flush-left; GraphQL ignores whitespace
        # between tokens.
        cloudcast = self._call_api('cloudcast', '''audioLength
comments(first: 100) {
edges {
node {
comment
created
user {
displayName
username
}
}
}
totalCount
}
description
favorites {
totalCount
}
featuringArtistList
isExclusive
name
owner {
displayName
url
username
}
picture(width: 1024, height: 1024) {
url
}
plays
publishDate
reposts {
totalCount
}
streamInfo {
dashUrl
hlsUrl
url
}
tags {
tag {
name
}
}
restrictedReason
id''', track_id, username, slug)

        if not cloudcast:
            raise ExtractorError('Track not found', expected=True)

        reason = cloudcast.get('restrictedReason')
        if reason == 'tracklist':
            raise ExtractorError('Track unavailable in your country due to licensing restrictions', expected=True)
        elif reason == 'repeat_play':
            raise ExtractorError('You have reached your play limit for this track', expected=True)
        elif reason:
            raise ExtractorError('Track is restricted', expected=True)

        title = cloudcast['name']

        # A null streamInfo must not crash here: with no formats the
        # isExclusive check below still gets a chance to report the reason.
        stream_info = cloudcast.get('streamInfo') or {}
        formats = []

        for url_key in ('url', 'hlsUrl', 'dashUrl'):
            format_url = stream_info.get(url_key)
            if not format_url:
                continue
            # Each URL arrives base64-encoded and XOR-ed with _DECRYPTION_KEY.
            decrypted = self._decrypt_xor_cipher(
                self._DECRYPTION_KEY, compat_b64decode(format_url))
            if url_key == 'hlsUrl':
                formats.extend(self._extract_m3u8_formats(
                    decrypted, track_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False))
            elif url_key == 'dashUrl':
                formats.extend(self._extract_mpd_formats(
                    decrypted, track_id, mpd_id='dash', fatal=False))
            else:
                formats.append({
                    'format_id': 'http',
                    'url': decrypted,
                    'vcodec': 'none',
                    'downloader_options': {
                        # Mixcloud starts throttling at >~5M
                        'http_chunk_size': 5242880,
                    },
                })

        if not formats and cloudcast.get('isExclusive'):
            self.raise_login_required(metadata_available=True)

        comments = []
        for edge in (try_get(cloudcast, lambda x: x['comments']['edges']) or []):
            node = edge.get('node') or {}
            text = strip_or_none(node.get('comment'))
            if not text:
                # Skip comments that are empty after stripping.
                continue
            user = node.get('user') or {}
            comments.append({
                'author': user.get('displayName'),
                'author_id': user.get('username'),
                'text': text,
                'timestamp': parse_iso8601(node.get('created')),
            })

        # Keep only entries that carry a string tag name. The `or []`
        # tolerates a null `tags` field.
        tags = []
        for t in cloudcast.get('tags') or []:
            tag = try_get(t, lambda x: x['tag']['name'], compat_str)
            if tag:
                tags.append(tag)

        # favorites/reposts/comments all expose their size as `totalCount`.
        get_count = lambda x: int_or_none(try_get(cloudcast, lambda y: y[x]['totalCount']))

        owner = cloudcast.get('owner') or {}

        return {
            'id': track_id,
            'title': title,
            'formats': formats,
            'description': cloudcast.get('description'),
            'thumbnail': try_get(cloudcast, lambda x: x['picture']['url'], compat_str),
            'uploader': owner.get('displayName'),
            'timestamp': parse_iso8601(cloudcast.get('publishDate')),
            'uploader_id': owner.get('username'),
            'uploader_url': owner.get('url'),
            'duration': int_or_none(cloudcast.get('audioLength')),
            'view_count': int_or_none(cloudcast.get('plays')),
            'like_count': get_count('favorites'),
            'repost_count': get_count('reposts'),
            'comment_count': get_count('comments'),
            'comments': comments,
            'tags': tags,
            'artist': ', '.join(cloudcast.get('featuringArtistList') or []) or None,
        }
230
231
class MixcloudPlaylistBaseIE(MixcloudBaseIE):
    """Shared paging logic for Mixcloud listings.

    _real_extract reads these class attributes, which this base class does
    not define itself:
      _ROOT_TYPE       -- lookup prefix handed to _call_api; the value
                          'playlist' switches the listing field to `items`
      _TITLE_KEY       -- field of the looked-up object used as the title
      _DESCRIPTION_KEY -- field of the looked-up object used as the description
      _NODE_TEMPLATE   -- GraphQL selection requested for every listed node
    """

    def _get_cloudcast(self, node):
        """Return the cloudcast dict held by a listing node (here: the node itself)."""
        return node

    def _get_playlist_title(self, title, slug):
        """Return the playlist title to report (here: *title* unchanged)."""
        return title

    def _real_extract(self, url):
        """Page through the listing named by *url* and return a playlist result.

        Raises ExtractorError (expected) when the lookup returns no object.
        """
        username, slug = self._match_valid_url(url).groups()
        username = compat_urllib_parse_unquote(username)
        # A URL without a section falls back to the 'uploads' listing.
        slug = compat_urllib_parse_unquote(slug) if slug else 'uploads'
        playlist_id = '%s_%s' % (username, slug)

        is_playlist_type = self._ROOT_TYPE == 'playlist'
        playlist_type = 'items' if is_playlist_type else slug
        list_filter = ''

        has_next_page = True
        entries = []
        while has_next_page:
            playlist = self._call_api(
                self._ROOT_TYPE, '''%s
%s
%s(first: 100%s) {
edges {
node {
%s
}
}
pageInfo {
endCursor
hasNextPage
}
}''' % (self._TITLE_KEY, self._DESCRIPTION_KEY, playlist_type, list_filter, self._NODE_TEMPLATE),
                playlist_id, username, slug if is_playlist_type else None)

            if not playlist:
                # Same handling as the single-track extractor: report a
                # missing object as an expected error.
                raise ExtractorError(
                    '%s not found' % self._ROOT_TYPE.capitalize(), expected=True)

            items = playlist.get(playlist_type) or {}
            for edge in items.get('edges') or []:
                cloudcast = self._get_cloudcast(edge.get('node') or {})
                cloudcast_url = cloudcast.get('url')
                if not cloudcast_url:
                    continue
                item_slug = try_get(cloudcast, lambda x: x['slug'], compat_str)
                owner_username = try_get(cloudcast, lambda x: x['owner']['username'], compat_str)
                video_id = f'{owner_username}_{item_slug}' if item_slug and owner_username else None
                entries.append(self.url_result(
                    cloudcast_url, MixcloudIE.ie_key(), video_id))

            # `items` may have fallen back to {} above, so read the paging
            # info defensively. Stop if no cursor is given rather than
            # requesting `after: "None"`.
            page_info = items.get('pageInfo') or {}
            end_cursor = page_info.get('endCursor')
            has_next_page = bool(page_info.get('hasNextPage') and end_cursor)
            list_filter = ', after: "%s"' % end_cursor

        return self.playlist_result(
            entries, playlist_id,
            self._get_playlist_title(playlist[self._TITLE_KEY], slug),
            playlist.get(self._DESCRIPTION_KEY))
291
292
class MixcloudUserIE(MixcloudPlaylistBaseIE):
    """Extractor for a user's listing page (uploads, favorites, listens or stream)."""

    IE_NAME = 'mixcloud:user'
    # The `type` group is optional, so a bare profile URL also matches.
    _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/(?P<type>uploads|favorites|listens|stream)?/?$'

    # Values read by the inherited _real_extract.
    _ROOT_TYPE = 'user'
    _TITLE_KEY = 'displayName'
    _DESCRIPTION_KEY = 'biog'
    _NODE_TEMPLATE = '''slug
url
owner { username }'''

    _TESTS = [{
        'url': 'http://www.mixcloud.com/dholbach/',
        'info_dict': {
            'id': 'dholbach_uploads',
            'title': 'Daniel Holbach (uploads)',
            'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b',
        },
        'playlist_mincount': 36,
    }, {
        'url': 'http://www.mixcloud.com/dholbach/uploads/',
        'info_dict': {
            'id': 'dholbach_uploads',
            'title': 'Daniel Holbach (uploads)',
            'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b',
        },
        'playlist_mincount': 36,
    }, {
        'url': 'http://www.mixcloud.com/dholbach/favorites/',
        'info_dict': {
            'id': 'dholbach_favorites',
            'title': 'Daniel Holbach (favorites)',
            'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b',
        },
        # 'params': {
        #     'playlist_items': '1-100',
        # },
        'playlist_mincount': 396,
    }, {
        'url': 'http://www.mixcloud.com/dholbach/listens/',
        'info_dict': {
            'id': 'dholbach_listens',
            'title': 'Daniel Holbach (listens)',
            'description': 'md5:b60d776f0bab534c5dabe0a34e47a789',
        },
        # 'params': {
        #     'playlist_items': '1-100',
        # },
        'playlist_mincount': 1623,
        'skip': 'Large list',
    }, {
        'url': 'https://www.mixcloud.com/FirstEar/stream/',
        'info_dict': {
            'id': 'FirstEar_stream',
            'title': 'First Ear (stream)',
            'description': 'we maraud for ears',
        },
        'playlist_mincount': 269,
    }]

    def _get_playlist_title(self, title, slug):
        """Return *title* followed by the listing section in parentheses."""
        return f'{title} ({slug})'
355
356
class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
    """Extractor for a playlist page (``/<user>/playlists/<playlist>/``)."""

    IE_NAME = 'mixcloud:playlist'
    _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/playlists/(?P<playlist>[^/]+)/?$'

    # Values read by the inherited _real_extract.
    _ROOT_TYPE = 'playlist'
    _TITLE_KEY = 'name'
    _DESCRIPTION_KEY = 'description'
    # Each playlist node nests the actual entry under `cloudcast`.
    _NODE_TEMPLATE = '''cloudcast {
slug
url
owner { username }
}'''

    _TESTS = [{
        'url': 'https://www.mixcloud.com/maxvibes/playlists/jazzcat-on-ness-radio/',
        'info_dict': {
            'id': 'maxvibes_jazzcat-on-ness-radio',
            'title': 'Ness Radio sessions',
        },
        'playlist_mincount': 59,
    }]

    def _get_cloudcast(self, node):
        """Return the `cloudcast` dict nested in *node*, or {} when it is missing or falsy."""
        cloudcast = node.get('cloudcast')
        return cloudcast if cloudcast else {}