]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/bandcamp.py
[bandcamp:weekly] Add extractor
[yt-dlp.git] / youtube_dl / extractor / bandcamp.py
CommitLineData
3798eadc
PH
1from __future__ import unicode_literals
2
45aef472 3import json
0aacd2de 4import random
45aef472 5import re
0aacd2de 6import time
45aef472
PH
7
8from .common import InfoExtractor
1cc79574 9from ..compat import (
cffa6aa1 10 compat_str,
09804265 11 compat_urlparse,
1cc79574
PH
12)
13from ..utils import (
45aef472 14 ExtractorError,
ba717dca
S
15 float_or_none,
16 int_or_none,
0aacd2de
S
17 parse_filesize,
18 unescapeHTML,
19 update_url_query,
62bafabc 20 unified_strdate,
45aef472
PH
21)
22
23
class BandcampIE(InfoExtractor):
    """Extract a single track from a ``<artist>.bandcamp.com/track/...`` page.

    Two page layouts are handled: tracks that only expose streaming URLs
    through the inline ``trackinfo`` JavaScript data, and tracks that link to
    a "free download" page offering several encodings.
    """

    _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>.*)'
    _TESTS = [{
        'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
        'md5': 'c557841d5e50261777a6585648adf439',
        'info_dict': {
            'id': '1812978515',
            'ext': 'mp3',
            'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad",
            'duration': 9.8485,
        },
        '_skip': 'There is a limit of 200 free downloads / month for the test song'
    }, {
        'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
        'md5': '0369ace6b939f0927e62c67a1a8d9fa7',
        'info_dict': {
            'id': '2650410135',
            'ext': 'aiff',
            'title': 'Ben Prunty - Lanius (Battle)',
            'uploader': 'Ben Prunty',
        },
    }]

    def _real_extract(self, url):
        """Return the info dict for the track at ``url``.

        Raises ExtractorError when the page offers neither a free download
        page nor ``trackinfo`` data, or when the track has no stream files.
        """
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        # The URL slug is only used as a display id while fetching the page;
        # the real numeric id is read from the page contents below.
        webpage = self._download_webpage(url, title)
        thumbnail = self._html_search_meta('og:image', webpage, default=None)

        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if not m_download:
            return self._extract_streaming_track(webpage, thumbnail)
        return self._extract_free_download(
            m_download.group(1), webpage, thumbnail)

    def _extract_streaming_track(self, webpage, thumbnail):
        """Build the info dict from the inline ``trackinfo`` data."""
        m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
        if not m_trackinfo:
            raise ExtractorError('No free songs found')

        # ``trackinfo`` is a JSON list; only its first entry is used.
        data = json.loads(m_trackinfo.group(1))[0]
        track_id = compat_str(data['id'])

        if not data.get('file'):
            raise ExtractorError('Not streamable', video_id=track_id, expected=True)

        formats = []
        for format_id, format_url in data['file'].items():
            # Skip entries without a URL, as the free-download path does.
            if not format_url:
                continue
            # Keys look like "<ext>-<abr>" (e.g. "mp3-128"); partition() keeps
            # a key without a hyphen from aborting the whole extraction.
            ext, _, abr_str = format_id.partition('-')
            formats.append({
                'format_id': format_id,
                'url': self._proto_relative_url(format_url, 'http:'),
                'ext': ext,
                'vcodec': 'none',
                'acodec': ext,
                'abr': int_or_none(abr_str or None),
            })

        self._sort_formats(formats)

        return {
            'id': track_id,
            'title': data['title'],
            'thumbnail': thumbnail,
            'formats': formats,
            'duration': float_or_none(data.get('duration')),
        }

    def _extract_free_download(self, download_link, webpage, thumbnail):
        """Build the info dict from the track's free download page."""
        video_id = self._search_regex(
            r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$',
            webpage, 'video id')

        download_webpage = self._download_webpage(
            download_link, video_id, 'Downloading free downloads page')

        # The download page carries its data as HTML-escaped JSON inside a
        # ``data-blob`` attribute.
        blob = self._parse_json(
            self._search_regex(
                r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
                'blob', group='blob'),
            video_id, transform_source=unescapeHTML)

        info = blob['digital_items'][0]

        downloads = info['downloads']
        track = info['title']

        artist = info.get('artist')
        title = '%s - %s' % (artist, track) if artist else track

        # Map encoding name -> file extension (without the leading dot).
        download_formats = {}
        for f in blob['download_formats']:
            name, ext = f.get('name'), f.get('file_extension')
            if all(isinstance(x, compat_str) for x in (name, ext)):
                download_formats[name] = ext.strip('.')

        formats = []
        for format_id, f in downloads.items():
            format_url = f.get('url')
            if not format_url:
                continue
            # Stat URL generation algorithm is reverse engineered from
            # download_*_bundle_*.js
            stat_url = update_url_query(
                format_url.replace('/download/', '/statdownload/'), {
                    '.rand': int(time.time() * 1000 * random.random()),
                })
            format_id = f.get('encoding_name') or format_id
            # transform_source keeps only the text between the first '{' and
            # the last '}' (presumably the JSON is wrapped in a JS callback).
            stat = self._download_json(
                stat_url, video_id, 'Downloading %s JSON' % format_id,
                transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1],
                fatal=False)
            if not stat:
                continue
            retry_url = stat.get('retry_url')
            if not isinstance(retry_url, compat_str):
                continue
            formats.append({
                'url': self._proto_relative_url(retry_url, 'http:'),
                'ext': download_formats.get(format_id),
                'format_id': format_id,
                'format_note': f.get('description'),
                'filesize': parse_filesize(f.get('size_mb')),
                'vcodec': 'none',
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'thumbnail': info.get('thumb_url') or thumbnail,
            'uploader': info.get('artist'),
            'artist': artist,
            'track': track,
            'formats': formats,
        }
09804265
JMF
155
156
class BandcampAlbumIE(InfoExtractor):
    """Extract a Bandcamp album (or an artist's front page) as a playlist."""

    IE_NAME = 'Bandcamp:album'
    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^?#]+)|/?(?:$|[?#]))'

    _TESTS = [{
        'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
        'playlist': [
            {
                'md5': '39bc1eded3476e927c724321ddf116cf',
                'info_dict': {
                    'id': '1353101989',
                    'ext': 'mp3',
                    'title': 'Intro',
                }
            },
            {
                'md5': '1a2c32e2691474643e912cc6cd4bffaa',
                'info_dict': {
                    'id': '38097443',
                    'ext': 'mp3',
                    'title': 'Kero One - Keep It Alive (Blazo remix)',
                }
            },
        ],
        'info_dict': {
            'title': 'Jazz Format Mixtape vol.1',
            'id': 'jazz-format-mixtape-vol-1',
            'uploader_id': 'blazo',
        },
        'params': {
            'playlistend': 2
        },
        'skip': 'Bandcamp imposes download limits.'
    }, {
        'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave',
        'info_dict': {
            'title': 'Hierophany of the Open Grave',
            'uploader_id': 'nightbringer',
            'id': 'hierophany-of-the-open-grave',
        },
        'playlist_mincount': 9,
    }, {
        'url': 'http://dotscale.bandcamp.com',
        'info_dict': {
            'title': 'Loom',
            'id': 'dotscale',
            'uploader_id': 'dotscale',
        },
        'playlist_mincount': 7,
    }, {
        # with escaped quote in title
        'url': 'https://jstrecords.bandcamp.com/album/entropy-ep',
        'info_dict': {
            'title': '"Entropy" EP',
            'uploader_id': 'jstrecords',
            'id': 'entropy-ep',
        },
        'playlist_mincount': 3,
    }, {
        # not all tracks have songs
        'url': 'https://insulters.bandcamp.com/album/we-are-the-plague',
        'info_dict': {
            'id': 'we-are-the-plague',
            'title': 'WE ARE THE PLAGUE',
            'uploader_id': 'insulters',
        },
        'playlist_count': 2,
    }]

    @classmethod
    def suitable(cls, url):
        """Decline any URL that BandcampWeeklyIE accepts; otherwise defer to the base check."""
        if BandcampWeeklyIE.suitable(url):
            return False
        return super(BandcampAlbumIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist dict whose entries point at the album's track pages.

        Raises ExtractorError when no track elements are found on the page.
        """
        match = re.match(self._VALID_URL, url)
        uploader_id, album_id = match.group('subdomain', 'album_id')
        # Front-page URLs have no album slug, so fall back to the subdomain.
        playlist_id = album_id or uploader_id
        page = self._download_webpage(url, playlist_id)

        track_elements = re.findall(
            r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', page)
        if not track_elements:
            raise ExtractorError('The page doesn\'t contain any tracks')

        entries = []
        for element_html, track_path in track_elements:
            # Only tracks with duration info have songs
            if not self._html_search_meta('duration', element_html, default=None):
                continue
            track_url = compat_urlparse.urljoin(url, track_path)
            entries.append(self.url_result(track_url, ie=BandcampIE.ie_key()))

        album_title = self._html_search_regex(
            r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"',
            page, 'title', fatal=False)
        if album_title:
            # The title comes from a JS string literal; undo escaped quotes.
            album_title = album_title.replace(r'\"', '"')

        return {
            '_type': 'playlist',
            'uploader_id': uploader_id,
            'id': playlist_id,
            'title': album_title,
            'entries': entries,
        }
62bafabc
AV
258
259
class BandcampWeeklyIE(InfoExtractor):
    """Extract a Bandcamp Weekly episode from ``bandcamp.com/?show=<id>``."""

    IE_NAME = 'Bandcamp:bandcamp_weekly'
    _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*&)?show=(?P<id>\d+)(?:$|[&#])'
    _TESTS = [{
        'url': 'https://bandcamp.com/?show=224',
        'md5': 'b00df799c733cf7e0c567ed187dea0fd',
        'info_dict': {
            'id': '224',
            'ext': 'opus',
            'title': 'BC Weekly April 4th 2017: Magic Moments',
            'description': 'Stones Throw\'s Vex Ruffin, plus up and coming singer Salami Rose Joe Louis, in conversation about their fantastic DIY albums.',
        }
    }, {
        'url': 'https://bandcamp.com/?blah/blah@&show=228',
        'only_matching': True
    }]

    # Extensions recognised by substring match against a stream's format id.
    _KNOWN_EXTENSIONS = ('mp3', 'opus')

    def _real_extract(self, url):
        """Return the info dict for the Bandcamp Weekly show at ``url``."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # The page carries its data as HTML-escaped JSON inside a
        # ``data-blob`` attribute.
        blob = self._parse_json(
            self._search_regex(
                r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage,
                'blob', group='blob'),
            video_id, transform_source=unescapeHTML)

        show = blob['bcw_show']

        # This is desired because any invalid show id redirects to `bandcamp.com`
        # which happens to expose the latest Bandcamp Weekly episode.
        video_id = compat_str(show['show_id'])

        formats = []
        for format_id, format_url in show['audio_stream'].items():
            # A stream without a URL cannot be downloaded; skip it.
            if not format_url:
                continue
            fmt = {'format_id': format_id, 'url': format_url}
            # 'ext' is only set when the format id names a known extension.
            for ext in self._KNOWN_EXTENSIONS:
                if ext in format_id:
                    fmt['ext'] = ext
                    break
            formats.append(fmt)
        self._sort_formats(formats)

        # Build the title defensively: a missing or null subtitle used to
        # abort extraction with an error from the string concatenation.
        title = show.get('audio_title') or 'Bandcamp Weekly'
        subtitle = show.get('subtitle')
        if subtitle:
            title = '%s: %s' % (title, subtitle)

        return {
            'id': video_id,
            'title': title,
            'description': show.get('desc'),
            'duration': float_or_none(show.get('audio_duration')),
            'webpage_url': 'https://bandcamp.com/?show=' + video_id,
            'is_live': False,
            'release_date': unified_strdate(show.get('published_date')),
            'series': 'Bandcamp Weekly',
            'episode_id': video_id,
            'formats': formats
        }