]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/bandcamp.py
[ie/youtube] Suppress "Unavailable videos are hidden" warning (#10159)
[yt-dlp.git] / yt_dlp / extractor / bandcamp.py
CommitLineData
0aacd2de 1import random
45aef472 2import re
0aacd2de 3import time
45aef472
PH
4
5from .common import InfoExtractor
1cc79574 6from ..utils import (
304ad45a 7 KNOWN_EXTENSIONS,
45aef472 8 ExtractorError,
ba717dca
S
9 float_or_none,
10 int_or_none,
0aacd2de 11 parse_filesize,
4991e16c
S
12 str_or_none,
13 try_get,
62bafabc 14 unified_strdate,
4991e16c 15 unified_timestamp,
304ad45a 16 update_url_query,
3052a30d 17 url_or_none,
8bdd16b4 18 urljoin,
45aef472
PH
19)
20
21
class BandcampIE(InfoExtractor):
    """Extractor for a single Bandcamp track page (``<uploader>.bandcamp.com/track/<slug>``)."""

    _VALID_URL = r'https?://(?P<uploader>[^/]+)\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
    _EMBED_REGEX = [r'<meta property="og:url"[^>]*?content="(?P<url>.*?bandcamp\.com.*?)"']
    _TESTS = [{
        'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
        'md5': 'c557841d5e50261777a6585648adf439',
        'info_dict': {
            'id': '1812978515',
            'ext': 'mp3',
            'title': 'youtube-dl "\'/\\ä↭ - youtube-dl "\'/\\ä↭ - youtube-dl test song "\'/\\ä↭',
            'duration': 9.8485,
            'uploader': 'youtube-dl "\'/\\ä↭',
            'upload_date': '20121129',
            'timestamp': 1354224127,
            'track': 'youtube-dl "\'/\\ä↭ - youtube-dl test song "\'/\\ä↭',
            'album_artist': 'youtube-dl "\'/\\ä↭',
            'track_id': '1812978515',
            'artist': 'youtube-dl "\'/\\ä↭',
            'uploader_url': 'https://youtube-dl.bandcamp.com',
            'uploader_id': 'youtube-dl',
            'thumbnail': 'https://f4.bcbits.com/img/a3216802731_5.jpg',
        },
        '_skip': 'There is a limit of 200 free downloads / month for the test song',
    }, {
        # free download
        'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
        'info_dict': {
            'id': '2650410135',
            'ext': 'm4a',
            'acodec': r're:[fa]lac',
            'title': 'Ben Prunty - Lanius (Battle)',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'Ben Prunty',
            'timestamp': 1396508491,
            'upload_date': '20140403',
            'release_timestamp': 1396483200,
            'release_date': '20140403',
            'duration': 260.877,
            'track': 'Lanius (Battle)',
            'track_number': 1,
            'track_id': '2650410135',
            'artist': 'Ben Prunty',
            'album_artist': 'Ben Prunty',
            'album': 'FTL: Advanced Edition Soundtrack',
            'uploader_url': 'https://benprunty.bandcamp.com',
            'uploader_id': 'benprunty',
        },
    }, {
        # no free download, mp3 128
        'url': 'https://relapsealumni.bandcamp.com/track/hail-to-fire',
        'md5': 'fec12ff55e804bb7f7ebeb77a800c8b7',
        'info_dict': {
            'id': '2584466013',
            'ext': 'mp3',
            'title': 'Mastodon - Hail to Fire',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'Mastodon',
            'timestamp': 1322005399,
            'upload_date': '20111122',
            'release_timestamp': 1076112000,
            'release_date': '20040207',
            'duration': 120.79,
            'track': 'Hail to Fire',
            'track_number': 5,
            'track_id': '2584466013',
            'artist': 'Mastodon',
            'album_artist': 'Mastodon',
            'album': 'Call of the Mastodon',
            'uploader_url': 'https://relapsealumni.bandcamp.com',
            'uploader_id': 'relapsealumni',
        },
    }, {
        # track from compilation album (artist/album_artist difference)
        'url': 'https://diskotopia.bandcamp.com/track/safehouse',
        'md5': '19c5337bca1428afa54129f86a2f6a69',
        'info_dict': {
            'id': '1978174799',
            'ext': 'mp3',
            'title': 'submerse - submerse - Safehouse',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'submerse',
            'timestamp': 1480779297,
            'upload_date': '20161203',
            'release_timestamp': 1481068800,
            'release_date': '20161207',
            'duration': 154.066,
            'track': 'submerse - Safehouse',
            'track_number': 3,
            'track_id': '1978174799',
            'artist': 'submerse',
            'album_artist': 'Diskotopia',
            'album': 'DSK F/W 2016-2017 Free Compilation',
            'uploader_url': 'https://diskotopia.bandcamp.com',
            'uploader_id': 'diskotopia',
        },
    }]

    def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True):
        """Parse the JSON object stored in the page's ``data-<attr>`` HTML attribute.

        @param webpage   HTML to search
        @param video_id  identifier passed on to the JSON parser
        @param attr      attribute suffix, i.e. ``tralbum`` matches ``data-tralbum``
        @param fatal     when true, a missing or unparsable attribute raises;
                         when false, ``None`` is returned instead
        @returns         the decoded JSON value, or ``None`` (non-fatal failure)
        """
        # `fatal` has to reach the regex search as well; otherwise a page
        # without the attribute raises even though the caller asked for a
        # best-effort lookup.
        raw_json = self._html_search_regex(
            rf'data-{attr}=(["\'])({{.+?}})\1', webpage,
            attr + ' data', fatal=fatal, group=2)
        if not raw_json:
            # Only reachable with fatal=False (the search raises otherwise)
            return None
        return self._parse_json(raw_json, video_id, fatal=fatal)

    def _real_extract(self, url):
        """Build the info dict for a track page.

        Formats come from two places: the stream URLs listed in the first
        `trackinfo` entry of the `data-tralbum` blob, and, when the blob
        advertises a `freeDownloadPage`, the download variants listed there.
        """
        title, uploader = self._match_valid_url(url).group('id', 'uploader')
        webpage = self._download_webpage(url, title)
        tralbum = self._extract_data_attr(webpage, title)
        thumbnail = self._og_search_thumbnail(webpage)

        track_id = None
        track = None
        track_number = None
        duration = None

        formats = []
        track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict)
        if track_info:
            file_ = track_info.get('file')
            if isinstance(file_, dict):
                for format_id, format_url in file_.items():
                    if not url_or_none(format_url):
                        continue
                    # Format ids look like "<ext>-<abr>"; partition() keeps
                    # ids without a dash usable instead of raising ValueError
                    ext, _, abr_str = format_id.partition('-')
                    formats.append({
                        'format_id': format_id,
                        'url': self._proto_relative_url(format_url, 'http:'),
                        'ext': ext,
                        'vcodec': 'none',
                        'acodec': ext,
                        'abr': int_or_none(abr_str),
                    })
            track = track_info.get('title')
            track_id = str_or_none(
                track_info.get('track_id') or track_info.get('id'))
            track_number = int_or_none(track_info.get('track_num'))
            duration = float_or_none(track_info.get('duration'))

        # The embed blob is optional metadata; fall back to an empty dict so
        # the .get() lookups below never hit None
        embed = self._extract_data_attr(webpage, title, 'embed', False) or {}
        current = tralbum.get('current') or {}
        artist = embed.get('artist') or current.get('artist') or tralbum.get('artist')
        album_artist = self._html_search_regex(
            r'<h3 class="albumTitle">[\S\s]*?by\s*<span>\s*<a href="[^>]+">\s*([^>]+?)\s*</a>',
            webpage, 'album artist', fatal=False)
        timestamp = unified_timestamp(
            current.get('publish_date') or tralbum.get('album_publish_date'))

        download_link = tralbum.get('freeDownloadPage')
        if download_link:
            track_id = str(tralbum['id'])

            download_webpage = self._download_webpage(
                download_link, track_id, 'Downloading free downloads page')

            blob = self._extract_data_attr(download_webpage, track_id, 'blob')

            info = try_get(
                blob, (lambda x: x['digital_items'][0],
                       lambda x: x['download_items'][0]), dict)
            if info:
                downloads = info.get('downloads')
                if isinstance(downloads, dict):
                    # Fill in whatever the track page itself did not provide
                    if not track:
                        track = info.get('title')
                    if not artist:
                        artist = info.get('artist')
                    if not thumbnail:
                        thumbnail = info.get('thumb_url')

                    # Map download format name -> file extension (without dot)
                    download_formats = {}
                    download_formats_list = blob.get('download_formats')
                    if isinstance(download_formats_list, list):
                        for f in download_formats_list:
                            name, ext = f.get('name'), f.get('file_extension')
                            if all(isinstance(x, str) for x in (name, ext)):
                                download_formats[name] = ext.strip('.')

                    for format_id, f in downloads.items():
                        format_url = f.get('url')
                        if not format_url:
                            continue
                        # Stat URL generation algorithm is reverse engineered from
                        # download_*_bundle_*.js
                        stat_url = update_url_query(
                            format_url.replace('/download/', '/statdownload/'), {
                                '.rand': int(time.time() * 1000 * random.random()),
                            })
                        format_id = f.get('encoding_name') or format_id
                        stat = self._download_json(
                            stat_url, track_id, f'Downloading {format_id} JSON',
                            # Keep only the outermost {...} of the response body
                            transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1],
                            fatal=False)
                        if not stat:
                            continue
                        retry_url = url_or_none(stat.get('retry_url'))
                        if not retry_url:
                            continue
                        formats.append({
                            'url': self._proto_relative_url(retry_url, 'http:'),
                            'ext': download_formats.get(format_id),
                            'format_id': format_id,
                            'format_note': f.get('description'),
                            'filesize': parse_filesize(f.get('size_mb')),
                            'vcodec': 'none',
                            'acodec': format_id.split('-')[0],
                        })

        title = f'{artist} - {track}' if artist else track

        if not duration:
            duration = float_or_none(self._html_search_meta(
                'duration', webpage, default=None))

        return {
            'id': track_id,
            'title': title,
            'thumbnail': thumbnail,
            'uploader': artist,
            'uploader_id': uploader,
            'uploader_url': f'https://{uploader}.bandcamp.com',
            'timestamp': timestamp,
            'release_timestamp': unified_timestamp(tralbum.get('album_release_date')),
            'duration': duration,
            'track': track,
            'track_number': track_number,
            'track_id': track_id,
            'artist': artist,
            'album': embed.get('album_title'),
            'album_artist': album_artist,
            'formats': formats,
        }
09804265
JMF
251
252
class BandcampAlbumIE(BandcampIE):  # XXX: Do not subclass from concrete IE
    """Extractor for Bandcamp album pages; yields one playlist entry per playable track."""

    IE_NAME = 'Bandcamp:album'
    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com/album/(?P<id>[^/?#&]+)'

    _TESTS = [{
        'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
        'playlist': [
            {
                'md5': '39bc1eded3476e927c724321ddf116cf',
                'info_dict': {
                    'id': '1353101989',
                    'ext': 'mp3',
                    'title': 'Blazo - Intro',
                    'timestamp': 1311756226,
                    'upload_date': '20110727',
                    'uploader': 'Blazo',
                },
            },
            {
                'md5': '1a2c32e2691474643e912cc6cd4bffaa',
                'info_dict': {
                    'id': '38097443',
                    'ext': 'mp3',
                    'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)',
                    'timestamp': 1311757238,
                    'upload_date': '20110727',
                    'uploader': 'Blazo',
                },
            },
        ],
        'info_dict': {
            'title': 'Jazz Format Mixtape vol.1',
            'id': 'jazz-format-mixtape-vol-1',
            'uploader_id': 'blazo',
        },
        'params': {
            'playlistend': 2,
        },
        'skip': 'Bandcamp imposes download limits.',
    }, {
        'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave',
        'info_dict': {
            'title': 'Hierophany of the Open Grave',
            'uploader_id': 'nightbringer',
            'id': 'hierophany-of-the-open-grave',
        },
        'playlist_mincount': 9,
    }, {
        # with escaped quote in title
        'url': 'https://jstrecords.bandcamp.com/album/entropy-ep',
        'info_dict': {
            'title': '"Entropy" EP',
            'uploader_id': 'jstrecords',
            'id': 'entropy-ep',
            'description': 'md5:0ff22959c943622972596062f2f366a5',
        },
        'playlist_mincount': 3,
    }, {
        # not all tracks have songs
        'url': 'https://insulters.bandcamp.com/album/we-are-the-plague',
        'info_dict': {
            'id': 'we-are-the-plague',
            'title': 'WE ARE THE PLAGUE',
            'uploader_id': 'insulters',
            'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f',
        },
        'playlist_count': 2,
    }]

    @classmethod
    def suitable(cls, url):
        """Decline URLs claimed by the weekly-show or single-track extractors."""
        if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Return a playlist dict whose entries point at the album's track pages."""
        uploader_id, album_id = self._match_valid_url(url).groups()
        playlist_id = album_id or uploader_id
        webpage = self._download_webpage(url, playlist_id)
        tralbum = self._extract_data_attr(webpage, playlist_id)

        tracks = tralbum.get('trackinfo')
        if not tracks:
            raise ExtractorError('The page doesn\'t contain any tracks')

        entries = []
        for item in tracks:
            # Only tracks with duration info have songs
            if not item.get('duration'):
                continue
            track_url = urljoin(url, item['title_link'])
            track_id = str_or_none(item.get('track_id') or item.get('id'))
            entries.append(self.url_result(
                track_url, BandcampIE.ie_key(), track_id, item.get('title')))

        album_meta = tralbum.get('current') or {}

        return {
            '_type': 'playlist',
            'uploader_id': uploader_id,
            'id': playlist_id,
            'title': album_meta.get('title'),
            'description': album_meta.get('about'),
            'entries': entries,
        }
62bafabc
AV
354
355
class BandcampWeeklyIE(BandcampIE):  # XXX: Do not subclass from concrete IE
    """Extractor for Bandcamp Weekly shows addressed as ``bandcamp.com/?show=<id>``."""

    IE_NAME = 'Bandcamp:weekly'
    _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://bandcamp.com/?show=224',
        'md5': 'b00df799c733cf7e0c567ed187dea0fd',
        'info_dict': {
            'id': '224',
            'ext': 'opus',
            'title': 'BC Weekly April 4th 2017 - Magic Moments',
            'description': 'md5:5d48150916e8e02d030623a48512c874',
            'duration': 5829.77,
            'release_date': '20170404',
            'series': 'Bandcamp Weekly',
            'episode': 'Magic Moments',
            'episode_id': '224',
        },
        'params': {
            'format': 'opus-lo',
        },
    }, {
        'url': 'https://bandcamp.com/?blah/blah@&show=228',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for one show from the page's ``data-blob`` JSON."""
        show_id = self._match_id(url)
        webpage = self._download_webpage(url, show_id)
        page_data = self._extract_data_attr(webpage, show_id, 'blob')
        show = page_data['bcw_data'][show_id]

        formats = []
        for stream_id, stream_url in show['audio_stream'].items():
            if not url_or_none(stream_url):
                continue
            # First known extension contained in the stream id, else None
            stream_ext = next(
                (candidate for candidate in KNOWN_EXTENSIONS if candidate in stream_id),
                None)
            formats.append({
                'format_id': stream_id,
                'url': stream_url,
                'ext': stream_ext,
                'vcodec': 'none',
            })

        episode = show.get('subtitle')
        base_title = show.get('audio_title') or 'Bandcamp Weekly'
        # Append the episode subtitle only when there is one
        full_title = base_title + f' - {episode}' if episode else base_title

        return {
            'id': show_id,
            'title': full_title,
            'description': show.get('desc') or show.get('short_desc'),
            'duration': float_or_none(show.get('audio_duration')),
            'is_live': False,
            'release_date': unified_strdate(show.get('published_date')),
            'series': 'Bandcamp Weekly',
            'episode': episode,
            'episode_id': show_id,
            'formats': formats,
        }
bc874548
A
423
424
class BandcampUserIE(InfoExtractor):
    """Extractor for a Bandcamp user/label page; returns its discography as a playlist."""

    IE_NAME = 'Bandcamp:user'
    _VALID_URL = r'https?://(?!www\.)(?P<id>[^.]+)\.bandcamp\.com(?:/music)?/?(?:[#?]|$)'

    _TESTS = [{
        # Type 1 Bandcamp user page.
        'url': 'https://adrianvonziegler.bandcamp.com',
        'info_dict': {
            'id': 'adrianvonziegler',
            'title': 'Discography of adrianvonziegler',
        },
        'playlist_mincount': 23,
    }, {
        # Bandcamp user page with only one album
        'url': 'http://dotscale.bandcamp.com',
        'info_dict': {
            'id': 'dotscale',
            'title': 'Discography of dotscale',
        },
        'playlist_count': 1,
    }, {
        # Type 2 Bandcamp user page.
        'url': 'https://nightcallofficial.bandcamp.com',
        'info_dict': {
            'id': 'nightcallofficial',
            'title': 'Discography of nightcallofficial',
        },
        'playlist_count': 4,
    }, {
        'url': 'https://steviasphere.bandcamp.com/music',
        'playlist_mincount': 47,
        'info_dict': {
            'id': 'steviasphere',
            'title': 'Discography of steviasphere',
        },
    }, {
        'url': 'https://coldworldofficial.bandcamp.com/music',
        'playlist_mincount': 10,
        'info_dict': {
            'id': 'coldworldofficial',
            'title': 'Discography of coldworldofficial',
        },
    }, {
        'url': 'https://nuclearwarnowproductions.bandcamp.com/music',
        'playlist_mincount': 399,
        'info_dict': {
            'id': 'nuclearwarnowproductions',
            'title': 'Discography of nuclearwarnowproductions',
        },
    }]

    def _real_extract(self, url):
        """Collect the release links found on the page and return them as a playlist."""
        uploader = self._match_id(url)
        webpage = self._download_webpage(url, uploader)

        # Primary layout: <li data-item-id=...> items, skipping merch links
        item_link_re = r'<li data-item-id=["\'][^>]+>\s*<a href=["\'](?![^"\'/]*?/merch)([^"\']+)'
        # Fallback layout, used only when the primary pattern finds nothing
        track_title_re = r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)'

        release_paths = re.findall(item_link_re, webpage)
        if not release_paths:
            release_paths = re.findall(track_title_re, webpage)

        return self.playlist_from_matches(
            release_paths, uploader, f'Discography of {uploader}',
            getter=lambda path: urljoin(url, path))