]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/monstercat.py
[core] Parse `release_year` from `release_date` (#8524)
[yt-dlp.git] / yt_dlp / extractor / monstercat.py
1 import re
2
3 from .common import InfoExtractor
4 from ..utils import (
5 clean_html,
6 extract_attributes,
7 get_element_by_class,
8 get_element_html_by_class,
9 get_element_text_and_html_by_tag,
10 int_or_none,
11 unified_strdate,
12 strip_or_none,
13 traverse_obj,
14 try_call,
15 )
16
17
class MonstercatIE(InfoExtractor):
    """Extractor for monstercat.com release pages.

    A release page is returned as a playlist whose entries are the tracks
    listed in the page's track table; every entry also carries the
    album-level metadata scraped from the page.
    """

    # Accept plain-http links as well as https ones for the same release page
    _VALID_URL = r'https?://www\.monstercat\.com/release/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.monstercat.com/release/742779548009',
        'playlist_count': 20,
        'info_dict': {
            'title': 'The Secret Language of Trees',
            'id': '742779548009',
            'thumbnail': 'https://www.monstercat.com/release/742779548009/cover',
            'release_date': '20230711',
            'album': 'The Secret Language of Trees',
            'album_artist': 'BT',
        },
    }, {
        'url': 'http://www.monstercat.com/release/742779548009',
        'only_matching': True,
    }]

    def _extract_tracks(self, table, album_meta):
        """Yield one info dict per track row found in ``table``.

        @param table       HTML of the track list table ('' when it was not found)
        @param album_meta  dict of album-level fields merged into every entry;
                           track-level keys set below override same-named keys
        Rows lacking a ``data-track-id`` or ``data-release-id`` attribute on
        their play button are reported with a warning and skipped.
        """
        # There is no `get_elements_by_tag` helper, so rows are split by regex:
        # capture everything following each opening `<tr ...>` up to `</tr>`
        for row in re.findall(r'<tr[^<]*>((?:(?!</tr>)[\w\W])+)', table):
            # Keep only the part of the title cell that precedes ' <span'
            title = clean_html(try_call(
                lambda: get_element_by_class('d-inline-flex flex-column', row).partition(' <span')[0]))
            # Both IDs are stored as attributes of the row's play button
            ids = extract_attributes(
                try_call(lambda: get_element_html_by_class('btn-play cursor-pointer mr-small', row)) or '')
            track_id = ids.get('data-track-id')
            release_id = ids.get('data-release-id')

            track_number = int_or_none(try_call(lambda: get_element_by_class('py-xsmall', row)))
            if not track_id or not release_id:
                # The stream URL cannot be built without both IDs
                self.report_warning(f'Skipping track {track_number}, ID(s) not found')
                self.write_debug(f'release_id={release_id!r} track_id={track_id!r}')
                continue
            yield {
                **album_meta,
                'title': title,
                'track': title,
                'track_number': track_number,
                'artist': clean_html(try_call(lambda: get_element_by_class('d-block fs-xxsmall', row))),
                'url': f'https://www.monstercat.com/api/release/{release_id}/track-stream/{track_id}',
                'id': track_id,
                'ext': 'mp3',
            }

    def _real_extract(self, url):
        """Download the release page at ``url`` and return it as a playlist result.

        The playlist ID is the numeric release ID matched from the URL; the
        album metadata is passed both to the playlist and to every track entry.
        """
        url_id = self._match_id(url)
        html = self._download_webpage(url, url_id)
        # wrap all `get_elements` in `try_call`, HTMLParser has problems with site's html
        tracklist_table = try_call(lambda: get_element_by_class('table table-small', html)) or ''

        title = try_call(lambda: get_element_text_and_html_by_tag('h1', html)[0])
        # The release date is the text that follows 'Released ' in this element
        date = traverse_obj(html, (
            {lambda page: get_element_by_class(
                'font-italic mb-medium d-tablet-none d-phone-block', page).partition('Released ')},
            2, {strip_or_none}, {unified_strdate}))

        album_meta = {
            'title': title,
            'album': title,
            'thumbnail': f'https://www.monstercat.com/release/{url_id}/cover',
            'album_artist': try_call(
                lambda: get_element_by_class('h-normal text-uppercase mb-desktop-medium mb-smallish', html)),
            'release_date': date,
        }

        return self.playlist_result(
            self._extract_tracks(tracklist_table, album_meta), playlist_id=url_id, **album_meta)