jfr.im git - yt-dlp.git/blame_incremental - yt

... / ...

Commit	Line	Data
	1	import re
	2
	3	from .common import InfoExtractor
	4	from ..utils import (
	5	clean_html,
	6	extract_attributes,
	7	get_element_by_class,
	8	get_element_html_by_class,
	9	get_element_text_and_html_by_tag,
	10	int_or_none,
	11	strip_or_none,
	12	traverse_obj,
	13	try_call,
	14	unified_strdate,
	15	)
	16
	17
	18	class MonstercatIE(InfoExtractor):
	19	_VALID_URL = r'https?://www\.monstercat\.com/release/(?P<id>\d+)'
	20	_TESTS = [{
	21	'url': 'https://www.monstercat.com/release/742779548009',
	22	'playlist_count': 20,
	23	'info_dict': {
	24	'title': 'The Secret Language of Trees',
	25	'id': '742779548009',
	26	'thumbnail': 'https://www.monstercat.com/release/742779548009/cover',
	27	'release_date': '20230711',
	28	'album': 'The Secret Language of Trees',
	29	'album_artist': 'BT',
	30	}
	31	}]
	32
	33	def _extract_tracks(self, table, album_meta):
	34	for td in re.findall(r'<tr[^<]*>((?:(?!</tr>)[\w\W])+)', table): # regex by chatgpt due to lack of get_elements_by_tag
	35	title = clean_html(try_call(
	36	lambda: get_element_by_class('d-inline-flex flex-column', td).partition(' <span')[0]))
	37	ids = extract_attributes(try_call(lambda: get_element_html_by_class('btn-play cursor-pointer mr-small', td)) or '')
	38	track_id = ids.get('data-track-id')
	39	release_id = ids.get('data-release-id')
	40
	41	track_number = int_or_none(try_call(lambda: get_element_by_class('py-xsmall', td)))
	42	if not track_id or not release_id:
	43	self.report_warning(f'Skipping track {track_number}, ID(s) not found')
	44	self.write_debug(f'release_id={repr(release_id)} track_id={repr(track_id)}')
	45	continue
	46	yield {
	47	**album_meta,
	48	'title': title,
	49	'track': title,
	50	'track_number': track_number,
	51	'artist': clean_html(try_call(lambda: get_element_by_class('d-block fs-xxsmall', td))),
	52	'url': f'https://www.monstercat.com/api/release/{release_id}/track-stream/{track_id}',
	53	'id': track_id,
	54	'ext': 'mp3'
	55	}
	56
	57	def _real_extract(self, url):
	58	url_id = self._match_id(url)
	59	html = self._download_webpage(url, url_id)
	60	# wrap all `get_elements` in `try_call`, HTMLParser has problems with site's html
	61	tracklist_table = try_call(lambda: get_element_by_class('table table-small', html)) or ''
	62
	63	title = try_call(lambda: get_element_text_and_html_by_tag('h1', html)[0])
	64	date = traverse_obj(html, ({lambda html: get_element_by_class('font-italic mb-medium d-tablet-none d-phone-block',
	65	html).partition('Released ')}, 2, {strip_or_none}, {unified_strdate}))
	66
	67	album_meta = {
	68	'title': title,
	69	'album': title,
	70	'thumbnail': f'https://www.monstercat.com/release/{url_id}/cover',
	71	'album_artist': try_call(
	72	lambda: get_element_by_class('h-normal text-uppercase mb-desktop-medium mb-smallish', html)),
	73	'release_date': date,
	74	}
	75
	76	return self.playlist_result(
	77	self._extract_tracks(tracklist_table, album_meta), playlist_id=url_id, **album_meta)