]>
Commit | Line | Data |
---|---|---|
0d2a0eca AK |
1 | import re |
2 | ||
3 | from .common import InfoExtractor | |
4 | from ..utils import ( | |
5 | clean_html, | |
6 | extract_attributes, | |
7 | get_element_by_class, | |
8 | get_element_html_by_id, | |
9 | get_element_text_and_html_by_tag, | |
10 | parse_duration, | |
11 | strip_or_none, | |
12 | traverse_obj, | |
13 | try_call, | |
14 | ) | |
15 | ||
16 | ||
class ListenNotesIE(InfoExtractor):
    """Extractor for individual podcast episode pages on listennotes.com.

    The episode ID is the trailing token of the last path segment, e.g.
    ``KrDgvNb_u1n`` in ``.../tim-oreilly-on-noticing-KrDgvNb_u1n/``.
    """

    _VALID_URL = r'https?://(?:www\.)?listennotes\.com/podcasts/[^/]+/[^/]+-(?P<id>.+)/'
    _TESTS = [{
        'url': 'https://www.listennotes.com/podcasts/thriving-on-overload/tim-oreilly-on-noticing-KrDgvNb_u1n/',
        'md5': '5b91a32f841e5788fb82b72a1a8af7f7',
        'info_dict': {
            'id': 'KrDgvNb_u1n',
            'ext': 'mp3',
            'title': 'md5:32236591a921adf17bbdbf0441b6c0e9',
            'description': 'md5:c581ed197eeddcee55a67cdb547c8cbd',
            'duration': 2148.0,
            'channel': 'Thriving on Overload',
            'channel_id': 'ed84wITivxF',
            'episode_id': 'e1312583fa7b4e24acfbb5131050be00',
            'thumbnail': 'https://production.listennotes.com/podcasts/thriving-on-overload-ross-dawson-1wb_KospA3P-ed84wITivxF.300x300.jpg',
            'channel_url': 'https://www.listennotes.com/podcasts/thriving-on-overload-ross-dawson-ed84wITivxF/',
            'cast': ['Tim O’Reilly', 'Cookie Monster', 'Lao Tzu', 'Wallace Steven', 'Eric Raymond', 'Christine Peterson', 'John Maynard Keyne', 'Ross Dawson'],
        },
    }, {
        'url': 'https://www.listennotes.com/podcasts/ask-noah-show/episode-177-wireguard-with-lwEA3154JzG/',
        'md5': '62fb4ffe7fc525632a1138bf72a5ce53',
        'info_dict': {
            'id': 'lwEA3154JzG',
            'ext': 'mp3',
            'title': 'Episode 177: WireGuard with Jason Donenfeld',
            'description': 'md5:24744f36456a3e95f83c1193a3458594',
            'duration': 3861.0,
            'channel': 'Ask Noah Show',
            'channel_id': '4DQTzdS5-j7',
            'episode_id': '8c8954b95e0b4859ad1eecec8bf6d3a4',
            'channel_url': 'https://www.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-4DQTzdS5-j7/',
            'thumbnail': 'https://production.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-cfbRUw9Gs3F-4DQTzdS5-j7.300x300.jpg',
            'cast': ['noah showlink', 'noah show', 'noah dashboard', 'jason donenfeld'],
        },
    }]

    def _clean_description(self, description):
        """Turn an HTML description fragment into plain text.

        Every run of opening/closing ``<div>``/``<p>`` tags (plus trailing
        whitespace) is collapsed into ``<br/><br/>`` before the markup is
        handed to ``clean_html``. A falsy ``description`` is treated as ''.
        """
        return clean_html(re.sub(r'(</?(div|p)>\s*)+', '<br/><br/>', description or ''))

    def _real_extract(self, url):
        """Download the episode page at ``url`` and build its info dict.

        Metadata comes from two places that are merged into one dict: the
        JSON in the ``original-content`` script tag and the HTML attributes
        of the play-button toolbar element. Title, description and duration
        fall back to the page's ``<h1>`` / meta tags when those are missing.
        ``data['audio']`` is mandatory; a missing key raises ``KeyError``.
        """
        audio_id = self._match_id(url)
        webpage = self._download_webpage(url, audio_id)
        data = self._search_json(
            r'<script id="original-content"[^>]+\btype="application/json">', webpage, 'content', audio_id)

        # The toolbar only supplies optional fields (title, image, channel,
        # duration, episode uuid). If neither toolbar variant is on the page,
        # skip the merge instead of passing an empty lookup result to
        # extract_attributes(), so the fallbacks below can still run.
        # NOTE(review): assumes get_element_html_by_id yields a falsy value
        # when nothing matches - confirm against its implementation.
        toolbar_html = get_element_html_by_id(
            r'episode-play-button-toolbar|episode-no-play-button-toolbar', webpage, escape_value=False)
        if toolbar_html:
            data.update(extract_attributes(toolbar_html))

        # The meta description is matched as "<duration> - <description>";
        # both parts serve only as fallbacks for the fields below.
        duration, description = self._search_regex(
            r'(?P<duration>[\d:]+)\s*-\s*(?P<description>.+)',
            self._html_search_meta(['og:description', 'description', 'twitter:description'], webpage),
            'description', fatal=False, group=('duration', 'description')) or (None, None)

        return {
            'id': audio_id,
            'url': data['audio'],
            'title': (data.get('data-title')
                      or try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0])
                      or self._html_search_meta(('og:title', 'title', 'twitter:title'), webpage, 'title')),
            'description': (self._clean_description(get_element_by_class('ln-text-p', webpage))
                            or strip_or_none(description)),
            'duration': parse_duration(traverse_obj(data, 'audio_length', 'data-duration') or duration),
            'episode_id': traverse_obj(data, 'uuid', 'data-episode-uuid'),
            **traverse_obj(data, {
                'thumbnail': 'data-image',
                'channel': 'data-channel-title',
                'cast': ('nlp_entities', ..., 'name'),
                'channel_url': 'channel_url',
                'channel_id': 'channel_short_uuid',
            }),
        }