jfr.im git - yt-dlp.git/commitdiff
[extractor/listennotes] Add extractor (#5310)
author Alex Karabanov <redacted>
Sun, 6 Nov 2022 18:30:59 +0000 (22:30 +0400)
committer GitHub <redacted>
Sun, 6 Nov 2022 18:30:59 +0000 (00:00 +0530)
Closes #5262
Authored by: lksj, pukkandan

yt_dlp/compat/__init__.py
yt_dlp/extractor/_extractors.py
yt_dlp/extractor/listennotes.py [new file with mode: 0644]

index 6d85a6a1fbf59f44083f86c60403aff122dc23f0..5d3db4b4ca00982d5485af42e5007d90176d3ec7 100644 (file)
@@ -14,7 +14,7 @@
 # HTMLParseError has been deprecated in Python 3.3 and removed in
 # Python 3.5. Introducing dummy exception for Python >3.5 for compatible
 # and uniform cross-version exception handling
-class compat_HTMLParseError(Exception):
+class compat_HTMLParseError(ValueError):
     pass
 
 
index 1960692ef33d386c170bde832cba0013414552a7..8c70d15850e3595df6cf86454877db17c6e9ff5d 100644 (file)
 )
 from .linuxacademy import LinuxAcademyIE
 from .liputan6 import Liputan6IE
+from .listennotes import ListenNotesIE
 from .litv import LiTVIE
 from .livejournal import LiveJournalIE
 from .livestream import (
diff --git a/yt_dlp/extractor/listennotes.py b/yt_dlp/extractor/listennotes.py
new file mode 100644 (file)
index 0000000..4ebc9be
--- /dev/null
@@ -0,0 +1,86 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    extract_attributes,
+    get_element_by_class,
+    get_element_html_by_id,
+    get_element_text_and_html_by_tag,
+    parse_duration,
+    strip_or_none,
+    traverse_obj,
+    try_call,
+)
+
+
+class ListenNotesIE(InfoExtractor):  # Extractor for podcast episode pages on listennotes.com
+    _VALID_URL = r'https?://(?:www\.)?listennotes\.com/podcasts/[^/]+/[^/]+-(?P<id>.+)/'  # `id` is the part of the episode slug that follows a hyphen (see the ids in _TESTS)
+    _TESTS = [{
+        'url': 'https://www.listennotes.com/podcasts/thriving-on-overload/tim-oreilly-on-noticing-KrDgvNb_u1n/',
+        'md5': '5b91a32f841e5788fb82b72a1a8af7f7',
+        'info_dict': {
+            'id': 'KrDgvNb_u1n',
+            'ext': 'mp3',
+            'title': 'md5:32236591a921adf17bbdbf0441b6c0e9',
+            'description': 'md5:c581ed197eeddcee55a67cdb547c8cbd',
+            'duration': 2148.0,
+            'channel': 'Thriving on Overload',
+            'channel_id': 'ed84wITivxF',
+            'episode_id': 'e1312583fa7b4e24acfbb5131050be00',
+            'thumbnail': 'https://production.listennotes.com/podcasts/thriving-on-overload-ross-dawson-1wb_KospA3P-ed84wITivxF.300x300.jpg',
+            'channel_url': 'https://www.listennotes.com/podcasts/thriving-on-overload-ross-dawson-ed84wITivxF/',
+            'cast': ['Tim O’Reilly', 'Cookie Monster', 'Lao Tzu', 'Wallace Steven', 'Eric Raymond', 'Christine Peterson', 'John Maynard Keyne', 'Ross Dawson'],
+        }
+    }, {
+        'url': 'https://www.listennotes.com/podcasts/ask-noah-show/episode-177-wireguard-with-lwEA3154JzG/',
+        'md5': '62fb4ffe7fc525632a1138bf72a5ce53',
+        'info_dict': {
+            'id': 'lwEA3154JzG',
+            'ext': 'mp3',
+            'title': 'Episode 177: WireGuard with Jason Donenfeld',
+            'description': 'md5:24744f36456a3e95f83c1193a3458594',
+            'duration': 3861.0,
+            'channel': 'Ask Noah Show',
+            'channel_id': '4DQTzdS5-j7',
+            'episode_id': '8c8954b95e0b4859ad1eecec8bf6d3a4',
+            'channel_url': 'https://www.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-4DQTzdS5-j7/',
+            'thumbnail': 'https://production.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-cfbRUw9Gs3F-4DQTzdS5-j7.300x300.jpg',
+            'cast': ['noah showlink', 'noah show', 'noah dashboard', 'jason donenfeld'],
+        }
+    }]
+
+    def _clean_description(self, description):  # Turn description HTML into cleaned text; a falsy description is treated as ''
+        return clean_html(re.sub(r'(</?(div|p)>\s*)+', '<br/><br/>', description or ''))  # each run of <div>/<p> open/close tags becomes <br/><br/> before clean_html runs
+
+    def _real_extract(self, url):  # Download the episode page and build the info dict returned below
+        audio_id = self._match_id(url)
+        webpage = self._download_webpage(url, audio_id)
+        data = self._search_json(
+            r'<script id="original-content"[^>]+\btype="application/json">', webpage, 'content', audio_id)  # JSON located via the <script id="original-content"> tag
+        data.update(extract_attributes(get_element_html_by_id(
+            r'episode-play-button-toolbar|episode-no-play-button-toolbar', webpage, escape_value=False)))  # merge the toolbar element's HTML attributes into data; escape_value=False presumably lets `|` act as regex alternation - TODO confirm
+
+        duration, description = self._search_regex(
+            r'(?P<duration>[\d:]+)\s*-\s*(?P<description>.+)',  # looks for "<duration> - <description>" in the meta description text
+            self._html_search_meta(['og:description', 'description', 'twitter:description'], webpage),
+            'description', fatal=False, group=('duration', 'description')) or (None, None)  # both become None when the regex result is falsy
+
+        return {
+            'id': audio_id,
+            'url': data['audio'],  # mandatory: raises KeyError if data has no 'audio' key
+            'title': (data.get('data-title')
+                      or try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0])
+                      or self._html_search_meta(('og:title', 'title', 'twitter:title'), webpage, 'title')),
+            'description': (self._clean_description(get_element_by_class('ln-text-p', webpage))
+                            or strip_or_none(description)),  # fallback: description parsed from the meta tags above
+            'duration': parse_duration(traverse_obj(data, 'audio_length', 'data-duration') or duration),  # falls back to the duration parsed from the meta description
+            'episode_id': traverse_obj(data, 'uuid', 'data-episode-uuid'),
+            **traverse_obj(data, {
+                'thumbnail': 'data-image',
+                'channel': 'data-channel-title',
+                'cast': ('nlp_entities', ..., 'name'),
+                'channel_url': 'channel_url',
+                'channel_id': 'channel_short_uuid',
+            })
+        }