]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/spreaker.py
[ie/orf:on] Improve extraction (#9677)
[yt-dlp.git] / yt_dlp / extractor / spreaker.py
1 import itertools
2
3 from .common import InfoExtractor
4 from ..compat import compat_str
5 from ..utils import (
6 float_or_none,
7 int_or_none,
8 str_or_none,
9 try_get,
10 unified_timestamp,
11 url_or_none,
12 )
13
14
def _extract_episode(data, episode_id=None):
    """Turn a Spreaker episode JSON object into an info dict.

    ``data`` must provide ``title`` and ``download_url``; all other fields
    are read leniently. ``episode_id`` takes precedence over
    ``data['episode_id']``, which is only consulted when no truthy
    ``episode_id`` is passed in.
    """
    # Mandatory fields: a missing key is allowed to raise KeyError.
    title = data['title']
    download_url = data['download_url']

    def _count(kind):
        # The counter is looked up as a flat '<kind>s_count' key first and
        # as an entry of the nested 'stats' object second.
        getters = (
            lambda x: x['%ss_count' % kind],
            lambda x: x['stats']['%ss' % kind],
        )
        return int_or_none(try_get(data, getters))

    def _scaled(field):
        # Raw value is scaled by 1000 (presumably milliseconds -> seconds).
        return float_or_none(data.get(field), scale=1000)

    show_title = try_get(data, lambda x: x['show']['title'], compat_str)
    author_name = try_get(data, lambda x: x['author']['fullname'], compat_str)

    # Keep only the image variants that carry a usable URL, in this order.
    candidate_urls = (
        url_or_none(data.get('%s_url' % name))
        for name in ('image_original', 'image_medium', 'image'))
    thumbnails = [
        {'url': image_url} for image_url in candidate_urls if image_url]

    return {
        'id': compat_str(episode_id or data['episode_id']),
        'url': download_url,
        'display_id': data.get('permalink'),
        'title': title,
        'description': data.get('description'),
        'timestamp': unified_timestamp(data.get('published_at')),
        'uploader': author_name,
        'uploader_id': str_or_none(data.get('author_id')),
        'creator': author_name,
        # 'length' is only used when 'duration' yields nothing truthy.
        'duration': _scaled('duration') or _scaled('length'),
        'view_count': _count('play'),
        'like_count': _count('like'),
        'comment_count': _count('message'),
        'format': 'MPEG Layer 3',
        'format_id': 'mp3',
        'container': 'mp3',
        'ext': 'mp3',
        'thumbnails': thumbnails,
        'series': show_title,
        'extractor_key': SpreakerIE.ie_key(),
    }
59
60
class SpreakerIE(InfoExtractor):
    """Extractor for single episodes addressed through api.spreaker.com."""

    # Accepts /episode/<id>, /download/episode/<id> and /v2/episodes/<id>.
    _VALID_URL = r'''(?x)
                    https?://
                        api\.spreaker\.com/
                        (?:
                            (?:download/)?episode|
                            v2/episodes
                        )/
                        (?P<id>\d+)
                    '''
    _TESTS = [{
        'url': 'https://api.spreaker.com/episode/12534508',
        'info_dict': {
            'id': '12534508',
            'display_id': 'swm-ep15-how-to-market-your-music-part-2',
            'ext': 'mp3',
            'title': 'EP:15 | Music Marketing (Likes) - Part 2',
            'description': 'md5:0588c43e27be46423e183076fa071177',
            'timestamp': 1502250336,
            'upload_date': '20170809',
            'uploader': 'SWM',
            'uploader_id': '9780658',
            'duration': 1063.42,
            'view_count': int,
            'like_count': int,
            'comment_count': int,
            'series': 'Success With Music (SWM)',
        },
    }, {
        'url': 'https://api.spreaker.com/download/episode/12534508/swm_ep15_how_to_market_your_music_part_2.mp3',
        'only_matching': True,
    }, {
        'url': 'https://api.spreaker.com/v2/episodes/12534508?export=episode_segments',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Fetch the v2 episode JSON and convert it into an info dict."""
        episode_id = self._match_id(url)
        # Every accepted URL shape is resolved through the same v2 endpoint.
        api_url = 'https://api.spreaker.com/v2/episodes/%s' % episode_id
        payload = self._download_json(api_url, episode_id)
        episode = payload['response']['episode']
        return _extract_episode(episode, episode_id)
103
104
class SpreakerPageIE(InfoExtractor):
    """Extractor for episode pages under spreaker.com/user/<user>/<slug>."""

    _VALID_URL = r'https?://(?:www\.)?spreaker\.com/user/[^/]+/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://www.spreaker.com/user/9780658/swm-ep15-how-to-market-your-music-part-2',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Find the numeric episode id in the page and defer to SpreakerIE."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        # Patterns are passed in this order to _search_regex.
        episode_id_patterns = (
            r'data-episode_id=["\'](?P<id>\d+)',
            r'episode_id\s*:\s*(?P<id>\d+)',
        )
        episode_id = self._search_regex(
            episode_id_patterns, webpage, 'episode id')

        api_url = 'https://api.spreaker.com/episode/%s' % episode_id
        return self.url_result(
            api_url, ie=SpreakerIE.ie_key(), video_id=episode_id)
121
122
class SpreakerShowIE(InfoExtractor):
    """Extractor for whole shows addressed through api.spreaker.com/show."""

    _VALID_URL = r'https?://api\.spreaker\.com/show/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://api.spreaker.com/show/4652058',
        'info_dict': {
            'id': '4652058',
        },
        'playlist_mincount': 118,
    }]

    def _entries(self, show_id):
        """Yield an info dict for every episode of the show, page by page.

        Paging stops when a response has no pager dict, when the pager has
        no non-empty list of results, or once the page number equals the
        pager's 'last_page' value.
        """
        episodes_url = 'https://api.spreaker.com/show/%s/episodes' % show_id
        for page_num in itertools.count(1):
            page = self._download_json(
                episodes_url, show_id,
                note='Downloading JSON page %d' % page_num,
                query={
                    'page': page_num,
                    'max_per_page': 100,
                })
            pager = try_get(page, lambda x: x['response']['pager'], dict)
            if not pager:
                return
            results = pager.get('results')
            if not (results and isinstance(results, list)):
                return
            # Entries that are not dicts are skipped.
            yield from (
                _extract_episode(item)
                for item in results if isinstance(item, dict))
            if page_num == pager.get('last_page'):
                return

    def _real_extract(self, url):
        """Return a playlist of all episodes of the show in the URL."""
        show_id = self._match_id(url)
        entries = self._entries(show_id)
        return self.playlist_result(entries, playlist_id=show_id)
157
158
class SpreakerShowPageIE(InfoExtractor):
    """Extractor for show pages under spreaker.com/show/<slug>."""

    _VALID_URL = r'https?://(?:www\.)?spreaker\.com/show/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://www.spreaker.com/show/success-with-music',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Find the numeric show id in the page and defer to SpreakerShowIE."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        show_id_pattern = r'show_id\s*:\s*(?P<id>\d+)'
        show_id = self._search_regex(show_id_pattern, webpage, 'show id')

        api_url = 'https://api.spreaker.com/show/%s' % show_id
        return self.url_result(
            api_url, ie=SpreakerShowIE.ie_key(), video_id=show_id)