]>
Commit | Line | Data |
---|---|---|
1 | import re | |
2 | ||
3 | from .common import InfoExtractor | |
4 | from ..utils import float_or_none, int_or_none, parse_iso8601, url_or_none | |
5 | from ..utils.traversal import traverse_obj | |
6 | ||
7 | ||
class Art19IE(InfoExtractor):
    """Extractor for a single Art19 episode.

    Accepts episode pages on art19.com and direct ``rss.art19.com`` MP3
    links, and detects episodes embedded in other pages either through an
    iframe or through an ``art19-web-player`` ``<div>`` that carries a
    ``data-episode-id`` attribute.
    """

    # The hyphens are optional, so dashed and undashed UUIDs both match
    _UUID_REGEX = r'[\da-f]{8}-?[\da-f]{4}-?[\da-f]{4}-?[\da-f]{4}-?[\da-f]{12}'
    _VALID_URL = [
        rf'https?://(?:www\.)?art19\.com/shows/[^/#?]+/episodes/(?P<id>{_UUID_REGEX})',
        rf'https?://rss\.art19\.com/episodes/(?P<id>{_UUID_REGEX})\.mp3',
    ]
    # Only the art19.com episode page pattern is searched for in iframes
    _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL[0]})']

    _TESTS = [{
        'url': 'https://rss.art19.com/episodes/5ba1413c-48b8-472b-9cc3-cfd952340bdb.mp3',
        'info_dict': {
            'id': '5ba1413c-48b8-472b-9cc3-cfd952340bdb',
            'ext': 'mp3',
            'title': 'Why Did DeSantis Drop Out?',
            'series': 'The Daily Briefing',
            'release_timestamp': 1705941275,
            'description': 'md5:da38961da4a3f7e419471365e3c6b49f',
            'episode': 'Episode 582',
            'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
            'series_id': 'ed52a0ab-08b1-4def-8afc-549e4d93296d',
            'upload_date': '20240122',
            'timestamp': 1705940815,
            'episode_number': 582,
            'modified_date': '20240122',
            'episode_id': '5ba1413c-48b8-472b-9cc3-cfd952340bdb',
            'modified_timestamp': 1705941275,
            'release_date': '20240122',
            'duration': 527.4,
        },
    }, {
        'url': 'https://art19.com/shows/scamfluencers/episodes/8319b776-4153-4d22-8630-631f204a03dd',
        'info_dict': {
            'id': '8319b776-4153-4d22-8630-631f204a03dd',
            'ext': 'mp3',
            'title': 'Martha Stewart: The Homemaker Hustler Part 2',
            'modified_date': '20240116',
            'upload_date': '20240105',
            'modified_timestamp': 1705435802,
            'episode_id': '8319b776-4153-4d22-8630-631f204a03dd',
            'series_id': 'd3c9b8ca-26b3-42f4-9bd8-21d1a9031e75',
            'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
            'description': 'md5:4aa7cfd1358dc57e729835bc208d7893',
            'release_timestamp': 1705305660,
            'release_date': '20240115',
            'timestamp': 1704481536,
            'episode_number': 88,
            'series': 'Scamfluencers',
            'duration': 2588.37501,
            'episode': 'Episode 88',
        },
    }]
    _WEBPAGE_TESTS = [{
        'url': 'https://www.nu.nl/formule-1/6291456/verstappen-wordt-een-synoniem-voor-formule-1.html',
        'info_dict': {
            'id': '7d42626a-7301-47db-bb8a-3b6f054d77d7',
            'ext': 'mp3',
            'title': "'Verstappen wordt een synoniem voor Formule 1'",
            'season': 'Seizoen 6',
            'description': 'md5:39a7159a31c4cda312b2e893bdd5c071',
            'episode_id': '7d42626a-7301-47db-bb8a-3b6f054d77d7',
            'duration': 3061.82111,
            'series_id': '93f4e113-2a60-4609-a564-755058fa40d8',
            'release_date': '20231126',
            'modified_timestamp': 1701156004,
            'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
            'season_number': 6,
            'episode_number': 52,
            'modified_date': '20231128',
            'upload_date': '20231126',
            'timestamp': 1701025981,
            'season_id': '36097c1e-7455-490d-a2fe-e2f10b4d5f26',
            'series': 'De Boordradio',
            'release_timestamp': 1701026308,
            'episode': 'Episode 52',
        },
    }, {
        'url': 'https://www.wishtv.com/podcast-episode/larry-bucshon-announces-retirement-from-congress/',
        'info_dict': {
            'id': '8da368bd-08d1-46d0-afaa-c134a4af7dc0',
            'ext': 'mp3',
            'title': 'Larry Bucshon announces retirement from congress',
            'upload_date': '20240115',
            'episode_number': 148,
            'episode': 'Episode 148',
            'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
            'release_date': '20240115',
            'timestamp': 1705328205,
            'release_timestamp': 1705329275,
            'series': 'All INdiana Politics',
            'modified_date': '20240117',
            'modified_timestamp': 1705458901,
            'series_id': 'c4af6c27-b10f-4ff2-9f84-0f407df86ff1',
            'episode_id': '8da368bd-08d1-46d0-afaa-c134a4af7dc0',
            'description': 'md5:53b5239e4d14973a87125c217c255b2a',
            'duration': 1256.18848,
        },
    }]

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Yield the episode URLs embedded in *webpage*.

        Whatever the parent implementation yields comes first. After that,
        one ``rss.art19.com`` MP3 URL is yielded for every
        ``art19-web-player`` ``<div>`` with a ``data-episode-id`` attribute.
        """
        yield from super()._extract_embed_urls(url, webpage)

        # The class attribute has to appear before data-episode-id in the tag
        player_div_re = (
            r'<div[^>]+\bclass=[\'"][^\'"]*art19-web-player[^\'"]*[\'"]'
            rf'[^>]+\bdata-episode-id=[\'"]({cls._UUID_REGEX})[\'"]')
        for mobj in re.finditer(player_div_re, webpage):
            yield f'https://rss.art19.com/episodes/{mobj.group(1)}.mp3'

    def _real_extract(self, url):
        """Return the info dict for the episode that *url* points to.

        Both metadata requests are made with ``fatal=False``. The ``direct``
        MP3 format is always present; any further formats come from the
        ``content.media`` mapping of the RSS metadata.
        """
        episode_id = self._match_id(url)

        player_metadata = self._download_json(
            f'https://art19.com/episodes/{episode_id}', episode_id,
            note='Downloading player metadata', fatal=False,
            headers={'Accept': 'application/vnd.art19.v0+json'})
        rss_metadata = self._download_json(
            f'https://rss.art19.com/episodes/{episode_id}.json', episode_id,
            note='Downloading RSS metadata', fatal=False)

        # This URL is built from the ID alone, so it needs no metadata
        formats = [{
            'format_id': 'direct',
            'url': f'https://rss.art19.com/episodes/{episode_id}.mp3',
            'vcodec': 'none',
            'acodec': 'mp3',
        }]

        media_entries = traverse_obj(rss_metadata, ('content', 'media', {dict.items}, ...))
        for media_key, media_info in media_entries:
            # 'waveform_bin' entries are never offered as a format
            if media_key == 'waveform_bin':
                continue
            media_url = traverse_obj(media_info, ('url', {url_or_none}))
            if media_url:
                # 'ogg' gets a lower quality value than the other media entries
                if media_key == 'ogg':
                    quality = -2
                else:
                    quality = -1
                formats.append({
                    'format_id': media_key,
                    'url': media_url,
                    'vcodec': 'none',
                    # The media key doubles as the audio codec name
                    'acodec': media_key,
                    'quality': quality,
                })

        player_fields = traverse_obj(player_metadata, ('episode', {
            'title': ('title', {str}),
            'description': ('description_plain', {str}),
            'episode_id': ('id', {str}),
            'episode_number': ('episode_number', {int_or_none}),
            'season_id': ('season_id', {str}),
            'series_id': ('series_id', {str}),
            'timestamp': ('created_at', {parse_iso8601}),
            'release_timestamp': ('released_at', {parse_iso8601}),
            'modified_timestamp': ('updated_at', {parse_iso8601}),
        }))
        rss_fields = traverse_obj(rss_metadata, ('content', {
            'title': ('episode_title', {str}),
            'description': ('episode_description_plain', {str}),
            'episode_id': ('episode_id', {str}),
            'episode_number': ('episode_number', {int_or_none}),
            'season': ('season_title', {str}),
            'season_id': ('season_id', {str}),
            'season_number': ('season_number', {int_or_none}),
            'series': ('series_title', {str}),
            'series_id': ('series_id', {str}),
            'thumbnail': ('cover_image', {url_or_none}),
            'duration': ('duration', {float_or_none}),
        }))

        # RSS fields are unpacked last, so they override player fields
        # wherever both sources provide the same key
        return {
            'id': episode_id,
            'formats': formats,
            **player_fields,
            **rss_fields,
        }
172 | ||
173 | ||
class Art19ShowIE(InfoExtractor):
    """Extractor for an Art19 show, returned as a playlist of its episodes.

    Accepts show pages on art19.com (optionally with ``/embed``) and
    ``rss.art19.com/<show>`` URLs, and detects shows embedded in other pages
    either through an iframe or through an ``art19-web-player`` ``<div>``
    that carries a ``data-series-id`` attribute.
    """

    _VALID_URL_BASE = r'https?://(?:www\.)?art19\.com/shows/(?P<id>[\w-]+)(?:/embed)?/?'
    _VALID_URL = [
        rf'{_VALID_URL_BASE}(?:$|[#?])',
        r'https?://rss\.art19\.com/(?P<id>[\w-]+)/?(?:$|[#?])',
    ]
    # Capture the complete iframe src: the show URL base, optionally followed
    # by a whole query string or fragment, up to the closing quote.
    # Anchoring on the closing quote ensures that
    #   * a src with nothing after the base (e.g. ".../shows/foo/embed") is
    #     captured in full rather than as a truncated ".../shows/foo/e",
    #   * query strings are kept whole instead of being cut after one char,
    #   * episode iframes (".../shows/foo/episodes/<uuid>") are not matched.
    _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL_BASE}(?:[#?][^\'"]*)?)[\'"]']

    _TESTS = [{
        'url': 'https://www.art19.com/shows/5898c087-a14f-48dc-b6fc-a2280a1ff6e0/',
        'info_dict': {
            '_type': 'playlist',
            'id': '5898c087-a14f-48dc-b6fc-a2280a1ff6e0',
            'display_id': 'echt-gebeurd',
            'title': 'Echt Gebeurd',
            'description': 'md5:5fd11dc80b76e51ffd34b6067fd5e560',
            'timestamp': 1492642167,
            'upload_date': '20170419',
            'modified_timestamp': int,
            'modified_date': str,
            'tags': 'count:7',
        },
        'playlist_mincount': 425,
    }, {
        'url': 'https://www.art19.com/shows/echt-gebeurd',
        'info_dict': {
            '_type': 'playlist',
            'id': '5898c087-a14f-48dc-b6fc-a2280a1ff6e0',
            'display_id': 'echt-gebeurd',
            'title': 'Echt Gebeurd',
            'description': 'md5:5fd11dc80b76e51ffd34b6067fd5e560',
            'timestamp': 1492642167,
            'upload_date': '20170419',
            'modified_timestamp': int,
            'modified_date': str,
            'tags': 'count:7',
        },
        'playlist_mincount': 425,
    }, {
        'url': 'https://rss.art19.com/scamfluencers',
        'info_dict': {
            '_type': 'playlist',
            'id': 'd3c9b8ca-26b3-42f4-9bd8-21d1a9031e75',
            'display_id': 'scamfluencers',
            'title': 'Scamfluencers',
            'description': 'md5:7d239d670c0ced6dadbf71c4caf764b7',
            'timestamp': 1647368573,
            'upload_date': '20220315',
            'modified_timestamp': int,
            'modified_date': str,
            'tags': [],
        },
        'playlist_mincount': 90,
    }, {
        'url': 'https://art19.com/shows/enthuellt/embed',
        'info_dict': {
            '_type': 'playlist',
            'id': 'e2cacf57-bb8a-4263-aa81-719bcdd4f80c',
            'display_id': 'enthuellt',
            'title': 'Enthüllt',
            'description': 'md5:17752246643414a2fd51744fc9a1c08e',
            'timestamp': 1601645860,
            'upload_date': '20201002',
            'modified_timestamp': int,
            'modified_date': str,
            'tags': 'count:10',
        },
        'playlist_mincount': 10,
    }]
    _WEBPAGE_TESTS = [{
        'url': 'https://deconstructingyourself.com/deconstructing-yourself-podcast',
        'info_dict': {
            '_type': 'playlist',
            'id': 'cfbb9b01-c295-4adb-8726-adde7c03cf21',
            'display_id': 'deconstructing-yourself',
            'title': 'Deconstructing Yourself',
            'description': 'md5:dab5082b28b248a35476abf64768854d',
            'timestamp': 1570581181,
            'upload_date': '20191009',
            'modified_timestamp': int,
            'modified_date': str,
            'tags': 'count:5',
        },
        'playlist_mincount': 80,
    }, {
        'url': 'https://chicagoreader.com/columns-opinion/podcasts/ben-joravsky-show-podcast-episodes/',
        'info_dict': {
            '_type': 'playlist',
            'id': '9dfa2c37-ab87-4c13-8388-4897914313ec',
            'display_id': 'the-ben-joravsky-show',
            'title': 'The Ben Joravsky Show',
            'description': 'md5:c0f3ec0ee0dbea764390e521adc8780a',
            'timestamp': 1550875095,
            'upload_date': '20190222',
            'modified_timestamp': int,
            'modified_date': str,
            'tags': ['Chicago Politics', 'chicago', 'Ben Joravsky'],
        },
        'playlist_mincount': 1900,
    }]

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Yield the show URLs embedded in *webpage*.

        Whatever the parent implementation yields comes first. After that,
        one ``art19.com/shows/<id>`` URL is yielded for every
        ``art19-web-player`` ``<div>`` with a ``data-series-id`` attribute.
        """
        yield from super()._extract_embed_urls(url, webpage)
        # The class attribute has to appear before data-series-id in the tag
        for series_id in re.findall(
                r'<div[^>]+\bclass=[\'"][^\'"]*art19-web-player[^\'"]*[\'"][^>]+\bdata-series-id=[\'"]([\w-]+)[\'"]', webpage):
            yield f'https://art19.com/shows/{series_id}'

    def _real_extract(self, url):
        """Return a playlist info dict for the show that *url* points to.

        The series metadata request is made with the default error
        handling (no ``fatal=False``), unlike the episode extractor's
        metadata requests.
        """
        # This is whatever identifier the URL carried; the playlist 'id'
        # below is taken from the metadata response instead
        series_id = self._match_id(url)
        series_metadata = self._download_json(
            f'https://art19.com/series/{series_id}', series_id, note='Downloading series metadata',
            headers={'Accept': 'application/vnd.art19.v0+json'})

        return {
            '_type': 'playlist',
            # Every episode is handed to Art19IE through its rss.art19.com URL
            'entries': [
                self.url_result(f'https://rss.art19.com/episodes/{episode_id}.mp3', Art19IE)
                for episode_id in traverse_obj(series_metadata, ('series', 'episode_ids', ..., {str}))
            ],
            **traverse_obj(series_metadata, ('series', {
                'id': ('id', {str}),
                'display_id': ('slug', {str}),
                'title': ('title', {str}),
                'description': ('description_plain', {str}),
                'timestamp': ('created_at', {parse_iso8601}),
                'modified_timestamp': ('updated_at', {parse_iso8601}),
            })),
            # Tags are read from the top level of the response, not from 'series'
            'tags': traverse_obj(series_metadata, ('tags', ..., 'name', {str})),
        }