]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/art19.py
[ie/matchtv] Fix extractor (#10190)
[yt-dlp.git] / yt_dlp / extractor / art19.py
1 import re
2
3 from .common import InfoExtractor
4 from ..utils import float_or_none, int_or_none, parse_iso8601, url_or_none
5 from ..utils.traversal import traverse_obj
6
7
8 class Art19IE(InfoExtractor):
9 _UUID_REGEX = r'[\da-f]{8}-?[\da-f]{4}-?[\da-f]{4}-?[\da-f]{4}-?[\da-f]{12}'
10 _VALID_URL = [
11 rf'https?://(?:www\.)?art19\.com/shows/[^/#?]+/episodes/(?P<id>{_UUID_REGEX})',
12 rf'https?://rss\.art19\.com/episodes/(?P<id>{_UUID_REGEX})\.mp3',
13 ]
14 _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL[0]})']
15
16 _TESTS = [{
17 'url': 'https://rss.art19.com/episodes/5ba1413c-48b8-472b-9cc3-cfd952340bdb.mp3',
18 'info_dict': {
19 'id': '5ba1413c-48b8-472b-9cc3-cfd952340bdb',
20 'ext': 'mp3',
21 'title': 'Why Did DeSantis Drop Out?',
22 'series': 'The Daily Briefing',
23 'release_timestamp': 1705941275,
24 'description': 'md5:da38961da4a3f7e419471365e3c6b49f',
25 'episode': 'Episode 582',
26 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
27 'series_id': 'ed52a0ab-08b1-4def-8afc-549e4d93296d',
28 'upload_date': '20240122',
29 'timestamp': 1705940815,
30 'episode_number': 582,
31 'modified_date': '20240122',
32 'episode_id': '5ba1413c-48b8-472b-9cc3-cfd952340bdb',
33 'modified_timestamp': 1705941275,
34 'release_date': '20240122',
35 'duration': 527.4,
36 },
37 }, {
38 'url': 'https://art19.com/shows/scamfluencers/episodes/8319b776-4153-4d22-8630-631f204a03dd',
39 'info_dict': {
40 'id': '8319b776-4153-4d22-8630-631f204a03dd',
41 'ext': 'mp3',
42 'title': 'Martha Stewart: The Homemaker Hustler Part 2',
43 'modified_date': '20240116',
44 'upload_date': '20240105',
45 'modified_timestamp': 1705435802,
46 'episode_id': '8319b776-4153-4d22-8630-631f204a03dd',
47 'series_id': 'd3c9b8ca-26b3-42f4-9bd8-21d1a9031e75',
48 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
49 'description': 'md5:4aa7cfd1358dc57e729835bc208d7893',
50 'release_timestamp': 1705305660,
51 'release_date': '20240115',
52 'timestamp': 1704481536,
53 'episode_number': 88,
54 'series': 'Scamfluencers',
55 'duration': 2588.37501,
56 'episode': 'Episode 88',
57 },
58 }]
59 _WEBPAGE_TESTS = [{
60 'url': 'https://www.nu.nl/formule-1/6291456/verstappen-wordt-een-synoniem-voor-formule-1.html',
61 'info_dict': {
62 'id': '7d42626a-7301-47db-bb8a-3b6f054d77d7',
63 'ext': 'mp3',
64 'title': "'Verstappen wordt een synoniem voor Formule 1'",
65 'season': 'Seizoen 6',
66 'description': 'md5:39a7159a31c4cda312b2e893bdd5c071',
67 'episode_id': '7d42626a-7301-47db-bb8a-3b6f054d77d7',
68 'duration': 3061.82111,
69 'series_id': '93f4e113-2a60-4609-a564-755058fa40d8',
70 'release_date': '20231126',
71 'modified_timestamp': 1701156004,
72 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
73 'season_number': 6,
74 'episode_number': 52,
75 'modified_date': '20231128',
76 'upload_date': '20231126',
77 'timestamp': 1701025981,
78 'season_id': '36097c1e-7455-490d-a2fe-e2f10b4d5f26',
79 'series': 'De Boordradio',
80 'release_timestamp': 1701026308,
81 'episode': 'Episode 52',
82 },
83 }, {
84 'url': 'https://www.wishtv.com/podcast-episode/larry-bucshon-announces-retirement-from-congress/',
85 'info_dict': {
86 'id': '8da368bd-08d1-46d0-afaa-c134a4af7dc0',
87 'ext': 'mp3',
88 'title': 'Larry Bucshon announces retirement from congress',
89 'upload_date': '20240115',
90 'episode_number': 148,
91 'episode': 'Episode 148',
92 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
93 'release_date': '20240115',
94 'timestamp': 1705328205,
95 'release_timestamp': 1705329275,
96 'series': 'All INdiana Politics',
97 'modified_date': '20240117',
98 'modified_timestamp': 1705458901,
99 'series_id': 'c4af6c27-b10f-4ff2-9f84-0f407df86ff1',
100 'episode_id': '8da368bd-08d1-46d0-afaa-c134a4af7dc0',
101 'description': 'md5:53b5239e4d14973a87125c217c255b2a',
102 'duration': 1256.18848,
103 },
104 }]
105
106 @classmethod
107 def _extract_embed_urls(cls, url, webpage):
108 yield from super()._extract_embed_urls(url, webpage)
109 for episode_id in re.findall(
110 rf'<div[^>]+\bclass=[\'"][^\'"]*art19-web-player[^\'"]*[\'"][^>]+\bdata-episode-id=[\'"]({cls._UUID_REGEX})[\'"]', webpage):
111 yield f'https://rss.art19.com/episodes/{episode_id}.mp3'
112
113 def _real_extract(self, url):
114 episode_id = self._match_id(url)
115
116 player_metadata = self._download_json(
117 f'https://art19.com/episodes/{episode_id}', episode_id,
118 note='Downloading player metadata', fatal=False,
119 headers={'Accept': 'application/vnd.art19.v0+json'})
120 rss_metadata = self._download_json(
121 f'https://rss.art19.com/episodes/{episode_id}.json', episode_id, fatal=False,
122 note='Downloading RSS metadata')
123
124 formats = [{
125 'format_id': 'direct',
126 'url': f'https://rss.art19.com/episodes/{episode_id}.mp3',
127 'vcodec': 'none',
128 'acodec': 'mp3',
129 }]
130 for fmt_id, fmt_data in traverse_obj(rss_metadata, ('content', 'media', {dict.items}, ...)):
131 if fmt_id == 'waveform_bin':
132 continue
133 fmt_url = traverse_obj(fmt_data, ('url', {url_or_none}))
134 if not fmt_url:
135 continue
136 formats.append({
137 'format_id': fmt_id,
138 'url': fmt_url,
139 'vcodec': 'none',
140 'acodec': fmt_id,
141 'quality': -2 if fmt_id == 'ogg' else -1,
142 })
143
144 return {
145 'id': episode_id,
146 'formats': formats,
147 **traverse_obj(player_metadata, ('episode', {
148 'title': ('title', {str}),
149 'description': ('description_plain', {str}),
150 'episode_id': ('id', {str}),
151 'episode_number': ('episode_number', {int_or_none}),
152 'season_id': ('season_id', {str}),
153 'series_id': ('series_id', {str}),
154 'timestamp': ('created_at', {parse_iso8601}),
155 'release_timestamp': ('released_at', {parse_iso8601}),
156 'modified_timestamp': ('updated_at', {parse_iso8601}),
157 })),
158 **traverse_obj(rss_metadata, ('content', {
159 'title': ('episode_title', {str}),
160 'description': ('episode_description_plain', {str}),
161 'episode_id': ('episode_id', {str}),
162 'episode_number': ('episode_number', {int_or_none}),
163 'season': ('season_title', {str}),
164 'season_id': ('season_id', {str}),
165 'season_number': ('season_number', {int_or_none}),
166 'series': ('series_title', {str}),
167 'series_id': ('series_id', {str}),
168 'thumbnail': ('cover_image', {url_or_none}),
169 'duration': ('duration', {float_or_none}),
170 })),
171 }
172
173
174 class Art19ShowIE(InfoExtractor):
175 _VALID_URL_BASE = r'https?://(?:www\.)?art19\.com/shows/(?P<id>[\w-]+)(?:/embed)?/?'
176 _VALID_URL = [
177 rf'{_VALID_URL_BASE}(?:$|[#?])',
178 r'https?://rss\.art19\.com/(?P<id>[\w-]+)/?(?:$|[#?])',
179 ]
180 _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL_BASE}[^\'"])']
181
182 _TESTS = [{
183 'url': 'https://www.art19.com/shows/5898c087-a14f-48dc-b6fc-a2280a1ff6e0/',
184 'info_dict': {
185 '_type': 'playlist',
186 'id': '5898c087-a14f-48dc-b6fc-a2280a1ff6e0',
187 'display_id': 'echt-gebeurd',
188 'title': 'Echt Gebeurd',
189 'description': 'md5:5fd11dc80b76e51ffd34b6067fd5e560',
190 'timestamp': 1492642167,
191 'upload_date': '20170419',
192 'modified_timestamp': int,
193 'modified_date': str,
194 'tags': 'count:7',
195 },
196 'playlist_mincount': 425,
197 }, {
198 'url': 'https://www.art19.com/shows/echt-gebeurd',
199 'info_dict': {
200 '_type': 'playlist',
201 'id': '5898c087-a14f-48dc-b6fc-a2280a1ff6e0',
202 'display_id': 'echt-gebeurd',
203 'title': 'Echt Gebeurd',
204 'description': 'md5:5fd11dc80b76e51ffd34b6067fd5e560',
205 'timestamp': 1492642167,
206 'upload_date': '20170419',
207 'modified_timestamp': int,
208 'modified_date': str,
209 'tags': 'count:7',
210 },
211 'playlist_mincount': 425,
212 }, {
213 'url': 'https://rss.art19.com/scamfluencers',
214 'info_dict': {
215 '_type': 'playlist',
216 'id': 'd3c9b8ca-26b3-42f4-9bd8-21d1a9031e75',
217 'display_id': 'scamfluencers',
218 'title': 'Scamfluencers',
219 'description': 'md5:7d239d670c0ced6dadbf71c4caf764b7',
220 'timestamp': 1647368573,
221 'upload_date': '20220315',
222 'modified_timestamp': int,
223 'modified_date': str,
224 'tags': [],
225 },
226 'playlist_mincount': 90,
227 }, {
228 'url': 'https://art19.com/shows/enthuellt/embed',
229 'info_dict': {
230 '_type': 'playlist',
231 'id': 'e2cacf57-bb8a-4263-aa81-719bcdd4f80c',
232 'display_id': 'enthuellt',
233 'title': 'Enthüllt',
234 'description': 'md5:17752246643414a2fd51744fc9a1c08e',
235 'timestamp': 1601645860,
236 'upload_date': '20201002',
237 'modified_timestamp': int,
238 'modified_date': str,
239 'tags': 'count:10',
240 },
241 'playlist_mincount': 10,
242 }]
243 _WEBPAGE_TESTS = [{
244 'url': 'https://deconstructingyourself.com/deconstructing-yourself-podcast',
245 'info_dict': {
246 '_type': 'playlist',
247 'id': 'cfbb9b01-c295-4adb-8726-adde7c03cf21',
248 'display_id': 'deconstructing-yourself',
249 'title': 'Deconstructing Yourself',
250 'description': 'md5:dab5082b28b248a35476abf64768854d',
251 'timestamp': 1570581181,
252 'upload_date': '20191009',
253 'modified_timestamp': int,
254 'modified_date': str,
255 'tags': 'count:5',
256 },
257 'playlist_mincount': 80,
258 }, {
259 'url': 'https://chicagoreader.com/columns-opinion/podcasts/ben-joravsky-show-podcast-episodes/',
260 'info_dict': {
261 '_type': 'playlist',
262 'id': '9dfa2c37-ab87-4c13-8388-4897914313ec',
263 'display_id': 'the-ben-joravsky-show',
264 'title': 'The Ben Joravsky Show',
265 'description': 'md5:c0f3ec0ee0dbea764390e521adc8780a',
266 'timestamp': 1550875095,
267 'upload_date': '20190222',
268 'modified_timestamp': int,
269 'modified_date': str,
270 'tags': ['Chicago Politics', 'chicago', 'Ben Joravsky'],
271 },
272 'playlist_mincount': 1900,
273 }]
274
275 @classmethod
276 def _extract_embed_urls(cls, url, webpage):
277 yield from super()._extract_embed_urls(url, webpage)
278 for series_id in re.findall(
279 r'<div[^>]+\bclass=[\'"][^\'"]*art19-web-player[^\'"]*[\'"][^>]+\bdata-series-id=[\'"]([\w-]+)[\'"]', webpage):
280 yield f'https://art19.com/shows/{series_id}'
281
282 def _real_extract(self, url):
283 series_id = self._match_id(url)
284 series_metadata = self._download_json(
285 f'https://art19.com/series/{series_id}', series_id, note='Downloading series metadata',
286 headers={'Accept': 'application/vnd.art19.v0+json'})
287
288 return {
289 '_type': 'playlist',
290 'entries': [
291 self.url_result(f'https://rss.art19.com/episodes/{episode_id}.mp3', Art19IE)
292 for episode_id in traverse_obj(series_metadata, ('series', 'episode_ids', ..., {str}))
293 ],
294 **traverse_obj(series_metadata, ('series', {
295 'id': ('id', {str}),
296 'display_id': ('slug', {str}),
297 'title': ('title', {str}),
298 'description': ('description_plain', {str}),
299 'timestamp': ('created_at', {parse_iso8601}),
300 'modified_timestamp': ('updated_at', {parse_iso8601}),
301 })),
302 'tags': traverse_obj(series_metadata, ('tags', ..., 'name', {str})),
303 }