]>
Commit | Line | Data |
---|---|---|
1 | import itertools | |
2 | import json | |
3 | import math | |
4 | import re | |
5 | import urllib.parse | |
6 | ||
7 | from .common import InfoExtractor | |
8 | from ..utils import ( | |
9 | ExtractorError, | |
10 | InAdvancePagedList, | |
11 | determine_ext, | |
12 | extract_attributes, | |
13 | int_or_none, | |
14 | js_to_json, | |
15 | parse_iso8601, | |
16 | strip_or_none, | |
17 | traverse_obj, | |
18 | unescapeHTML, | |
19 | unified_timestamp, | |
20 | url_or_none, | |
21 | urljoin, | |
22 | ) | |
23 | ||
24 | ||
class PolskieRadioBaseExtractor(InfoExtractor):
    # Shared helper for extractors that scrape embedded ``data-media`` players.

    def _extract_webpage_player_entries(self, webpage, playlist_id, base_data):
        """Yield one info dict for every distinct ``data-media`` player in *webpage*.

        Each ``data-media`` attribute is parsed as (HTML-escaped) JSON. Blobs
        that fail to parse, or that lack a ``file`` or ``desc`` value, are
        skipped, as are players whose media URL was already yielded.

        :param webpage: HTML markup to scan for players.
        :param playlist_id: id passed to the JSON parser for error reporting.
        :param base_data: dict of fields copied into every yielded entry;
            it is never mutated.
        :returns: generator of info dicts carrying ``id``, ``url``,
            ``duration``, ``vcodec`` and, when ``desc`` decodes to a
            non-empty string, ``title``.
        """
        media_urls = set()

        for data_media in re.findall(r'<[^>]+data-media="?({[^>]+})"?', webpage):
            media = self._parse_json(data_media, playlist_id, transform_source=unescapeHTML, fatal=False)
            # Parsing is non-fatal, so a malformed blob may produce no dict at
            # all; skip it instead of failing on ``media.get``.
            if not media or not media.get('file') or not media.get('desc'):
                continue
            media_url = self._proto_relative_url(media['file'])
            if media_url in media_urls:
                # The same recording is often embedded more than once per page
                continue
            media_urls.add(media_url)
            entry = base_data.copy()
            entry.update({
                'id': str(media['id']),
                'url': media_url,
                'duration': int_or_none(media.get('length')),
                'vcodec': 'none' if media.get('provider') == 'audio' else None,
            })
            # ``desc`` is percent-encoded; it overrides the base title only
            # when it decodes to something non-empty.
            entry_title = urllib.parse.unquote(media['desc'])
            if entry_title:
                entry['title'] = entry_title
            yield entry
48 | ||
49 | ||
class PolskieRadioLegacyIE(PolskieRadioBaseExtractor):
    # legacy sites
    IE_NAME = 'polskieradio:legacy'
    _VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/[Aa]rtykul/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.polskieradio.pl/8/2382/Artykul/2534482,Zagarysci-Poezja-jak-spoiwo',
        'info_dict': {
            'id': '2534482',
            'title': 'Żagaryści. Poezja jak spoiwo',
            'description': 'md5:f18d95d5dcba747a09b635e21a4c0695',
        },
        'playlist': [{
            'md5': 'd07559829f61d5a93a75755987ded760',
            'info_dict': {
                'id': '2516679',
                'ext': 'mp3',
                'title': 'md5:c6e1234e0b747ad883cb91b7ad06b98c',
                'timestamp': 1592654400,
                'upload_date': '20200620',
                'duration': 1430,
                'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$',
            },
        }],
    }, {
        # PR4 audition - other frontend
        'url': 'https://www.polskieradio.pl/10/6071/Artykul/2610977,Poglos-29-pazdziernika-godz-2301',
        'info_dict': {
            'id': '2610977',
            'ext': 'mp3',
            'title': 'Pogłos 29 października godz. 23:01',
        },
    }, {
        'url': 'https://polskieradio24.pl/130/4503/Artykul/2621876,Narusza-nasza-suwerennosc-Publicysci-o-uzaleznieniu-funduszy-UE-od-praworzadnosci',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract a legacy article as a playlist, or as a single recording.

        If the request lands on a URL handled by ``PolskieRadioIE``, the
        result is delegated to that extractor. When no article body is
        found, a single entry is built from the ``source:`` URL in the page.
        """
        playlist_id = self._match_id(url)

        webpage, urlh = self._download_webpage_handle(url, playlist_id)
        final_url = urlh.url
        if PolskieRadioIE.suitable(final_url):
            return self.url_result(final_url, PolskieRadioIE, playlist_id)

        # Article body: everything between the article container and the tags box
        article_html = self._search_regex(
            r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>',
            webpage, 'content', default=None)

        raw_datetime = self._html_search_regex(
            r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>',
            webpage, 'timestamp', default=None)
        timestamp = unified_timestamp(raw_datetime)

        thumbnail_url = self._og_search_thumbnail(webpage, default=None)

        title = self._og_search_title(webpage).strip()

        description = strip_or_none(self._og_search_description(webpage, default=None))
        if description is not None:
            # Replace non-breaking spaces with plain ones
            description = description.replace('\xa0', ' ')

        if content_found := bool(article_html):
            player_entries = self._extract_webpage_player_entries(article_html, playlist_id, {
                'title': title,
                'timestamp': timestamp,
                'thumbnail': thumbnail_url,
            })
            return self.playlist_result(player_entries, playlist_id, title, description)

        # No article body: fall back to the recording URL embedded in the page
        record_url = self._proto_relative_url(
            self._search_regex(
                r"source:\s*'(//static\.prsa\.pl/[^']+)'",
                webpage, 'audition record url'))
        return {
            'id': playlist_id,
            'url': record_url,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'thumbnail': thumbnail_url,
        }
128 | ||
129 | ||
class PolskieRadioIE(PolskieRadioBaseExtractor):
    # new next.js sites
    _VALID_URL = r'https?://(?:[^/]+\.)?(?:polskieradio(?:24)?|radiokierowcow)\.pl/artykul/(?P<id>\d+)'
    _TESTS = [{
        # articleData, attachments
        'url': 'https://jedynka.polskieradio.pl/artykul/1587943',
        'info_dict': {
            'id': '1587943',
            'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie',
            'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5',
        },
        'playlist': [{
            'md5': '2984ee6ce9046d91fc233bc1a864a09a',
            'info_dict': {
                'id': '7a85d429-5356-4def-a347-925e4ae7406b',
                'ext': 'mp3',
                'title': 'md5:d4623290d4ac983bf924061c75c23a0d',
            },
        }],
    }, {
        # post, legacy html players
        'url': 'https://trojka.polskieradio.pl/artykul/2589163,Czy-wciaz-otrzymujemy-zdjecia-z-sond-Voyager',
        'info_dict': {
            'id': '2589163',
            'title': 'Czy wciąż otrzymujemy zdjęcia z sond Voyager?',
            'description': 'md5:cf1a7f348d63a2db9c0d7a63d1669473',
        },
        'playlist': [{
            'info_dict': {
                'id': '2577880',
                'ext': 'mp3',
                'title': 'md5:a57d10a0c02abd34dd675cb33707ad5a',
                'duration': 321,
            },
        }],
    }, {
        # data, legacy
        'url': 'https://radiokierowcow.pl/artykul/2694529',
        'info_dict': {
            'id': '2694529',
            'title': 'Zielona fala reliktem przeszłości?',
            'description': 'md5:f20a9a7ed9cb58916c54add94eae3bc0',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://trojka.polskieradio.pl/artykul/1632955',
        'only_matching': True,
    }, {
        # with mp4 video
        'url': 'https://trojka.polskieradio.pl/artykul/1634903',
        'only_matching': True,
    }, {
        'url': 'https://jedynka.polskieradio.pl/artykul/3042436,Polityka-wschodnia-ojca-i-syna-Wladyslawa-Lokietka-i-Kazimierza-Wielkiego',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build a playlist from an article's audio attachments.

        The article record is read from the page's Next.js data. When it
        lists no ``Audio`` attachments, players embedded in the article
        content are used instead.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)
        nextjs_data = self._search_nextjs_data(webpage, playlist_id)

        # The article record sits under one of several keys; take the first hit
        article_data = traverse_obj(
            nextjs_data,
            ('props', 'pageProps', (('data', 'articleData'), 'post', 'data')),
            get_all=False)

        title = strip_or_none(article_data['title'])

        description = strip_or_none(article_data.get('lead'))

        entries = []
        for attachment in article_data.get('attachments') or ():
            if attachment.get('fileType') not in ('Audio', ):
                continue
            file_url = attachment['file']
            entries.append({
                'url': file_url,
                'ext': determine_ext(attachment.get('fileName')),
                # The attachment id is the UUID embedded in its file URL
                'id': self._search_regex(
                    r'([a-f\d]{8}-(?:[a-f\d]{4}-){3}[a-f\d]{12})', file_url, 'entry id'),
                'title': strip_or_none(attachment.get('description')) or title,
            })

        if not entries:
            # some legacy articles have no json attachments, but players in body
            entries = self._extract_webpage_player_entries(article_data['content'], playlist_id, {
                'title': title,
            })

        return self.playlist_result(entries, playlist_id, title, description)
214 | ||
215 | ||
class PolskieRadioAuditionIE(InfoExtractor):
    # new next.js sites
    IE_NAME = 'polskieradio:audition'
    _VALID_URL = r'https?://(?:[^/]+\.)?polskieradio\.pl/audycj[ae]/(?P<id>\d+)'
    _TESTS = [{
        # articles, PR1
        'url': 'https://jedynka.polskieradio.pl/audycje/5102',
        'info_dict': {
            'id': '5102',
            'title': 'Historia żywa',
            'thumbnail': r're:https://static\.prsa\.pl/images/.+',
        },
        'playlist_mincount': 38,
    }, {
        # episodes, PR1
        'url': 'https://jedynka.polskieradio.pl/audycje/5769',
        'info_dict': {
            'id': '5769',
            'title': 'AgroFakty',
            'thumbnail': r're:https://static\.prsa\.pl/images/.+',
        },
        'playlist_mincount': 269,
    }, {
        # both episodes and articles, PR3
        'url': 'https://trojka.polskieradio.pl/audycja/8906',
        'info_dict': {
            'id': '8906',
            'title': 'Trójka budzi',
            'thumbnail': r're:https://static\.prsa\.pl/images/.+',
        },
        'playlist_mincount': 722,
    }, {
        # some articles were "promoted to main page" and thus link to old frontend
        'url': 'https://trojka.polskieradio.pl/audycja/305',
        'info_dict': {
            'id': '305',
            'title': 'Co w mowie piszczy?',
            'thumbnail': r're:https://static\.prsa\.pl/images/.+',
        },
        'playlist_count': 1523,
    }]

    def _call_lp3(self, path, query, video_id, note):
        """Request *path* from the lp3 API with *query* and return the decoded JSON."""
        api_url = f'https://lp3test.polskieradio.pl/{path}'
        api_headers = {'x-api-key': '9bf6c5a2-a7d0-4980-9ed7-a3f7291f2a81'}
        return self._download_json(
            api_url, video_id, note, query=query, headers=api_headers)

    def _entries(self, playlist_id, has_episodes, has_articles):
        """Yield episode entries first, then article entries, page by page.

        Each listing is only requested when its flag is truthy, and paging
        stops at the first response without a non-empty ``data`` value.
        """
        if has_episodes:
            for page_idx in itertools.count(0):
                episode_query = {
                    'categoryId': playlist_id,
                    'PageSize': 10,
                    'skip': page_idx,
                    'format': 400,
                }
                page = self._call_lp3(
                    'AudioArticle/GetListByCategoryId', episode_query,
                    playlist_id, f'Downloading episode list page {page_idx + 1}')
                if not traverse_obj(page, 'data'):
                    break
                for episode in page['data']:
                    yield {
                        'id': str(episode['id']),
                        'url': episode['file'],
                        'title': episode.get('title'),
                        'duration': int_or_none(episode.get('duration')),
                        'timestamp': parse_iso8601(episode.get('datePublic')),
                    }

        if has_articles:
            for page_idx in itertools.count(0):
                article_query = {
                    'categoryId': playlist_id,
                    'PageSize': 9,
                    'skip': page_idx,
                    'format': 400,
                }
                page = self._call_lp3(
                    'Article/GetListByCategoryId', article_query,
                    playlist_id, f'Downloading article list page {page_idx + 1}')
                if not traverse_obj(page, 'data'):
                    break
                for article in page['data']:
                    # Articles are resolved through their own URL; the metadata
                    # given here is attached to whatever that URL yields.
                    yield {
                        '_type': 'url_transparent',
                        'id': str(article['id']),
                        'url': article['url'],
                        'title': article.get('shortTitle'),
                        'description': traverse_obj(article, ('description', 'lead')),
                        'timestamp': parse_iso8601(article.get('datePublic')),
                    }

    def _real_extract(self, url):
        """Return a playlist of the audition's episodes and/or articles."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)
        nextjs_data = self._search_nextjs_data(webpage, playlist_id)
        # Page props may or may not be nested under an extra ``data`` key
        page_props = traverse_obj(
            nextjs_data, ('props', 'pageProps', ('data', None)), get_all=False)

        has_episodes = bool(traverse_obj(page_props, 'episodes', 'audios'))
        has_articles = bool(traverse_obj(page_props, 'articles'))

        entries = self._entries(playlist_id, has_episodes, has_articles)
        return self.playlist_result(
            entries, playlist_id,
            title=traverse_obj(page_props, ('details', 'name')),
            description=traverse_obj(page_props, ('details', 'description', 'lead')),
            thumbnail=traverse_obj(page_props, ('details', 'photo')))
318 | ||
319 | ||
class PolskieRadioCategoryIE(InfoExtractor):
    # legacy sites
    IE_NAME = 'polskieradio:category'
    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/(?:\d+(?:,[^/]+)?/|[^/]+/Tag)(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow',
        'info_dict': {
            'id': '4143',
            'title': 'Kierunek Kraków',
        },
        'playlist_mincount': 61,
    }, {
        'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka',
        'info_dict': {
            'id': '214',
            'title': 'Muzyka',
        },
        'playlist_mincount': 61,
    }, {
        # billennium tabs
        'url': 'https://www.polskieradio.pl/8/2385',
        'info_dict': {
            'id': '2385',
            'title': 'Droga przez mąkę',
        },
        'playlist_mincount': 111,
    }, {
        'url': 'https://www.polskieradio.pl/10/4930',
        'info_dict': {
            'id': '4930',
            'title': 'Teraz K-pop!',
        },
        'playlist_mincount': 392,
    }, {
        # post back pages, audio content directly without articles
        'url': 'https://www.polskieradio.pl/8,dwojka/7376,nowa-mowa',
        'info_dict': {
            'id': '7376',
            'title': 'Nowa mowa',
        },
        'playlist_mincount': 244,
    }, {
        'url': 'https://www.polskieradio.pl/Krzysztof-Dziuba/Tag175458',
        'info_dict': {
            'id': '175458',
            'title': 'Krzysztof Dziuba',
        },
        'playlist_mincount': 420,
    }, {
        'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs the legacy article extractor already handles."""
        if PolskieRadioLegacyIE.suitable(url):
            return False
        return super().suitable(url)

    def _entries(self, url, page, category_id):
        """Yield article links and inline audio entries across all pages.

        The first page's markup decides which of three pagination schemes is
        followed: "TB_LoadTab" tab requests, ``__doPostBack`` form posts, or
        plain "next" links. Iteration ends when no next page can be found.
        """
        # Argument names of the tab request, in the order TB_LoadTab(...) lists them
        tab_param_names = (
            'boxInstanceId', 'tabId', 'categoryType', 'sectionId', 'categoryId', 'pagerMode',
            'subjectIds', 'tagIndexId', 'queryString', 'name', 'openArticlesInParentTemplate',
            'idSectionFromUrl', 'maxDocumentAge', 'showCategoryForArticle', 'pageNumber',
        )

        is_billennium_tabs = 'onclick="TB_LoadTab(' in page
        is_post_back = 'onclick="__doPostBack(' in page
        content = page
        pagination = page if is_billennium_tabs else None

        # The first page is already downloaded, so numbering starts at 2
        for page_num in itertools.count(2):
            article_links = re.findall(
                r'(?s)<article[^>]+>.*?(<a[^>]+href=["\'](?:(?:https?)?://[^/]+)?/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>',
                content)
            for anchor_html, entry_id in article_links:
                anchor = extract_attributes(anchor_html)
                if not anchor.get('href'):
                    continue
                yield self.url_result(
                    urljoin(url, anchor['href']), PolskieRadioLegacyIE, entry_id, anchor.get('title'))

            for media_json in re.findall(r'<span data-media=({[^ ]+})', content):
                media = self._parse_json(media_json, category_id)
                yield traverse_obj(media, {
                    'url': 'file',
                    'id': 'uid',
                    'duration': 'length',
                    'title': ('title', {urllib.parse.unquote}),
                    'description': ('desc', {urllib.parse.unquote}),
                })

            if is_billennium_tabs:
                # The "next" link's onclick arguments become the request payload
                params = self._search_json(
                    r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+onclick=["\']TB_LoadTab\(',
                    pagination, 'next page params', category_id, default=None, close_objects=1,
                    contains_pattern='.+', transform_source=lambda x: f'[{js_to_json(unescapeHTML(x))}')
                if not params:
                    break
                request_body = json.dumps(dict(zip(tab_param_names, params))).encode()
                tab_content = self._download_json(
                    'https://www.polskieradio.pl/CMS/TemplateBoxesManagement/TemplateBoxTabContent.aspx/GetTabContent',
                    category_id, f'Downloading page {page_num}', headers={'content-type': 'application/json'},
                    data=request_body)['d']
                content = tab_content['Content']
                pagination = tab_content.get('PagerContent')
            elif is_post_back:
                target = self._search_regex(
                    r'onclick=(?:["\'])__doPostBack\((?P<q1>["\'])(?P<target>[\w$]+)(?P=q1)\s*,\s*(?P<q2>["\'])Next(?P=q2)',
                    content, 'pagination postback target', group='target', default=None)
                if not target:
                    break
                # Re-submit the page's hidden form fields together with the "Next" event
                form_fields = {
                    **self._hidden_inputs(content),
                    '__EVENTTARGET': target,
                    '__EVENTARGUMENT': 'Next',
                }
                content = self._download_webpage(
                    url, category_id, f'Downloading page {page_num}',
                    data=urllib.parse.urlencode(form_fields).encode())
            else:
                next_href = self._search_regex(
                    r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1',
                    content, 'next page url', group='url', default=None)
                next_url = urljoin(url, next_href)
                if not next_url:
                    break
                content = self._download_webpage(next_url, category_id, f'Downloading page {page_num}')

    def _real_extract(self, url):
        """Return the category playlist, or delegate when redirected to an audition page."""
        category_id = self._match_id(url)
        webpage, urlh = self._download_webpage_handle(url, category_id)
        final_url = urlh.url
        if PolskieRadioAuditionIE.suitable(final_url):
            return self.url_result(final_url, PolskieRadioAuditionIE, category_id)
        title = self._html_search_regex(
            r'<title>([^<]+)(?: - [^<]+ - [^<]+| w [Pp]olskie[Rr]adio\.pl\s*)</title>',
            webpage, 'title', fatal=False)
        entries = self._entries(url, webpage, category_id)
        return self.playlist_result(entries, category_id, title)
446 | ||
447 | ||
class PolskieRadioPlayerIE(InfoExtractor):
    # Live station streams from the player.polskieradio.pl web player
    IE_NAME = 'polskieradio:player'
    _VALID_URL = r'https?://player\.polskieradio\.pl/anteny/(?P<id>[^/]+)'

    _BASE_URL = 'https://player.polskieradio.pl'
    _PLAYER_URL = 'https://player.polskieradio.pl/main.bundle.js'
    _STATIONS_API_URL = 'https://apipr.polskieradio.pl/api/stacje'

    _TESTS = [{
        'url': 'https://player.polskieradio.pl/anteny/trojka',
        'info_dict': {
            'id': '3',
            'ext': 'm4a',
            'title': 'Trójka',
        },
        'params': {
            'format': 'bestaudio',
            'skip_download': 'endless stream',
        },
    }]

    def _get_channel_list(self, channel_url='no_channel'):
        """Download the player bundle and return the channel list embedded in it.

        :param channel_url: channel slug, used only as the id when reporting
            the download and parsing steps.
        :returns: the parsed channel list.
        """
        player_code = self._download_webpage(
            self._PLAYER_URL, channel_url,
            note='Downloading js player')
        # The list is a JS array literal assigned right after the "anteny" route name
        channel_list = js_to_json(self._search_regex(
            r';var r="anteny",a=(\[.+?\])},', player_code, 'channel list'))
        return self._parse_json(channel_list, channel_url)

    def _real_extract(self, url):
        """Resolve the channel slug in *url* to its live stream formats.

        :raises ExtractorError: when the slug is not in the player's channel
            list, or when no station in the stations API matches the channel.
        """
        channel_url = self._match_id(url)
        channel_list = self._get_channel_list(channel_url)

        channel = next((c for c in channel_list if c.get('url') == channel_url), None)

        if not channel:
            raise ExtractorError('Channel not found')

        station_list = self._download_json(self._STATIONS_API_URL, channel_url,
                                           note='Downloading stream url list',
                                           headers={
                                               'Accept': 'application/json',
                                               'Referer': url,
                                               'Origin': self._BASE_URL,
                                           })
        # Stations are matched by name: streamName first, then the display name
        station = next((s for s in station_list
                        if s.get('Name') == (channel.get('streamName') or channel.get('name'))), None)
        if not station:
            raise ExtractorError('Station not found even though we extracted channel')

        formats = []
        for stream_url in station['Streams']:
            stream_url = self._proto_relative_url(stream_url)
            if stream_url.endswith('/playlist.m3u8'):
                formats.extend(self._extract_m3u8_formats(stream_url, channel_url, live=True))
            elif stream_url.endswith('/manifest.f4m'):
                # An .f4m manifest is HDS, so it must go to the f4m parser;
                # the MPD (DASH) parser cannot read it.
                formats.extend(self._extract_f4m_formats(stream_url, channel_url))
            elif stream_url.endswith('/Manifest'):
                formats.extend(self._extract_ism_formats(stream_url, channel_url))
            else:
                # Anything else is used as a direct stream URL
                formats.append({
                    'url': stream_url,
                })

        return {
            'id': str(channel['id']),
            'formats': formats,
            'title': channel.get('name') or channel.get('streamName'),
            'display_id': channel_url,
            'thumbnail': f'{self._BASE_URL}/images/{channel_url}-color-logo.png',
            'is_live': True,
        }
520 | ||
521 | ||
class PolskieRadioPodcastBaseExtractor(InfoExtractor):
    # Shared API root and episode parsing for the podcast extractors
    _API_BASE = 'https://apipodcasts.polskieradio.pl/api'

    def _parse_episode(self, data):
        """Convert one episode record from the podcast API into an info dict.

        ``guid``, ``url`` and ``title`` are required keys of *data*; all
        other fields are optional.
        """
        episode_id = data['guid']
        audio_format = {
            'url': data['url'],
            'filesize': int_or_none(data.get('fileSize')),
        }
        episode_title = data['title']
        return {
            'id': episode_id,
            'formats': [audio_format],
            'title': episode_title,
            'description': data.get('description'),
            'duration': int_or_none(data.get('length')),
            'timestamp': parse_iso8601(data.get('publishDate')),
            'thumbnail': url_or_none(data.get('image')),
            'series': data.get('podcastTitle'),
            # The episode name is the same as the title
            'episode': data['title'],
        }
540 | ||
541 | ||
class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseExtractor):
    IE_NAME = 'polskieradio:podcast:list'
    _VALID_URL = r'https?://podcasty\.polskieradio\.pl/podcast/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://podcasty.polskieradio.pl/podcast/8/',
        'info_dict': {
            'id': '8',
            'title': 'Śniadanie w Trójce',
            'description': 'md5:57abcc27bc4c6a6b25baa3061975b9ef',
            'uploader': 'Beata Michniewicz',
        },
        'playlist_mincount': 714,
    }]
    _PAGE_SIZE = 10

    def _call_api(self, podcast_id, page):
        """Fetch page number *page* (counted from 1) of a podcast's episode list."""
        page_url = f'{self._API_BASE}/Podcasts/{podcast_id}/?pageSize={self._PAGE_SIZE}&page={page}'
        return self._download_json(page_url, podcast_id, f'Downloading page {page}')

    def _real_extract(self, url):
        """Return the podcast as a lazily paged playlist of its episodes."""
        podcast_id = self._match_id(url)
        # The first page also carries the podcast metadata and the total item count
        data = self._call_api(podcast_id, 1)

        def get_page(page_num):
            # page_num counts from 0; page 0 reuses the response already downloaded
            page_data = data if not page_num else self._call_api(podcast_id, page_num + 1)
            for episode in page_data['items']:
                yield self._parse_episode(episode)

        page_count = math.ceil(data['itemCount'] / self._PAGE_SIZE)
        return {
            '_type': 'playlist',
            'entries': InAdvancePagedList(get_page, page_count, self._PAGE_SIZE),
            'id': str(data['id']),
            'title': data.get('title'),
            'description': data.get('description'),
            'uploader': data.get('announcer'),
        }
579 | ||
580 | ||
class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor):
    IE_NAME = 'polskieradio:podcast'
    _VALID_URL = r'https?://podcasty\.polskieradio\.pl/track/(?P<id>[a-f\d]{8}(?:-[a-f\d]{4}){4}[a-f\d]{8})'
    _TESTS = [{
        'url': 'https://podcasty.polskieradio.pl/track/6eafe403-cb8f-4756-b896-4455c3713c32',
        'info_dict': {
            'id': '6eafe403-cb8f-4756-b896-4455c3713c32',
            'ext': 'mp3',
            'title': 'Theresa May rezygnuje. Co dalej z brexitem?',
            'description': 'md5:e41c409a29d022b70ef0faa61dbded60',
            'episode': 'Theresa May rezygnuje. Co dalej z brexitem?',
            'duration': 2893,
            'thumbnail': 'https://static.prsa.pl/images/58649376-c8a0-4ba2-a714-78b383285f5f.jpg',
            'series': 'Raport o stanie świata',
        },
    }]

    def _real_extract(self, url):
        """Look up a single track by its GUID and return its info dict."""
        podcast_id = self._match_id(url)
        # The endpoint takes a JSON body with a list of GUIDs; only one is sent
        request_body = json.dumps({
            'guids': [podcast_id],
        }).encode()
        request_headers = {
            'Content-Type': 'application/json',
        }
        data = self._download_json(
            f'{self._API_BASE}/audio',
            podcast_id, 'Downloading podcast metadata',
            data=request_body, headers=request_headers)
        first_track = data[0]
        return self._parse_episode(first_track)