]>
Commit | Line | Data |
---|---|---|
1 | import itertools | |
2 | import json | |
3 | import math | |
4 | import re | |
5 | import urllib.parse | |
6 | ||
7 | from .common import InfoExtractor | |
8 | from ..compat import compat_str | |
9 | from ..utils import ( | |
10 | ExtractorError, | |
11 | InAdvancePagedList, | |
12 | determine_ext, | |
13 | extract_attributes, | |
14 | int_or_none, | |
15 | js_to_json, | |
16 | parse_iso8601, | |
17 | strip_or_none, | |
18 | traverse_obj, | |
19 | unescapeHTML, | |
20 | unified_timestamp, | |
21 | url_or_none, | |
22 | urljoin, | |
23 | ) | |
24 | ||
25 | ||
class PolskieRadioBaseExtractor(InfoExtractor):
    # Shared helper for the article extractors: converts the HTML players
    # (elements carrying a JSON ``data-media`` attribute) into entries.

    def _extract_webpage_player_entries(self, webpage, playlist_id, base_data):
        """Yield an info dict for every distinct player found in ``webpage``.

        webpage     -- HTML text searched for ``data-media`` attributes
        playlist_id -- id passed to the JSON parser for its messages
        base_data   -- dict whose items are copied into every yielded entry

        Players whose JSON cannot be parsed, players without a ``file`` or
        ``desc`` value and players repeating an already seen media URL are
        skipped.
        """
        media_urls = set()

        for data_media in re.findall(r'<[^>]+data-media="?({[^>]+})"?', webpage):
            media = self._parse_json(data_media, playlist_id, transform_source=unescapeHTML, fatal=False)
            # The parse is non-fatal, so a malformed attribute may give us no
            # object at all; skip it instead of failing on ``.get``.
            if not media or not media.get('file') or not media.get('desc'):
                continue
            media_url = self._proto_relative_url(media['file'])
            if media_url in media_urls:
                # the same recording is often embedded more than once
                continue
            media_urls.add(media_url)
            entry = base_data.copy()
            entry.update({
                'id': compat_str(media['id']),
                'url': media_url,
                'duration': int_or_none(media.get('length')),
                'vcodec': 'none' if media.get('provider') == 'audio' else None,
            })
            # ``desc`` is percent-encoded; it overrides the title from
            # ``base_data`` only when it decodes to a non-empty string
            entry_title = urllib.parse.unquote(media['desc'])
            if entry_title:
                entry['title'] = entry_title
            yield entry
49 | ||
50 | ||
class PolskieRadioLegacyIE(PolskieRadioBaseExtractor):
    # articles served by the legacy frontend
    IE_NAME = 'polskieradio:legacy'
    _VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/[Aa]rtykul/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.polskieradio.pl/8/2382/Artykul/2534482,Zagarysci-Poezja-jak-spoiwo',
        'info_dict': {
            'id': '2534482',
            'title': 'Żagaryści. Poezja jak spoiwo',
            'description': 'md5:f18d95d5dcba747a09b635e21a4c0695',
        },
        'playlist': [{
            'md5': 'd07559829f61d5a93a75755987ded760',
            'info_dict': {
                'id': '2516679',
                'ext': 'mp3',
                'title': 'md5:c6e1234e0b747ad883cb91b7ad06b98c',
                'timestamp': 1592654400,
                'upload_date': '20200620',
                'duration': 1430,
                'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$'
            },
        }],
    }, {
        # PR4 audition - other frontend
        'url': 'https://www.polskieradio.pl/10/6071/Artykul/2610977,Poglos-29-pazdziernika-godz-2301',
        'info_dict': {
            'id': '2610977',
            'ext': 'mp3',
            'title': 'Pogłos 29 października godz. 23:01',
        },
    }, {
        'url': 'https://polskieradio24.pl/130/4503/Artykul/2621876,Narusza-nasza-suwerennosc-Publicysci-o-uzaleznieniu-funduszy-UE-od-praworzadnosci',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract a legacy article as a playlist, or as a single recording.

        When the request ends up on a URL handled by PolskieRadioIE the
        extraction is delegated to that extractor. Otherwise the article
        body is scanned for players; pages without an article body are
        treated as a single recording taken from the inline player source.
        """
        playlist_id = self._match_id(url)

        webpage, urlh = self._download_webpage_handle(url, playlist_id)
        final_url = urlh.url
        if PolskieRadioIE.suitable(final_url):
            return self.url_result(final_url, PolskieRadioIE, playlist_id)

        article_html = self._search_regex(
            r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>',
            webpage, 'content', default=None)

        raw_datetime = self._html_search_regex(
            r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>',
            webpage, 'timestamp', default=None)
        timestamp = unified_timestamp(raw_datetime)

        thumbnail_url = self._og_search_thumbnail(webpage, default=None)

        title = self._og_search_title(webpage).strip()

        description = strip_or_none(self._og_search_description(webpage, default=None))
        if description is not None:
            # replace non-breaking spaces with regular ones
            description = description.replace('\xa0', ' ')

        if article_html:
            shared_fields = {
                'title': title,
                'timestamp': timestamp,
                'thumbnail': thumbnail_url,
            }
            entries = self._extract_webpage_player_entries(article_html, playlist_id, shared_fields)
            return self.playlist_result(entries, playlist_id, title, description)

        # no article body: the page carries one recording in its player setup
        record_url = self._search_regex(
            r"source:\s*'(//static\.prsa\.pl/[^']+)'",
            webpage, 'audition record url')
        return {
            'id': playlist_id,
            'url': self._proto_relative_url(record_url),
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'thumbnail': thumbnail_url,
        }
129 | ||
130 | ||
class PolskieRadioIE(PolskieRadioBaseExtractor):
    # new next.js sites
    _VALID_URL = r'https?://(?:[^/]+\.)?(?:polskieradio(?:24)?|radiokierowcow)\.pl/artykul/(?P<id>\d+)'
    _TESTS = [{
        # articleData, attachments
        'url': 'https://jedynka.polskieradio.pl/artykul/1587943',
        'info_dict': {
            'id': '1587943',
            'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie',
            'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5',
        },
        'playlist': [{
            'md5': '2984ee6ce9046d91fc233bc1a864a09a',
            'info_dict': {
                'id': '7a85d429-5356-4def-a347-925e4ae7406b',
                'ext': 'mp3',
                'title': 'md5:d4623290d4ac983bf924061c75c23a0d',
            },
        }],
    }, {
        # post, legacy html players
        'url': 'https://trojka.polskieradio.pl/artykul/2589163,Czy-wciaz-otrzymujemy-zdjecia-z-sond-Voyager',
        'info_dict': {
            'id': '2589163',
            'title': 'Czy wciąż otrzymujemy zdjęcia z sond Voyager?',
            'description': 'md5:cf1a7f348d63a2db9c0d7a63d1669473',
        },
        'playlist': [{
            'info_dict': {
                'id': '2577880',
                'ext': 'mp3',
                'title': 'md5:a57d10a0c02abd34dd675cb33707ad5a',
                'duration': 321,
            },
        }],
    }, {
        # data, legacy
        'url': 'https://radiokierowcow.pl/artykul/2694529',
        'info_dict': {
            'id': '2694529',
            'title': 'Zielona fala reliktem przeszłości?',
            'description': 'md5:f20a9a7ed9cb58916c54add94eae3bc0',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://trojka.polskieradio.pl/artykul/1632955',
        'only_matching': True,
    }, {
        # with mp4 video
        'url': 'https://trojka.polskieradio.pl/artykul/1634903',
        'only_matching': True,
    }, {
        'url': 'https://jedynka.polskieradio.pl/artykul/3042436,Polityka-wschodnia-ojca-i-syna-Wladyslawa-Lokietka-i-Kazimierza-Wielkiego',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the recordings attached to an article as a playlist.

        The article object is read from the page's next.js data, where it is
        looked up under ``data.articleData``, ``post`` or ``data`` inside
        ``props.pageProps``. Audio attachments become the entries; when there
        are none, the players embedded in the article body are used instead.

        Raises ExtractorError if no article object is present in the page data.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        article_data = traverse_obj(
            self._search_nextjs_data(webpage, playlist_id), (
                'props', 'pageProps', (('data', 'articleData'), 'post', 'data')), get_all=False)
        if not article_data:
            # None of the known locations matched; fail with a clear message
            # rather than a TypeError on the subscript below.
            raise ExtractorError('Unable to find article data')

        title = strip_or_none(article_data['title'])

        description = strip_or_none(article_data.get('lead'))

        entries = [{
            'url': entry['file'],
            'ext': determine_ext(entry.get('fileName')),
            'id': self._search_regex(
                r'([a-f\d]{8}-(?:[a-f\d]{4}-){3}[a-f\d]{12})', entry['file'], 'entry id'),
            'title': strip_or_none(entry.get('description')) or title,
        } for entry in article_data.get('attachments') or () if entry.get('fileType') in ('Audio', )]

        if not entries:
            # some legacy articles have no json attachments, but players in body;
            # a missing or empty body simply results in an empty playlist
            entries = self._extract_webpage_player_entries(article_data.get('content') or '', playlist_id, {
                'title': title,
            })

        return self.playlist_result(entries, playlist_id, title, description)
215 | ||
216 | ||
class PolskieRadioAuditionIE(InfoExtractor):
    # audition (programme) pages of the new next.js sites
    IE_NAME = 'polskieradio:audition'
    _VALID_URL = r'https?://(?:[^/]+\.)?polskieradio\.pl/audycj[ae]/(?P<id>\d+)'
    _TESTS = [{
        # articles, PR1
        'url': 'https://jedynka.polskieradio.pl/audycje/5102',
        'info_dict': {
            'id': '5102',
            'title': 'Historia żywa',
            'thumbnail': r're:https://static\.prsa\.pl/images/.+',
        },
        'playlist_mincount': 38,
    }, {
        # episodes, PR1
        'url': 'https://jedynka.polskieradio.pl/audycje/5769',
        'info_dict': {
            'id': '5769',
            'title': 'AgroFakty',
            'thumbnail': r're:https://static\.prsa\.pl/images/.+',
        },
        'playlist_mincount': 269,
    }, {
        # both episodes and articles, PR3
        'url': 'https://trojka.polskieradio.pl/audycja/8906',
        'info_dict': {
            'id': '8906',
            'title': 'Trójka budzi',
            'thumbnail': r're:https://static\.prsa\.pl/images/.+',
        },
        'playlist_mincount': 722,
    }, {
        # some articles were "promoted to main page" and thus link to old frontend
        'url': 'https://trojka.polskieradio.pl/audycja/305',
        'info_dict': {
            'id': '305',
            'title': 'Co w mowie piszczy?',
            'thumbnail': r're:https://static\.prsa\.pl/images/.+',
        },
        'playlist_count': 1523,
    }]

    def _call_lp3(self, path, query, video_id, note):
        """Request ``path`` from the lp3 API with ``query`` and return the JSON."""
        endpoint = f'https://lp3test.polskieradio.pl/{path}'
        api_headers = {'x-api-key': '9bf6c5a2-a7d0-4980-9ed7-a3f7291f2a81'}
        return self._download_json(
            endpoint, video_id, note, query=query, headers=api_headers)

    def _entries(self, playlist_id, has_episodes, has_articles):
        """Yield episode entries first, then article entries.

        Each listing is requested page after page until a response carries
        no ``data``; a listing is skipped entirely when its flag is false.
        """
        if has_episodes:
            for page_idx in itertools.count(0):
                episode_query = {
                    'categoryId': playlist_id,
                    'PageSize': 10,
                    'skip': page_idx,
                    'format': 400,
                }
                listing = self._call_lp3(
                    'AudioArticle/GetListByCategoryId', episode_query,
                    playlist_id, f'Downloading episode list page {page_idx + 1}')
                if not traverse_obj(listing, 'data'):
                    break
                for episode in listing['data']:
                    yield {
                        'id': str(episode['id']),
                        'url': episode['file'],
                        'title': episode.get('title'),
                        'duration': int_or_none(episode.get('duration')),
                        'timestamp': parse_iso8601(episode.get('datePublic')),
                    }

        if has_articles:
            for page_idx in itertools.count(0):
                article_query = {
                    'categoryId': playlist_id,
                    'PageSize': 9,
                    'skip': page_idx,
                    'format': 400,
                }
                listing = self._call_lp3(
                    'Article/GetListByCategoryId', article_query,
                    playlist_id, f'Downloading article list page {page_idx + 1}')
                if not traverse_obj(listing, 'data'):
                    break
                for article in listing['data']:
                    # articles are resolved by whichever extractor handles their url
                    yield {
                        '_type': 'url_transparent',
                        'id': str(article['id']),
                        'url': article['url'],
                        'title': article.get('shortTitle'),
                        'description': traverse_obj(article, ('description', 'lead')),
                        'timestamp': parse_iso8601(article.get('datePublic')),
                    }

    def _real_extract(self, url):
        """Build a playlist of the audition's episodes and/or articles."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)
        nextjs_data = self._search_nextjs_data(webpage, playlist_id)
        page_props = traverse_obj(
            nextjs_data, ('props', 'pageProps', ('data', None)), get_all=False)

        # the page data tells which of the two listings exist for this audition
        has_episodes = bool(traverse_obj(page_props, 'episodes', 'audios'))
        has_articles = bool(traverse_obj(page_props, 'articles'))

        entries = self._entries(playlist_id, has_episodes, has_articles)
        return self.playlist_result(
            entries, playlist_id,
            title=traverse_obj(page_props, ('details', 'name')),
            description=traverse_obj(page_props, ('details', 'description', 'lead')),
            thumbnail=traverse_obj(page_props, ('details', 'photo')))
319 | ||
320 | ||
class PolskieRadioCategoryIE(InfoExtractor):
    # legacy sites
    # Category and tag listings; entries are collected page by page using
    # whichever of three pagination mechanisms the listing page exposes.
    IE_NAME = 'polskieradio:category'
    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/(?:\d+(?:,[^/]+)?/|[^/]+/Tag)(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow',
        'info_dict': {
            'id': '4143',
            'title': 'Kierunek Kraków',
        },
        'playlist_mincount': 61
    }, {
        'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka',
        'info_dict': {
            'id': '214',
            'title': 'Muzyka',
        },
        'playlist_mincount': 61
    }, {
        # billennium tabs
        'url': 'https://www.polskieradio.pl/8/2385',
        'info_dict': {
            'id': '2385',
            'title': 'Droga przez mąkę',
        },
        'playlist_mincount': 111,
    }, {
        'url': 'https://www.polskieradio.pl/10/4930',
        'info_dict': {
            'id': '4930',
            'title': 'Teraz K-pop!',
        },
        'playlist_mincount': 392,
    }, {
        # post back pages, audio content directly without articles
        'url': 'https://www.polskieradio.pl/8,dwojka/7376,nowa-mowa',
        'info_dict': {
            'id': '7376',
            'title': 'Nowa mowa',
        },
        'playlist_mincount': 244,
    }, {
        'url': 'https://www.polskieradio.pl/Krzysztof-Dziuba/Tag175458',
        'info_dict': {
            'id': '175458',
            'title': 'Krzysztof Dziuba',
        },
        'playlist_mincount': 420,
    }, {
        'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs that PolskieRadioLegacyIE accepts; otherwise defer to the default check."""
        return False if PolskieRadioLegacyIE.suitable(url) else super().suitable(url)

    def _entries(self, url, page, category_id):
        """Yield the entries of the listing, following its pagination.

        url         -- URL of the listing, used as the base for relative links
                       and as the target of post back requests
        page        -- HTML of the first (already downloaded) listing page
        category_id -- id passed to the download/parse helpers

        Article links are yielded as url results for PolskieRadioLegacyIE;
        ``<span data-media=...>`` players are yielded as direct entries.
        """
        content = page
        # The pagination mechanism is detected once, from the first page
        is_billennium_tabs = 'onclick="TB_LoadTab(' in page
        is_post_back = 'onclick="__doPostBack(' in page
        # For tabs the pager markup is tracked separately from the content,
        # because later pages return the two as distinct response fields
        pagination = page if is_billennium_tabs else None
        # Counting starts at 2: the first page was passed in as ``page``;
        # page_num is only used in the download notes
        for page_num in itertools.count(2):
            for a_entry, entry_id in re.findall(
                    r'(?s)<article[^>]+>.*?(<a[^>]+href=["\'](?:(?:https?)?://[^/]+)?/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>',
                    content):
                entry = extract_attributes(a_entry)
                if entry.get('href'):
                    yield self.url_result(
                        urljoin(url, entry['href']), PolskieRadioLegacyIE, entry_id, entry.get('title'))
            for a_entry in re.findall(r'<span data-media=({[^ ]+})', content):
                yield traverse_obj(self._parse_json(a_entry, category_id), {
                    'url': 'file',
                    'id': 'uid',
                    'duration': 'length',
                    'title': ('title', {urllib.parse.unquote}),
                    'description': ('desc', {urllib.parse.unquote}),
                })
            if is_billennium_tabs:
                # The arguments of the "next" link's TB_LoadTab( call are read
                # as a list: '[' is prepended to the matched text before parsing
                # NOTE(review): close_objects=1 presumably lets the parser close
                # that list itself - confirm against _search_json
                params = self._search_json(
                    r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+onclick=["\']TB_LoadTab\(',
                    pagination, 'next page params', category_id, default=None, close_objects=1,
                    contains_pattern='.+', transform_source=lambda x: '[%s' % js_to_json(unescapeHTML(x)))
                if not params:
                    break
                # The positional arguments are paired with these names, in
                # order, and sent as the JSON body of the request
                tab_content = self._download_json(
                    'https://www.polskieradio.pl/CMS/TemplateBoxesManagement/TemplateBoxTabContent.aspx/GetTabContent',
                    category_id, f'Downloading page {page_num}', headers={'content-type': 'application/json'},
                    data=json.dumps(dict(zip((
                        'boxInstanceId', 'tabId', 'categoryType', 'sectionId', 'categoryId', 'pagerMode',
                        'subjectIds', 'tagIndexId', 'queryString', 'name', 'openArticlesInParentTemplate',
                        'idSectionFromUrl', 'maxDocumentAge', 'showCategoryForArticle', 'pageNumber'
                    ), params))).encode())['d']
                content, pagination = tab_content['Content'], tab_content.get('PagerContent')
            elif is_post_back:
                # Find the control name used by the "Next" post back link
                target = self._search_regex(
                    r'onclick=(?:["\'])__doPostBack\((?P<q1>["\'])(?P<target>[\w$]+)(?P=q1)\s*,\s*(?P<q2>["\'])Next(?P=q2)',
                    content, 'pagination postback target', group='target', default=None)
                if not target:
                    break
                # Re-submit the current page's hidden form inputs together
                # with the event fields to the same URL
                content = self._download_webpage(
                    url, category_id, f'Downloading page {page_num}',
                    data=urllib.parse.urlencode({
                        **self._hidden_inputs(content),
                        '__EVENTTARGET': target,
                        '__EVENTARGUMENT': 'Next',
                    }).encode())
            else:
                # Plain listing: follow the href of the "next" link, if any
                next_url = urljoin(url, self._search_regex(
                    r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1',
                    content, 'next page url', group='url', default=None))
                if not next_url:
                    break
                content = self._download_webpage(next_url, category_id, f'Downloading page {page_num}')

    def _real_extract(self, url):
        """Return the listing as a playlist.

        If the request ends up on a URL handled by PolskieRadioAuditionIE,
        the extraction is delegated to that extractor instead.
        """
        category_id = self._match_id(url)
        webpage, urlh = self._download_webpage_handle(url, category_id)
        if PolskieRadioAuditionIE.suitable(urlh.url):
            return self.url_result(urlh.url, PolskieRadioAuditionIE, category_id)
        # The title is the leading part of <title>, without the site suffix
        title = self._html_search_regex(
            r'<title>([^<]+)(?: - [^<]+ - [^<]+| w [Pp]olskie[Rr]adio\.pl\s*)</title>',
            webpage, 'title', fatal=False)
        return self.playlist_result(
            self._entries(url, webpage, category_id),
            category_id, title)
447 | ||
448 | ||
class PolskieRadioPlayerIE(InfoExtractor):
    # live streams of the stations listed in the web player
    IE_NAME = 'polskieradio:player'
    _VALID_URL = r'https?://player\.polskieradio\.pl/anteny/(?P<id>[^/]+)'

    _BASE_URL = 'https://player.polskieradio.pl'
    _PLAYER_URL = 'https://player.polskieradio.pl/main.bundle.js'
    _STATIONS_API_URL = 'https://apipr.polskieradio.pl/api/stacje'

    _TESTS = [{
        'url': 'https://player.polskieradio.pl/anteny/trojka',
        'info_dict': {
            'id': '3',
            'ext': 'm4a',
            'title': 'Trójka',
        },
        'params': {
            'format': 'bestaudio',
            'skip_download': 'endless stream',
        },
    }]

    def _get_channel_list(self, channel_url='no_channel'):
        """Return the channel list embedded in the player's JavaScript bundle.

        ``channel_url`` is only used as the id for the download and parse
        helpers.
        """
        player_code = self._download_webpage(
            self._PLAYER_URL, channel_url,
            note='Downloading js player')
        channel_list = js_to_json(self._search_regex(
            r';var r="anteny",a=(\[.+?\])},', player_code, 'channel list'))
        return self._parse_json(channel_list, channel_url)

    def _real_extract(self, url):
        """Extract the live stream formats of the channel named in ``url``.

        The channel is looked up in the player's channel list by its ``url``
        value, then matched by name against the stations API to obtain the
        stream URLs.

        Raises ExtractorError if the channel or its station cannot be found.
        """
        channel_url = self._match_id(url)
        channel_list = self._get_channel_list(channel_url)

        channel = next((c for c in channel_list if c.get('url') == channel_url), None)

        if not channel:
            raise ExtractorError('Channel not found')

        station_list = self._download_json(self._STATIONS_API_URL, channel_url,
                                           note='Downloading stream url list',
                                           headers={
                                               'Accept': 'application/json',
                                               'Referer': url,
                                               'Origin': self._BASE_URL,
                                           })
        station = next((s for s in station_list
                        if s.get('Name') == (channel.get('streamName') or channel.get('name'))), None)
        if not station:
            raise ExtractorError('Station not found even though we extracted channel')

        formats = []
        for stream_url in station['Streams']:
            stream_url = self._proto_relative_url(stream_url)
            # pick the manifest parser from the URL suffix
            if stream_url.endswith('/playlist.m3u8'):
                formats.extend(self._extract_m3u8_formats(stream_url, channel_url, live=True))
            elif stream_url.endswith('/manifest.f4m'):
                # .f4m is an Adobe HDS manifest, so it needs the f4m parser
                # (it was previously handed to the DASH/MPD parser)
                formats.extend(self._extract_f4m_formats(stream_url, channel_url))
            elif stream_url.endswith('/Manifest'):
                formats.extend(self._extract_ism_formats(stream_url, channel_url))
            else:
                # anything else is used as a direct stream URL
                formats.append({
                    'url': stream_url,
                })

        return {
            'id': compat_str(channel['id']),
            'formats': formats,
            'title': channel.get('name') or channel.get('streamName'),
            'display_id': channel_url,
            'thumbnail': f'{self._BASE_URL}/images/{channel_url}-color-logo.png',
            'is_live': True,
        }
521 | ||
522 | ||
class PolskieRadioPodcastBaseExtractor(InfoExtractor):
    # common parts of the podcast extractors
    _API_BASE = 'https://apipodcasts.polskieradio.pl/api'

    def _parse_episode(self, data):
        """Convert one episode object from the podcasts API into an info dict.

        ``guid``, ``url`` and ``title`` are required keys of ``data``; all
        other fields are optional.
        """
        episode_guid = data['guid']
        media_format = {
            'url': data['url'],
            'filesize': int_or_none(data.get('fileSize')),
        }
        # the episode title doubles as the entry title
        episode_title = data['title']

        info = {
            'id': episode_guid,
            'formats': [media_format],
            'title': episode_title,
        }
        info['description'] = data.get('description')
        info['duration'] = int_or_none(data.get('length'))
        info['timestamp'] = parse_iso8601(data.get('publishDate'))
        info['thumbnail'] = url_or_none(data.get('image'))
        info['series'] = data.get('podcastTitle')
        info['episode'] = episode_title
        return info
541 | ||
542 | ||
class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseExtractor):
    # a whole podcast, returned as a paged playlist of its episodes
    IE_NAME = 'polskieradio:podcast:list'
    _VALID_URL = r'https?://podcasty\.polskieradio\.pl/podcast/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://podcasty.polskieradio.pl/podcast/8/',
        'info_dict': {
            'id': '8',
            'title': 'Śniadanie w Trójce',
            'description': 'md5:57abcc27bc4c6a6b25baa3061975b9ef',
            'uploader': 'Beata Michniewicz',
        },
        'playlist_mincount': 714,
    }]
    _PAGE_SIZE = 10

    def _call_api(self, podcast_id, page):
        """Download page number ``page`` of the podcast's episode listing."""
        listing_url = f'{self._API_BASE}/Podcasts/{podcast_id}/?pageSize={self._PAGE_SIZE}&page={page}'
        return self._download_json(listing_url, podcast_id, f'Downloading page {page}')

    def _real_extract(self, url):
        """Return the podcast as a playlist whose pages are fetched on demand."""
        podcast_id = self._match_id(url)
        first_page = self._call_api(podcast_id, 1)

        def get_page(page_num):
            # page_num counts from zero while the API counts from one; the
            # first page is already at hand and is not requested again
            if page_num:
                page_data = self._call_api(podcast_id, page_num + 1)
            else:
                page_data = first_page
            for episode in page_data['items']:
                yield self._parse_episode(episode)

        page_count = math.ceil(first_page['itemCount'] / self._PAGE_SIZE)
        episode_pages = InAdvancePagedList(get_page, page_count, self._PAGE_SIZE)

        return {
            '_type': 'playlist',
            'entries': episode_pages,
            'id': str(first_page['id']),
            'title': first_page.get('title'),
            'description': first_page.get('description'),
            'uploader': first_page.get('announcer'),
        }
580 | ||
581 | ||
class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor):
    # a single podcast episode, addressed by its guid
    IE_NAME = 'polskieradio:podcast'
    _VALID_URL = r'https?://podcasty\.polskieradio\.pl/track/(?P<id>[a-f\d]{8}(?:-[a-f\d]{4}){4}[a-f\d]{8})'
    _TESTS = [{
        'url': 'https://podcasty.polskieradio.pl/track/6eafe403-cb8f-4756-b896-4455c3713c32',
        'info_dict': {
            'id': '6eafe403-cb8f-4756-b896-4455c3713c32',
            'ext': 'mp3',
            'title': 'Theresa May rezygnuje. Co dalej z brexitem?',
            'description': 'md5:e41c409a29d022b70ef0faa61dbded60',
            'episode': 'Theresa May rezygnuje. Co dalej z brexitem?',
            'duration': 2893,
            'thumbnail': 'https://static.prsa.pl/images/58649376-c8a0-4ba2-a714-78b383285f5f.jpg',
            'series': 'Raport o stanie świata',
        },
    }]

    def _real_extract(self, url):
        """Fetch the episode's metadata from the podcasts API and parse it.

        The guid from the URL is posted to the ``audio`` endpoint as a JSON
        list of guids; the first item of the response is the episode.

        Raises ExtractorError if the response contains no items.
        """
        podcast_id = self._match_id(url)
        data = self._download_json(
            f'{self._API_BASE}/audio',
            podcast_id, 'Downloading podcast metadata',
            data=json.dumps({
                'guids': [podcast_id],
            }).encode('utf-8'),
            headers={
                'Content-Type': 'application/json',
            })
        if not data:
            # an empty response would otherwise surface as an IndexError
            raise ExtractorError('Podcast episode not found', expected=True)
        return self._parse_episode(data[0])