]>
Commit | Line | Data |
---|---|---|
84a18e9b | 1 | import itertools |
89fcdff5 LL |
2 | import json |
3 | import math | |
0463b77a | 4 | import re |
2d185706 | 5 | |
0463b77a S |
6 | from .common import InfoExtractor |
7 | from ..compat import ( | |
8 | compat_str, | |
9 | compat_urllib_parse_unquote, | |
8d3737cd | 10 | compat_urlparse |
0463b77a S |
11 | ) |
12 | from ..utils import ( | |
84a18e9b | 13 | extract_attributes, |
89fcdff5 LL |
14 | ExtractorError, |
15 | InAdvancePagedList, | |
0463b77a | 16 | int_or_none, |
89fcdff5 LL |
17 | js_to_json, |
18 | parse_iso8601, | |
0463b77a S |
19 | strip_or_none, |
20 | unified_timestamp, | |
9a133454 | 21 | unescapeHTML, |
89fcdff5 | 22 | url_or_none, |
0463b77a | 23 | ) |
2d185706 JAW |
24 | |
25 | ||
89fcdff5 LL |
class PolskieRadioBaseExtractor(InfoExtractor):
    # Shared helper for extractors whose pages embed players through
    # ``data-media`` HTML attributes.

    def _extract_webpage_player_entries(self, webpage, playlist_id, base_data):
        """Yield one info dict per distinct ``data-media`` player in *webpage*.

        webpage     -- HTML fragment scanned for ``data-media="{...}"`` attributes
        playlist_id -- id passed to ``_parse_json`` for its messages
        base_data   -- dict copied into every entry before the per-media
                       fields ('id', 'url', 'duration', 'vcodec', 'title')
                       are applied on top

        Players without a 'file' or 'desc' field are skipped, and so are
        players whose media URL was already yielded.
        """
        media_urls = set()

        for data_media in re.findall(r'<[^>]+data-media="?({[^>]+})"?', webpage):
            media = self._parse_json(data_media, playlist_id, transform_source=unescapeHTML, fatal=False)
            # The payload is parsed with fatal=False, so a malformed attribute
            # is expected to come back as a falsy value instead of raising;
            # skip it rather than crashing on ``.get`` below.
            if not media or not media.get('file') or not media.get('desc'):
                continue
            media_url = self._proto_relative_url(media['file'])
            if media_url in media_urls:
                continue
            media_urls.add(media_url)
            entry = base_data.copy()
            entry.update({
                'id': compat_str(media['id']),
                'url': media_url,
                'duration': int_or_none(media.get('length')),
                'vcodec': 'none' if media.get('provider') == 'audio' else None,
            })
            # 'desc' is URL-quoted in the attribute; keep the title from
            # base_data when it decodes to an empty string.
            entry_title = compat_urllib_parse_unquote(media['desc'])
            if entry_title:
                entry['title'] = entry_title
            yield entry
49 | ||
50 | ||
class PolskieRadioIE(PolskieRadioBaseExtractor):
    _VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
    _TESTS = [{  # Old-style single broadcast.
        'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie',
        'info_dict': {
            'id': '1587943',
            'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie',
            'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5',
        },
        'playlist': [{
            'md5': '2984ee6ce9046d91fc233bc1a864a09a',
            'info_dict': {
                'id': '1540576',
                'ext': 'mp3',
                'title': 'md5:d4623290d4ac983bf924061c75c23a0d',
                'timestamp': 1456594200,
                'upload_date': '20160227',
                'duration': 2364,
                'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$',
            },
        }],
    }, {  # New-style single broadcast.
        'url': 'https://www.polskieradio.pl/8/2382/Artykul/2534482,Zagarysci-Poezja-jak-spoiwo',
        'info_dict': {
            'id': '2534482',
            'title': 'Żagaryści. Poezja jak spoiwo',
            'description': 'md5:f18d95d5dcba747a09b635e21a4c0695',
        },
        'playlist': [{
            'md5': 'd07559829f61d5a93a75755987ded760',
            'info_dict': {
                'id': '2516679',
                'ext': 'mp3',
                'title': 'md5:c6e1234e0b747ad883cb91b7ad06b98c',
                'timestamp': 1592654400,
                'upload_date': '20200620',
                'duration': 1430,
                'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$',
            },
        }],
    }, {
        # PR4 audition - other frontend
        'url': 'https://www.polskieradio.pl/10/6071/Artykul/2610977,Poglos-29-pazdziernika-godz-2301',
        'info_dict': {
            'id': '2610977',
            'ext': 'mp3',
            'title': 'Pogłos 29 października godz. 23:01',
        },
    }, {
        'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis',
        'only_matching': True,
    }, {
        'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943',
        'only_matching': True,
    }, {
        # with mp4 video
        'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej',
        'only_matching': True,
    }, {
        'url': 'https://polskieradio24.pl/130/4503/Artykul/2621876,Narusza-nasza-suwerennosc-Publicysci-o-uzaleznieniu-funduszy-UE-od-praworzadnosci',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a playlist of the article's players, or a single item.

        When the article body block cannot be located, the page is treated
        as a single recording whose URL is read from an inline
        ``source: '...'`` assignment.
        """
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        # Article body: everything between the "this-article" div and the tags div.
        article_html = self._search_regex(
            r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>',
            webpage, 'content', default=None)

        raw_datetime = self._html_search_regex(
            r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>',
            webpage, 'timestamp', default=None)
        timestamp = unified_timestamp(raw_datetime)

        thumbnail_url = self._og_search_thumbnail(webpage, default=None)
        title = self._og_search_title(webpage).strip()

        description = strip_or_none(self._og_search_description(webpage, default=None))
        if description is not None:
            # Replace non-breaking spaces with plain spaces.
            description = description.replace('\xa0', ' ')

        if not article_html:
            record_url = self._proto_relative_url(self._search_regex(
                r"source:\s*'(//static\.prsa\.pl/[^']+)'",
                webpage, 'audition record url'))
            return {
                'id': playlist_id,
                'url': record_url,
                'title': title,
                'description': description,
                'timestamp': timestamp,
                'thumbnail': thumbnail_url,
            }

        shared_fields = {
            'title': title,
            'timestamp': timestamp,
            'thumbnail': thumbnail_url,
        }
        entries = self._extract_webpage_player_entries(
            article_html, playlist_id, shared_fields)

        return self.playlist_result(entries, playlist_id, title, description)
84a18e9b S |
154 | |
155 | ||
class PolskieRadioCategoryIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA',
        'info_dict': {
            'id': '5102',
            'title': 'HISTORIA ŻYWA',
        },
        'playlist_mincount': 38,
    }, {
        'url': 'http://www.polskieradio.pl/7/4807',
        'info_dict': {
            'id': '4807',
            'title': 'Vademecum 1050. rocznicy Chrztu Polski',
        },
        'playlist_mincount': 5,
    }, {
        'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source',
        'only_matching': True,
    }, {
        'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow',
        'info_dict': {
            'id': '4143',
            'title': 'Kierunek Kraków',
        },
        'playlist_mincount': 61,
    }, {
        'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka',
        'info_dict': {
            'id': '214',
            'title': 'Muzyka',
        },
        'playlist_mincount': 61,
    }, {
        'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA',
        'only_matching': True,
    }, {
        'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Match category URLs, deferring to PolskieRadioIE for article URLs."""
        if PolskieRadioIE.suitable(url):
            return False
        return super().suitable(url)

    def _entries(self, url, page, category_id):
        """Yield a url_result per article link, following "next" links.

        *page* is the already-downloaded first page; further pages are
        fetched on demand and numbered from 2 in the progress note.
        """
        article_re = r'(?s)<article[^>]+>.*?(<a[^>]+href=["\']/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>'
        next_re = r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1'

        html = page
        for upcoming_page in itertools.count(2):
            for match in re.finditer(article_re, html):
                anchor = extract_attributes(match.group(1))
                href = anchor.get('href')
                if href:
                    yield self.url_result(
                        compat_urlparse.urljoin(url, href), PolskieRadioIE.ie_key(),
                        match.group(2), anchor.get('title'))

            next_link = re.search(next_re, html)
            if next_link is None:
                # No "next" link on this page: pagination is finished.
                return
            html = self._download_webpage(
                compat_urlparse.urljoin(url, next_link.group('url')),
                category_id, 'Downloading page %s' % upcoming_page)

    def _real_extract(self, url):
        """Return the category as a lazily evaluated playlist."""
        category_id = self._match_id(url)
        webpage = self._download_webpage(url, category_id)
        # The playlist title is the first of three " - "-separated parts of <title>.
        title = self._html_search_regex(
            r'<title>([^<]+) - [^<]+ - [^<]+</title>',
            webpage, 'title', fatal=False)
        entries = self._entries(url, webpage, category_id)
        return self.playlist_result(entries, category_id, title)
89fcdff5 LL |
232 | |
233 | ||
class PolskieRadioPlayerIE(InfoExtractor):
    IE_NAME = 'polskieradio:player'
    _VALID_URL = r'https?://player\.polskieradio\.pl/anteny/(?P<id>[^/]+)'

    _BASE_URL = 'https://player.polskieradio.pl'
    _PLAYER_URL = 'https://player.polskieradio.pl/main.bundle.js'
    _STATIONS_API_URL = 'https://apipr.polskieradio.pl/api/stacje'

    _TESTS = [{
        'url': 'https://player.polskieradio.pl/anteny/trojka',
        'info_dict': {
            'id': '3',
            'ext': 'm4a',
            'title': 'Trójka',
        },
        'params': {
            'format': 'bestaudio',
            'skip_download': 'endless stream',
        },
    }]

    def _get_channel_list(self, channel_url='no_channel'):
        """Download the player bundle and return the channel list embedded in it.

        *channel_url* is only used as the id shown in download and parse
        messages.
        """
        player_code = self._download_webpage(
            self._PLAYER_URL, channel_url,
            note='Downloading js player')
        # The list is a JS array literal inside the bundle; convert it to
        # JSON before parsing.
        channel_list = js_to_json(self._search_regex(
            r';var r="anteny",a=(\[.+?\])},', player_code, 'channel list'))
        return self._parse_json(channel_list, channel_url)

    def _real_extract(self, url):
        """Resolve a player channel URL to its live stream formats.

        Raises ExtractorError when the channel slug is absent from the
        player's channel list, or when no station in the stations API
        matches the channel's name.
        """
        channel_url = self._match_id(url)
        channel_list = self._get_channel_list(channel_url)

        channel = next((c for c in channel_list if c.get('url') == channel_url), None)

        if not channel:
            raise ExtractorError('Channel not found')

        station_list = self._download_json(self._STATIONS_API_URL, channel_url,
                                           note='Downloading stream url list',
                                           headers={
                                               'Accept': 'application/json',
                                               'Referer': url,
                                               'Origin': self._BASE_URL,
                                           })
        # Stations are matched by name; 'streamName' takes precedence over 'name'.
        station = next((s for s in station_list
                        if s.get('Name') == (channel.get('streamName') or channel.get('name'))), None)
        if not station:
            raise ExtractorError('Station not found even though we extracted channel')

        formats = []
        for stream_url in station['Streams']:
            stream_url = self._proto_relative_url(stream_url)
            if stream_url.endswith('/playlist.m3u8'):
                formats.extend(self._extract_m3u8_formats(stream_url, channel_url, live=True))
            elif stream_url.endswith('/manifest.f4m'):
                # An .f4m manifest is Adobe HDS, not MPEG-DASH, so it has to
                # go through the f4m parser rather than the MPD one.
                formats.extend(self._extract_f4m_formats(stream_url, channel_url))
            elif stream_url.endswith('/Manifest'):
                formats.extend(self._extract_ism_formats(stream_url, channel_url))
            else:
                # Any other URL is offered as a direct stream.
                formats.append({
                    'url': stream_url,
                })

        return {
            'id': compat_str(channel['id']),
            'formats': formats,
            'title': channel.get('name') or channel.get('streamName'),
            'display_id': channel_url,
            'thumbnail': f'{self._BASE_URL}/images/{channel_url}-color-logo.png',
            'is_live': True,
        }
306 | ||
307 | ||
class PolskieRadioPodcastBaseExtractor(InfoExtractor):
    _API_BASE = 'https://apipodcasts.polskieradio.pl/api'

    def _parse_episode(self, data):
        """Map one episode object from the podcasts API to an info dict.

        'guid', 'url' and 'title' are required keys of *data*; every other
        field is optional.
        """
        episode_id = data['guid']
        audio_format = {
            'url': data['url'],
            'filesize': int_or_none(data.get('fileSize')),
        }
        episode_title = data['title']
        return {
            'id': episode_id,
            'formats': [audio_format],
            'title': episode_title,
            'description': data.get('description'),
            'duration': int_or_none(data.get('length')),
            'timestamp': parse_iso8601(data.get('publishDate')),
            'thumbnail': url_or_none(data.get('image')),
            'series': data.get('podcastTitle'),
            # The episode name mirrors the title.
            'episode': episode_title,
        }
326 | ||
327 | ||
class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseExtractor):
    IE_NAME = 'polskieradio:podcast:list'
    _VALID_URL = r'https?://podcasty\.polskieradio\.pl/podcast/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://podcasty.polskieradio.pl/podcast/8/',
        'info_dict': {
            'id': '8',
            'title': 'Śniadanie w Trójce',
            'description': 'md5:57abcc27bc4c6a6b25baa3061975b9ef',
            'uploader': 'Beata Michniewicz',
        },
        'playlist_mincount': 714,
    }]
    _PAGE_SIZE = 10

    def _call_api(self, podcast_id, page):
        """Fetch page number *page* of the podcast's episode listing."""
        api_url = f'{self._API_BASE}/Podcasts/{podcast_id}/?pageSize={self._PAGE_SIZE}&page={page}'
        return self._download_json(api_url, podcast_id, f'Downloading page {page}')

    def _real_extract(self, url):
        """Return the podcast as a paged playlist of its episodes."""
        podcast_id = self._match_id(url)
        # The first page also carries the podcast-level metadata.
        first_page = self._call_api(podcast_id, 1)

        def get_page(page_num):
            # The paged list passes a 0-based index while the API is called
            # with 1-based page numbers; index 0 reuses the page fetched above.
            if page_num:
                page_data = self._call_api(podcast_id, page_num + 1)
            else:
                page_data = first_page
            for episode in page_data['items']:
                yield self._parse_episode(episode)

        page_count = math.ceil(first_page['itemCount'] / self._PAGE_SIZE)
        entries = InAdvancePagedList(get_page, page_count, self._PAGE_SIZE)

        return {
            '_type': 'playlist',
            'entries': entries,
            'id': str(first_page['id']),
            'title': first_page['title'],
            'description': first_page.get('description'),
            'uploader': first_page.get('announcer'),
        }
365 | ||
366 | ||
class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor):
    IE_NAME = 'polskieradio:podcast'
    _VALID_URL = r'https?://podcasty\.polskieradio\.pl/track/(?P<id>[a-f\d]{8}(?:-[a-f\d]{4}){4}[a-f\d]{8})'
    _TESTS = [{
        'url': 'https://podcasty.polskieradio.pl/track/6eafe403-cb8f-4756-b896-4455c3713c32',
        'info_dict': {
            'id': '6eafe403-cb8f-4756-b896-4455c3713c32',
            'ext': 'mp3',
            'title': 'Theresa May rezygnuje. Co dalej z brexitem?',
            'description': 'md5:e41c409a29d022b70ef0faa61dbded60',
        },
    }]

    def _real_extract(self, url):
        """Look up a single track by guid and return its info dict."""
        podcast_id = self._match_id(url)
        # The audio endpoint receives a JSON body listing the requested guids.
        request_body = json.dumps({
            'guids': [podcast_id],
        }).encode('utf-8')
        request_headers = {
            'Content-Type': 'application/json',
        }
        episodes = self._download_json(
            f'{self._API_BASE}/audio',
            podcast_id, 'Downloading podcast metadata',
            data=request_body, headers=request_headers)
        # Only one guid was requested, so the first element is used.
        return self._parse_episode(episodes[0])
392 | ||
393 | ||
class PolskieRadioRadioKierowcowIE(PolskieRadioBaseExtractor):
    _VALID_URL = r'https?://(?:www\.)?radiokierowcow\.pl/artykul/(?P<id>[0-9]+)'
    IE_NAME = 'polskieradio:kierowcow'

    _TESTS = [{
        'url': 'https://radiokierowcow.pl/artykul/2694529',
        'info_dict': {
            'id': '2694529',
            'title': 'Zielona fala reliktem przeszłości?',
            'description': 'md5:343950a8717c9818fdfd4bd2b8ca9ff2',
        },
        'playlist_count': 3,
    }]

    def _real_extract(self, url):
        """Return the article's embedded players as a playlist."""
        media_id = self._match_id(url)
        webpage = self._download_webpage(url, media_id)

        # The article JSON URL contains the Next.js build id taken from the page.
        build_id = self._search_nextjs_data(webpage, media_id)['buildId']
        article_url = (
            f'https://radiokierowcow.pl/_next/data/{build_id}/artykul/{media_id}.json?articleId={media_id}')
        article_data = self._download_json(article_url, media_id)['pageProps']['data']

        title = article_data['title']
        entries = self._extract_webpage_player_entries(
            article_data['content'], media_id, {'title': title})

        return {
            '_type': 'playlist',
            'id': media_id,
            'entries': entries,
            'title': title,
            'description': article_data.get('lead'),
        }