import itertools
import json
import math
import re

from .common import InfoExtractor
from ..compat import (
    compat_str,
    compat_urllib_parse_unquote,
    compat_urlparse
)
from ..utils import (
    extract_attributes,
    ExtractorError,
    InAdvancePagedList,
    int_or_none,
    js_to_json,
    parse_iso8601,
    strip_or_none,
    unified_timestamp,
    unescapeHTML,
    url_or_none,
)


class PolskieRadioBaseExtractor(InfoExtractor):
    def _extract_webpage_player_entries(self, webpage, playlist_id, base_data):
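        # Each playable clip is embedded in the page as a JSON blob inside a
        # data-media attribute; parse them, skip duplicates by URL, and merge
        # each one with the article-wide metadata passed in base_data.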
        media_urls = set()

        for data_media in re.findall(r'<[^>]+data-media="?({[^>]+})"?', webpage):
            media = self._parse_json(data_media, playlist_id, transform_source=unescapeHTML, fatal=False)
            if not media or not media.get('file') or not media.get('desc'):
                continue
            media_url = self._proto_relative_url(media['file'])
            if media_url in media_urls:
                continue
            media_urls.add(media_url)
            entry = base_data.copy()
            entry.update({
                'id': compat_str(media['id']),
                'url': media_url,
                'duration': int_or_none(media.get('length')),
                'vcodec': 'none' if media.get('provider') == 'audio' else None,
            })
            entry_title = compat_urllib_parse_unquote(media['desc'])
            if entry_title:
                entry['title'] = entry_title
            yield entry


class PolskieRadioIE(PolskieRadioBaseExtractor):
    _VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
    _TESTS = [{  # Old-style single broadcast.
        'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie',
        'info_dict': {
            'id': '1587943',
            'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie',
            'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5',
        },
        'playlist': [{
            'md5': '2984ee6ce9046d91fc233bc1a864a09a',
            'info_dict': {
                'id': '1540576',
                'ext': 'mp3',
                'title': 'md5:d4623290d4ac983bf924061c75c23a0d',
                'timestamp': 1456594200,
                'upload_date': '20160227',
                'duration': 2364,
                'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$'
            },
        }],
    }, {  # New-style single broadcast.
        'url': 'https://www.polskieradio.pl/8/2382/Artykul/2534482,Zagarysci-Poezja-jak-spoiwo',
        'info_dict': {
            'id': '2534482',
            'title': 'Żagaryści. Poezja jak spoiwo',
            'description': 'md5:f18d95d5dcba747a09b635e21a4c0695',
        },
        'playlist': [{
            'md5': 'd07559829f61d5a93a75755987ded760',
            'info_dict': {
                'id': '2516679',
                'ext': 'mp3',
                'title': 'md5:c6e1234e0b747ad883cb91b7ad06b98c',
                'timestamp': 1592654400,
                'upload_date': '20200620',
                'duration': 1430,
                'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$'
            },
        }],
    }, {
        # PR4 audition - other frontend
        'url': 'https://www.polskieradio.pl/10/6071/Artykul/2610977,Poglos-29-pazdziernika-godz-2301',
        'info_dict': {
            'id': '2610977',
            'ext': 'mp3',
            'title': 'Pogłos 29 października godz. 23:01',
        },
    }, {
        'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis',
        'only_matching': True,
    }, {
        'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943',
        'only_matching': True,
    }, {
        # with mp4 video
        'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej',
        'only_matching': True,
    }, {
        'url': 'https://polskieradio24.pl/130/4503/Artykul/2621876,Narusza-nasza-suwerennosc-Publicysci-o-uzaleznieniu-funduszy-UE-od-praworzadnosci',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        content = self._search_regex(
            r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>',
            webpage, 'content', default=None)

        timestamp = unified_timestamp(self._html_search_regex(
            r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>',
            webpage, 'timestamp', default=None))

        thumbnail_url = self._og_search_thumbnail(webpage, default=None)

        title = self._og_search_title(webpage).strip()

        description = strip_or_none(self._og_search_description(webpage, default=None))
        description = description.replace('\xa0', ' ') if description is not None else None

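        # Articles without a "this-article" section embed a single recording
        # directly in the inline player config, so return it as one entry.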
        if not content:
            return {
                'id': playlist_id,
                'url': self._proto_relative_url(
                    self._search_regex(
                        r"source:\s*'(//static\.prsa\.pl/[^']+)'",
                        webpage, 'audition record url')),
                'title': title,
                'description': description,
                'timestamp': timestamp,
                'thumbnail': thumbnail_url,
            }

        entries = self._extract_webpage_player_entries(content, playlist_id, {
            'title': title,
            'timestamp': timestamp,
            'thumbnail': thumbnail_url,
        })

        return self.playlist_result(entries, playlist_id, title, description)


class PolskieRadioCategoryIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA',
        'info_dict': {
            'id': '5102',
            'title': 'HISTORIA ŻYWA',
        },
        'playlist_mincount': 38,
    }, {
        'url': 'http://www.polskieradio.pl/7/4807',
        'info_dict': {
            'id': '4807',
            'title': 'Vademecum 1050. rocznicy Chrztu Polski'
        },
        'playlist_mincount': 5
    }, {
        'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source',
        'only_matching': True
    }, {
        'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow',
        'info_dict': {
            'id': '4143',
            'title': 'Kierunek Kraków',
        },
        'playlist_mincount': 61
    }, {
        'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka',
        'info_dict': {
            'id': '214',
            'title': 'Muzyka',
        },
        'playlist_mincount': 61
    }, {
        'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA',
        'only_matching': True,
    }, {
        'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
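        # Article URLs also match _VALID_URL; defer those to PolskieRadioIE.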
        return False if PolskieRadioIE.suitable(url) else super(PolskieRadioCategoryIE, cls).suitable(url)

    def _entries(self, url, page, category_id):
        content = page
        for page_num in itertools.count(2):
            for a_entry, entry_id in re.findall(
                    r'(?s)<article[^>]+>.*?(<a[^>]+href=["\']/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>',
                    content):
                entry = extract_attributes(a_entry)
                href = entry.get('href')
                if not href:
                    continue
                yield self.url_result(
                    compat_urlparse.urljoin(url, href), PolskieRadioIE.ie_key(),
                    entry_id, entry.get('title'))
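            # Follow the "next" link, if present, to paginate through the category.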
            mobj = re.search(
                r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1',
                content)
            if not mobj:
                break
            next_url = compat_urlparse.urljoin(url, mobj.group('url'))
            content = self._download_webpage(
                next_url, category_id, 'Downloading page %s' % page_num)

    def _real_extract(self, url):
        category_id = self._match_id(url)
        webpage = self._download_webpage(url, category_id)
        title = self._html_search_regex(
            r'<title>([^<]+) - [^<]+ - [^<]+</title>',
            webpage, 'title', fatal=False)
        return self.playlist_result(
            self._entries(url, webpage, category_id),
            category_id, title)


class PolskieRadioPlayerIE(InfoExtractor):
    IE_NAME = 'polskieradio:player'
    _VALID_URL = r'https?://player\.polskieradio\.pl/anteny/(?P<id>[^/]+)'

    _BASE_URL = 'https://player.polskieradio.pl'
    _PLAYER_URL = 'https://player.polskieradio.pl/main.bundle.js'
    _STATIONS_API_URL = 'https://apipr.polskieradio.pl/api/stacje'

    _TESTS = [{
        'url': 'https://player.polskieradio.pl/anteny/trojka',
        'info_dict': {
            'id': '3',
            'ext': 'm4a',
            'title': 'Trójka',
        },
        'params': {
            'format': 'bestaudio',
            'skip_download': 'endless stream',
        },
    }]

    def _get_channel_list(self, channel_url='no_channel'):
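        # The channel list is embedded in the player's JavaScript bundle; extract
        # the array literal and convert it from JS to JSON before parsing.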
        player_code = self._download_webpage(
            self._PLAYER_URL, channel_url,
            note='Downloading js player')
        channel_list = js_to_json(self._search_regex(
            r';var r="anteny",a=(\[.+?\])},', player_code, 'channel list'))
        return self._parse_json(channel_list, channel_url)

    def _real_extract(self, url):
        channel_url = self._match_id(url)
        channel_list = self._get_channel_list(channel_url)

        channel = next((c for c in channel_list if c.get('url') == channel_url), None)

        if not channel:
            raise ExtractorError('Channel not found')

        station_list = self._download_json(self._STATIONS_API_URL, channel_url,
                                           note='Downloading stream url list',
                                           headers={
                                               'Accept': 'application/json',
                                               'Referer': url,
                                               'Origin': self._BASE_URL,
                                           })
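        # Match the scraped channel against the stations API response by stream
        # name, falling back to the channel name.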
        station = next((s for s in station_list
                        if s.get('Name') == (channel.get('streamName') or channel.get('name'))), None)
        if not station:
            raise ExtractorError('Station not found even though the channel was extracted')

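        # Each station exposes several stream URLs; pick the parser by suffix
        # (HLS playlist, HDS manifest, Smooth Streaming manifest) and treat
        # anything else as a direct stream URL.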
        formats = []
        for stream_url in station['Streams']:
            stream_url = self._proto_relative_url(stream_url)
            if stream_url.endswith('/playlist.m3u8'):
                formats.extend(self._extract_m3u8_formats(stream_url, channel_url, live=True))
            elif stream_url.endswith('/manifest.f4m'):
                formats.extend(self._extract_f4m_formats(stream_url, channel_url))
            elif stream_url.endswith('/Manifest'):
                formats.extend(self._extract_ism_formats(stream_url, channel_url))
            else:
                formats.append({
                    'url': stream_url,
                })

        return {
            'id': compat_str(channel['id']),
            'formats': formats,
            'title': channel.get('name') or channel.get('streamName'),
            'display_id': channel_url,
            'thumbnail': f'{self._BASE_URL}/images/{channel_url}-color-logo.png',
            'is_live': True,
        }


class PolskieRadioPodcastBaseExtractor(InfoExtractor):
    _API_BASE = 'https://apipodcasts.polskieradio.pl/api'

    def _parse_episode(self, data):
        return {
            'id': data['guid'],
            'formats': [{
                'url': data['url'],
                'filesize': int_or_none(data.get('fileSize')),
            }],
            'title': data['title'],
            'description': data.get('description'),
            'duration': int_or_none(data.get('length')),
            'timestamp': parse_iso8601(data.get('publishDate')),
            'thumbnail': url_or_none(data.get('image')),
            'series': data.get('podcastTitle'),
            'episode': data['title'],
        }


class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseExtractor):
    IE_NAME = 'polskieradio:podcast:list'
    _VALID_URL = r'https?://podcasty\.polskieradio\.pl/podcast/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://podcasty.polskieradio.pl/podcast/8/',
        'info_dict': {
            'id': '8',
            'title': 'Śniadanie w Trójce',
            'description': 'md5:57abcc27bc4c6a6b25baa3061975b9ef',
            'uploader': 'Beata Michniewicz',
        },
        'playlist_mincount': 714,
    }]
    _PAGE_SIZE = 10

    def _call_api(self, podcast_id, page):
        return self._download_json(
            f'{self._API_BASE}/Podcasts/{podcast_id}/?pageSize={self._PAGE_SIZE}&page={page}',
            podcast_id, f'Downloading page {page}')

    def _real_extract(self, url):
        podcast_id = self._match_id(url)
        data = self._call_api(podcast_id, 1)

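        # Page 1 was already fetched above to learn the item count; reuse it and
        # fetch the remaining pages lazily via InAdvancePagedList.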
        def get_page(page_num):
            page_data = self._call_api(podcast_id, page_num + 1) if page_num else data
            yield from (self._parse_episode(ep) for ep in page_data['items'])

        return {
            '_type': 'playlist',
            'entries': InAdvancePagedList(
                get_page, math.ceil(data['itemCount'] / self._PAGE_SIZE), self._PAGE_SIZE),
            'id': str(data['id']),
            'title': data['title'],
            'description': data.get('description'),
            'uploader': data.get('announcer'),
        }


class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor):
    IE_NAME = 'polskieradio:podcast'
    _VALID_URL = r'https?://podcasty\.polskieradio\.pl/track/(?P<id>[a-f\d]{8}(?:-[a-f\d]{4}){4}[a-f\d]{8})'
    _TESTS = [{
        'url': 'https://podcasty.polskieradio.pl/track/6eafe403-cb8f-4756-b896-4455c3713c32',
        'info_dict': {
            'id': '6eafe403-cb8f-4756-b896-4455c3713c32',
            'ext': 'mp3',
            'title': 'Theresa May rezygnuje. Co dalej z brexitem?',
            'description': 'md5:e41c409a29d022b70ef0faa61dbded60',
        },
    }]

    def _real_extract(self, url):
        podcast_id = self._match_id(url)
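        # The audio endpoint accepts a JSON list of GUIDs and returns a list of
        # matching episodes; only the first entry is needed here.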
        data = self._download_json(
            f'{self._API_BASE}/audio',
            podcast_id, 'Downloading podcast metadata',
            data=json.dumps({
                'guids': [podcast_id],
            }).encode('utf-8'),
            headers={
                'Content-Type': 'application/json',
            })
        return self._parse_episode(data[0])


class PolskieRadioRadioKierowcowIE(PolskieRadioBaseExtractor):
    _VALID_URL = r'https?://(?:www\.)?radiokierowcow\.pl/artykul/(?P<id>[0-9]+)'
    IE_NAME = 'polskieradio:kierowcow'

    _TESTS = [{
        'url': 'https://radiokierowcow.pl/artykul/2694529',
        'info_dict': {
            'id': '2694529',
            'title': 'Zielona fala reliktem przeszłości?',
            'description': 'md5:343950a8717c9818fdfd4bd2b8ca9ff2',
        },
        'playlist_count': 3,
    }]

    def _real_extract(self, url):
        media_id = self._match_id(url)
        webpage = self._download_webpage(url, media_id)
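        # The article body is served as Next.js page data; the buildId from
        # __NEXT_DATA__ is needed to build the JSON data URL.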
        nextjs_build = self._search_nextjs_data(webpage, media_id)['buildId']
        article = self._download_json(
            f'https://radiokierowcow.pl/_next/data/{nextjs_build}/artykul/{media_id}.json?articleId={media_id}',
            media_id)
        data = article['pageProps']['data']
        title = data['title']
        entries = self._extract_webpage_player_entries(data['content'], media_id, {
            'title': title,
        })

        return {
            '_type': 'playlist',
            'id': media_id,
            'entries': entries,
            'title': title,
            'description': data.get('lead'),
        }