]>
Commit | Line | Data |
---|---|---|
2d185706 JAW |
1 | # coding: utf-8 |
2 | from __future__ import unicode_literals | |
3 | ||
84a18e9b | 4 | import itertools |
0463b77a | 5 | import re |
2d185706 | 6 | |
0463b77a S |
7 | from .common import InfoExtractor |
8 | from ..compat import ( | |
9 | compat_str, | |
10 | compat_urllib_parse_unquote, | |
8d3737cd | 11 | compat_urlparse |
0463b77a S |
12 | ) |
13 | from ..utils import ( | |
84a18e9b | 14 | extract_attributes, |
0463b77a S |
15 | int_or_none, |
16 | strip_or_none, | |
17 | unified_timestamp, | |
18 | ) | |
2d185706 JAW |
19 | |
20 | ||
class PolskieRadioIE(InfoExtractor):
    """Extractor for polskieradio.pl article pages.

    Each distinct media item referenced by a ``data-media`` attribute inside
    the article body becomes one entry of the returned playlist.
    """

    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie',
        'info_dict': {
            'id': '1587943',
            'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie',
            'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5',
        },
        'playlist': [{
            'md5': '2984ee6ce9046d91fc233bc1a864a09a',
            'info_dict': {
                'id': '1540576',
                'ext': 'mp3',
                'title': 'md5:d4623290d4ac983bf924061c75c23a0d',
                'timestamp': 1456594200,
                'upload_date': '20160227',
                'duration': 2364,
                'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$'
            },
        }],
    }, {
        'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal',
        'info_dict': {
            'id': '1635803',
            'title': 'Euro 2016: nie ma miejsca na błąd. Polacy grają ze Szwajcarią o ćwierćfinał',
            'description': 'md5:01cb7d0cad58664095d72b51a1ebada2',
        },
        'playlist_mincount': 12,
    }, {
        'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis',
        'only_matching': True,
    }, {
        'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943',
        'only_matching': True,
    }, {
        # with mp4 video
        'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build a playlist from the media embedded in the article at *url*.

        The article id taken from the URL is used as the playlist id; the
        playlist title and description come from the page's Open Graph
        metadata.  The page-level timestamp and thumbnail are attached to
        every entry.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        # Only the article body is scanned for media, not the whole page.
        content = self._search_regex(
            r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>',
            webpage, 'content')

        timestamp = unified_timestamp(self._html_search_regex(
            r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>',
            webpage, 'timestamp', fatal=False))

        thumbnail_url = self._og_search_thumbnail(webpage)

        entries = []

        # URLs already turned into entries, so that a media item referenced
        # more than once in the article is only listed once.
        media_urls = set()

        for data_media in re.findall(r'<[^>]+data-media=({[^>]+})', content):
            media = self._parse_json(data_media, playlist_id, fatal=False)
            # With fatal=False a blob that is not valid JSON comes back as
            # None; skip it instead of crashing on None.get() so that the
            # remaining media of the article can still be extracted.
            if not media:
                continue
            if not media.get('file') or not media.get('desc'):
                continue
            media_url = self._proto_relative_url(media['file'], 'http:')
            if media_url in media_urls:
                continue
            media_urls.add(media_url)
            entries.append({
                'id': compat_str(media['id']),
                'url': media_url,
                # 'desc' is stored percent-encoded in the attribute.
                'title': compat_urllib_parse_unquote(media['desc']),
                'duration': int_or_none(media.get('length')),
                'vcodec': 'none' if media.get('provider') == 'audio' else None,
                'timestamp': timestamp,
                'thumbnail': thumbnail_url
            })

        title = self._og_search_title(webpage).strip()
        description = strip_or_none(self._og_search_description(webpage))

        return self.playlist_result(entries, playlist_id, title, description)
84a18e9b S |
103 | |
104 | ||
class PolskieRadioCategoryIE(InfoExtractor):
    """Extractor for polskieradio.pl category (article listing) pages.

    Produces a playlist whose entries are the articles linked from the
    category page and from every following page reachable through its
    "next" link; each entry is delegated to PolskieRadioIE.
    """

    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P<id>\d+)'
    _TESTS = [
        {
            'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA',
            'info_dict': {
                'id': '5102',
                'title': 'HISTORIA ŻYWA',
            },
            'playlist_mincount': 38,
        },
        {
            'url': 'http://www.polskieradio.pl/7/4807',
            'info_dict': {
                'id': '4807',
                'title': 'Vademecum 1050. rocznicy Chrztu Polski',
            },
            'playlist_mincount': 5,
        },
        {
            'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source',
            'only_matching': True,
        },
        {
            'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow',
            'info_dict': {
                'id': '4143',
                'title': 'Kierunek Kraków',
            },
            'playlist_mincount': 61,
        },
        {
            'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka',
            'info_dict': {
                'id': '214',
                'title': 'Muzyka',
            },
            'playlist_mincount': 61,
        },
        {
            'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA',
            'only_matching': True,
        },
        {
            'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka',
            'only_matching': True,
        },
    ]

    @classmethod
    def suitable(cls, url):
        """Accept *url* only when PolskieRadioIE does not already claim it."""
        if PolskieRadioIE.suitable(url):
            return False
        return super(PolskieRadioCategoryIE, cls).suitable(url)

    def _entries(self, url, page, category_id):
        """Lazily yield one url_result per article, following pagination.

        *page* is the already downloaded markup of the first listing page;
        further pages are fetched on demand while a "next" link is present.
        """
        html = page
        # The first page was fetched by the caller, so numbering used in
        # the download notes starts at 2.
        page_num = 2
        while True:
            articles = re.findall(
                r'(?s)<article[^>]+>.*?(<a[^>]+href=["\']/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>',
                html)
            for anchor_tag, article_id in articles:
                attrs = extract_attributes(anchor_tag)
                link = attrs.get('href')
                if link:
                    yield self.url_result(
                        compat_urlparse.urljoin(url, link),
                        PolskieRadioIE.ie_key(),
                        article_id, attrs.get('title'))

            next_link = re.search(
                r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1',
                html)
            if not next_link:
                # No "next" link: this was the last page.
                return

            following_url = compat_urlparse.urljoin(url, next_link.group('url'))
            note = 'Downloading page %s' % page_num
            html = self._download_webpage(following_url, category_id, note)
            page_num += 1

    def _real_extract(self, url):
        """Return the playlist of articles for the category at *url*."""
        category_id = self._match_id(url)
        first_page = self._download_webpage(url, category_id)
        # The category name is the first of the three " - " separated parts
        # of the <title> element; a missing title is tolerated.
        category_title = self._html_search_regex(
            r'<title>([^<]+) - [^<]+ - [^<]+</title>',
            first_page, 'title', fatal=False)
        article_entries = self._entries(url, first_page, category_id)
        return self.playlist_result(
            article_entries, category_id, category_title)