]>
Commit | Line | Data |
---|---|---|
dcdb292f | 1 | # coding: utf-8 |
0e3ae924 | 2 | from __future__ import unicode_literals |
3 | ||
4 | import re | |
0e3ae924 | 5 | |
6 | from .common import InfoExtractor | |
1a2b377c S |
7 | from ..utils import ( |
8 | determine_ext, | |
6a0f9a24 S |
9 | js_to_json, |
10 | parse_iso8601, | |
1a2b377c S |
11 | parse_filesize, |
12 | ) | |
0e3ae924 | 13 | |
14 | ||
6a0f9a24 S |
class TagesschauPlayerIE(InfoExtractor):
    """Extractor for tagesschau.de embedded player pages.

    Handles URLs of the form ``.../multimedia/<kind>/<kind>-<id>~player*.html``
    where ``<kind>`` is ``audio`` or ``video``.
    """

    IE_NAME = 'tagesschau:player'
    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?P<kind>audio|video)/(?P=kind)-(?P<id>\d+)~player(?:_[^/?#&]+)?\.html'

    _TESTS = [{
        'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html',
        'md5': '8d09548d5c15debad38bee3a4d15ca21',
        'info_dict': {
            'id': '179517',
            'ext': 'mp4',
            'title': 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD',
            'thumbnail': r're:^https?:.*\.jpg$',
            'formats': 'mincount:6',
        },
    }, {
        'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
        'md5': '76e6eec6ebd40740671cf0a2c88617e5',
        'info_dict': {
            'id': '29417',
            'ext': 'mp3',
            'title': 'Trabi - Bye, bye Rennpappe',
            'thumbnail': r're:^https?:.*\.jpg$',
            'formats': 'mincount:2',
        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417~player_autoplay-true.html',
        'only_matching': True,
    }]

    # Extra format fields keyed by the player's "quality" label; merged into
    # each format dict built in _real_extract.
    _FORMATS = {
        'xs': {'quality': 0},
        's': {'width': 320, 'height': 180, 'quality': 1},
        'm': {'width': 512, 'height': 288, 'quality': 2},
        'l': {'width': 960, 'height': 540, 'quality': 3},
        'xl': {'width': 1280, 'height': 720, 'quality': 4},
        'xxl': {'quality': 5},
    }

    def _extract_via_api(self, kind, video_id):
        """Build an info dict from the tagesschau JSON multimedia API.

        kind is the media kind used in the API path ('audio' or 'video') and
        video_id the numeric id.  Not called by _real_extract in this class
        (see the note there); kept as an alternative extraction path.
        """
        info = self._download_json(
            'https://www.tagesschau.de/api/multimedia/{0}/{0}-{1}.json'.format(kind, video_id),
            video_id)
        title = info['headline']
        formats = []
        # Each 'mediadata' item is iterated as a mapping of format id -> URL.
        for media in info['mediadata']:
            for format_id, format_url in media.items():
                if determine_ext(format_url) == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        format_url, video_id, 'mp4',
                        entry_protocol='m3u8_native', m3u8_id='hls'))
                else:
                    formats.append({
                        'url': format_url,
                        'format_id': format_id,
                        'vcodec': 'none' if kind == 'audio' else None,
                    })
        self._sort_formats(formats)
        timestamp = parse_iso8601(info.get('date'))
        return {
            'id': video_id,
            'title': title,
            'timestamp': timestamp,
            'formats': formats,
        }

    def _real_extract(self, url):
        """Extract title, thumbnail and formats from a player page."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # The JSON API (see _extract_via_api) does not provide some audio
        # formats (e.g. ogg), thus everything is extracted via the webpage.
        webpage = self._download_webpage(url, video_id)

        title = self._og_search_title(webpage).strip()
        formats = []

        # The player config embeds JS object literals such as
        # {src: "http...", type: "video/mp4", quality: "m"}.
        for media_json in re.findall(r'({src\s*:\s*["\']http[^}]+type\s*:[^}]+})', webpage):
            media = self._parse_json(js_to_json(media_json), video_id, fatal=False)
            if not media:
                continue
            src = media.get('src')
            if not src:
                # Skip only this entry.  Returning here would abort the whole
                # extraction with None and drop formats already collected.
                continue
            quality = media.get('quality')
            # 'type' is a MIME-like string; tolerate an explicit null value.
            kind = (media.get('type') or '').split('/')[0]
            ext = determine_ext(src)
            f = {
                'url': src,
                'format_id': '%s_%s' % (quality, ext) if quality else ext,
                'ext': ext,
                'vcodec': 'none' if kind == 'audio' else None,
            }
            f.update(self._FORMATS.get(quality, {}))
            formats.append(f)

        self._sort_formats(formats)

        thumbnail = self._og_search_thumbnail(webpage)

        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'formats': formats,
        }
125 | ||
126 | ||
class TagesschauIE(InfoExtractor):
    """Extractor for regular tagesschau.de pages (media pages and articles).

    URLs that TagesschauPlayerIE accepts are explicitly rejected by
    suitable() so that the two extractors do not overlap.
    """

    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'

    _TESTS = [{
        'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
        'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6',
        'info_dict': {
            'id': 'video-102143',
            'ext': 'mp4',
            'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
            'description': '18.07.2015 20:10 Uhr',
            'thumbnail': r're:^https?:.*\.jpg$',
        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
        'md5': '3c54c1f6243d279b706bde660ceec633',
        'info_dict': {
            'id': 'ts-5727',
            'ext': 'mp4',
            'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr',
            'description': 'md5:695c01bfd98b7e313c501386327aea59',
            'thumbnail': r're:^https?:.*\.jpg$',
        },
    }, {
        # exclusive audio
        'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
        'md5': '76e6eec6ebd40740671cf0a2c88617e5',
        'info_dict': {
            'id': 'audio-29417',
            'ext': 'mp3',
            'title': 'Trabi - Bye, bye Rennpappe',
            'description': 'md5:8687dda862cbbe2cfb2df09b56341317',
            'thumbnail': r're:^https?:.*\.jpg$',
        },
    }, {
        # audio in article
        'url': 'http://www.tagesschau.de/inland/bnd-303.html',
        'md5': 'e0916c623e85fc1d2b26b78f299d3958',
        'info_dict': {
            'id': 'bnd-303',
            'ext': 'mp3',
            'title': 'Viele Baustellen für neuen BND-Chef',
            'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4',
            'thumbnail': r're:^https?:.*\.jpg$',
        },
    }, {
        'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
        'info_dict': {
            'id': 'afd-parteitag-135',
            'title': 'Möchtegern-Underdog mit Machtanspruch',
        },
        'playlist_count': 2,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/100sekunden/index.html',
        'only_matching': True,
    }, {
        # playlist article with collapsing sections
        'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Player URLs also match _VALID_URL here; leave them to TagesschauPlayerIE.
        return False if TagesschauPlayerIE.suitable(url) else super(TagesschauIE, cls).suitable(url)

    def _extract_formats(self, download_text, media_kind):
        """Parse the download buttons in download_text into a sorted format list.

        download_text is the HTML of one "controls" block; media_kind is the
        matched kind label ('Video' or 'Audio', compared case-insensitively)
        and selects which tooltip layout is parsed for codec/bitrate details.
        """
        is_video = media_kind.lower() == 'video'
        formats = []
        for link in re.finditer(
                r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>',
                download_text):
            link_url = link.group('url')
            if not link_url:
                continue
            # Prefer the infix between the last two dots of the file name as
            # format id; fall back to the plain extension.
            format_id = self._search_regex(
                r'.*/[^/.]+\.([^/]+)\.[^/.]+$', link_url, 'format ID',
                default=determine_ext(link_url))
            fmt = {
                'format_id': format_id,
                'url': link_url,
                'format_name': link.group('name'),
            }
            title = link.group('title')
            if title:
                if is_video:
                    # The tooltip separates its lines with the literal
                    # character reference "&#10;".
                    m = re.match(
                        r'''(?x)
                            Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;
                            (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;
                            (?P<vbr>[0-9]+)kbps&\#10;
                            Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;
                            Größe:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''',
                        title)
                    if m:
                        fmt.update({
                            'format_note': m.group('audio_desc'),
                            'vcodec': m.group('vcodec'),
                            'width': int(m.group('width')),
                            'height': int(m.group('height')),
                            'abr': int(m.group('abr')),
                            'vbr': int(m.group('vbr')),
                            'filesize_approx': parse_filesize(m.group('filesize_approx')),
                        })
                else:
                    m = re.match(
                        r'(?P<format>.+?)-Format\s*:\s*(?P<abr>\d+)kbps\s*,\s*(?P<note>.+)',
                        title)
                    if m:
                        fmt.update({
                            'format_note': '%s, %s' % (m.group('format'), m.group('note')),
                            'vcodec': 'none',
                            'abr': int(m.group('abr')),
                        })
            formats.append(fmt)
        self._sort_formats(formats)
        return formats

    def _real_extract(self, url):
        """Extract a single media item, or a playlist for multi-media articles."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id') or mobj.group('path')
        display_id = video_id.lstrip('-')

        webpage = self._download_webpage(url, display_id)

        title = self._html_search_regex(
            r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
            webpage, 'title', default=None) or self._og_search_title(webpage)

        # Shared fragment WITHOUT an inline flag: it is embedded in the middle
        # of the article pattern below, and a global flag such as (?s) that is
        # not at the very start of a pattern is an error on Python 3.11+.
        DOWNLOAD_REGEX = r'<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>'
        SINGLE_DOWNLOAD_REGEX = r'(?s)' + DOWNLOAD_REGEX

        formats = None
        webpage_type = self._og_search_property('type', webpage, default=None)
        if webpage_type == 'website':  # Article
            entries = []
            for num, (entry_title, media_kind, download_text) in enumerate(re.findall(
                    r'(?s)<p[^>]+class="infotext"[^>]*>\s*(?:<a[^>]+>)?\s*<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX,
                    webpage), 1):
                entries.append({
                    'id': '%s-%d' % (display_id, num),
                    'title': entry_title,
                    'formats': self._extract_formats(download_text, media_kind),
                })
            if len(entries) > 1:
                return self.playlist_result(entries, display_id, title)
            if entries:
                formats = entries[0]['formats']

        if formats is None:
            # Single media page, or an article where no entry was matched.
            # Going through _search_regex makes a page without download links
            # fail with a regular extraction error instead of an IndexError.
            download_text = self._search_regex(
                SINGLE_DOWNLOAD_REGEX, webpage, 'download links', group='links')
            media_kind = self._search_regex(
                SINGLE_DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='kind')
            formats = self._extract_formats(download_text, media_kind)

        thumbnail = self._og_search_thumbnail(webpage)
        description = self._html_search_regex(
            r'(?s)<p class="teasertext">(.*?)</p>',
            webpage, 'description', default=None)

        # formats always come from _extract_formats, which already sorts them.
        return {
            'id': display_id,
            'title': title,
            'thumbnail': thumbnail,
            'formats': formats,
            'description': description,
        }