]>
Commit | Line | Data |
---|---|---|
f9b85496 PH |
1 | # coding: utf-8 |
2 | from __future__ import unicode_literals | |
3 | ||
d5822b96 PH |
4 | import re |
5 | ||
6 | from .common import InfoExtractor | |
3741302a | 7 | from .generic import GenericIE |
91328f26 | 8 | from ..compat import compat_str |
d5822b96 | 9 | from ..utils import ( |
f9b85496 | 10 | determine_ext, |
d5822b96 | 11 | ExtractorError, |
29546b34 | 12 | qualities, |
6d3d3fc0 PH |
13 | int_or_none, |
14 | parse_duration, | |
15 | unified_strdate, | |
bf0ff932 | 16 | xpath_text, |
31eeab9f | 17 | update_url_query, |
d5822b96 | 18 | ) |
f7854627 | 19 | from ..compat import compat_etree_fromstring |
d5822b96 | 20 | |
f9b85496 | 21 | |
6d3d3fc0 PH |
class ARDMediathekIE(InfoExtractor):
    """Extractor for ARD Mediathek pages.

    Matches ardmediathek.de as well as the mediathek.daserste.de and
    mediathek.rbb-online.de hosts (see ``_VALID_URL``). Formats are taken
    either from ``mediaCollection.addMediaStream(...)`` calls embedded in the
    page or, when none are present, from the ``/play/media/<id>`` JSON
    document.
    """

    IE_NAME = 'ARD:mediathek'
    _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'

    _TESTS = [{
        'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822',
        'info_dict': {
            'id': '44726822',
            'ext': 'mp4',
            'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?',
            'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5',
            'duration': 1740,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        }
    }, {
        'url': 'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114',
        'info_dict': {
            'id': '29582122',
            'ext': 'mp4',
            'title': 'Ich liebe das Leben trotzdem',
            'description': 'md5:45e4c225c72b27993314b31a84a5261c',
            'duration': 4557,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        'skip': 'HTTP Error 404: Not Found',
    }, {
        'url': 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916',
        'md5': 'f4d98b10759ac06c0072bbcd1f0b9e3e',
        'info_dict': {
            'id': '29522730',
            'ext': 'mp4',
            'title': 'Tatort: Scheinwelten - Hörfassung (Video tgl. ab 20 Uhr)',
            'description': 'md5:196392e79876d0ac94c94e8cdb2875f1',
            'duration': 5252,
        },
        'skip': 'HTTP Error 404: Not Found',
    }, {
        # audio
        'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
        'md5': '219d94d8980b4f538c7fcb0865eb7f2c',
        'info_dict': {
            'id': '28488308',
            'ext': 'mp3',
            'title': 'Tod eines Fußballers',
            'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef',
            'duration': 3240,
        },
        'skip': 'HTTP Error 404: Not Found',
    }, {
        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
        'only_matching': True,
    }, {
        # audio
        'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158',
        'md5': '4e8f00631aac0395fee17368ac0e9867',
        'info_dict': {
            'id': '30796318',
            'ext': 'mp3',
            'title': 'Vor dem Fest',
            'description': 'md5:c0c1c8048514deaed2a73b3a60eecacb',
            'duration': 3287,
        },
        'skip': 'Video is no longer available',
    }]

    def _extract_media_info(self, media_info_url, webpage, video_id):
        """Download the media JSON and turn it into a partial info dict.

        :param media_info_url: URL of the media JSON document.
        :param webpage: HTML of the video page; only inspected for the
            ``"fsk"`` marker when no formats were found.
        :param video_id: id used for logging and as the returned ``id``.
        :returns: dict with ``id``, ``duration``, ``thumbnail``, ``is_live``,
            ``formats`` and ``subtitles``.
        :raises ExtractorError: (expected) when no formats are found and the
            page carries an ``"fsk"`` marker or the JSON is flagged
            ``_geoblocked``.
        """
        media_info = self._download_json(
            media_info_url, video_id, 'Downloading media JSON')

        formats = self._extract_formats(media_info, video_id)

        if not formats:
            # Give the user a specific reason where one can be detected;
            # otherwise _sort_formats below deals with the empty list.
            if '"fsk"' in webpage:
                raise ExtractorError(
                    'This video is only available after 20:00', expected=True)
            elif media_info.get('_geoblocked'):
                raise ExtractorError('This video is not available due to geo restriction', expected=True)

        self._sort_formats(formats)

        duration = int_or_none(media_info.get('_duration'))
        thumbnail = media_info.get('_previewImage')
        # Only a literal JSON ``true`` counts as live.
        is_live = media_info.get('_isLive') is True

        subtitles = {}
        subtitle_url = media_info.get('_subtitleUrl')
        if subtitle_url:
            subtitles['de'] = [{
                'ext': 'ttml',
                'url': subtitle_url,
            }]

        return {
            'id': video_id,
            'duration': duration,
            'thumbnail': thumbnail,
            'is_live': is_live,
            'formats': formats,
            'subtitles': subtitles,
        }

    def _extract_formats(self, media_info, video_id):
        """Build the list of format dicts from a media JSON document.

        Walks ``_mediaArray`` -> ``_mediaStreamArray`` -> ``_stream``.
        f4m and m3u8 manifests are expanded (only for quality ``'auto'``);
        every other URL becomes a single format, with an RTMP variant when
        the stream declares an ``rtmp`` ``_server``.

        :param media_info: parsed media JSON (a dict).
        :param video_id: id passed through to the manifest helpers.
        :returns: list of format dicts, possibly empty and not sorted.
        """
        type_ = media_info.get('_type')
        # ``or []`` also covers keys that are present but null.
        media_array = media_info.get('_mediaArray') or []
        formats = []
        for num, media in enumerate(media_array):
            for stream in media.get('_mediaStreamArray') or []:
                stream_urls = stream.get('_stream')
                if not stream_urls:
                    continue
                # ``_stream`` is either a single URL or a list of URLs.
                if not isinstance(stream_urls, list):
                    stream_urls = [stream_urls]
                quality = stream.get('_quality')
                server = stream.get('_server')
                for stream_url in stream_urls:
                    if not isinstance(stream_url, compat_str) or '//' not in stream_url:
                        continue
                    ext = determine_ext(stream_url)
                    # Manifests are only followed for the 'auto' quality.
                    if quality != 'auto' and ext in ('f4m', 'm3u8'):
                        continue
                    if ext == 'f4m':
                        formats.extend(self._extract_f4m_formats(
                            update_url_query(stream_url, {
                                'hdcore': '3.1.1',
                                'plugin': 'aasp-3.1.1.69.124'
                            }),
                            video_id, f4m_id='hds', fatal=False))
                    elif ext == 'm3u8':
                        formats.extend(self._extract_m3u8_formats(
                            stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
                    else:
                        if server and server.startswith('rtmp'):
                            f = {
                                'url': server,
                                'play_path': stream_url,
                                'format_id': 'a%s-rtmp-%s' % (num, quality),
                            }
                        else:
                            f = {
                                'url': stream_url,
                                'format_id': 'a%s-%s-%s' % (num, ext, quality)
                            }
                        # File names may end in ``_<width>x<height>.mp4``.
                        m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', stream_url)
                        if m:
                            f.update({
                                'width': int(m.group('width')),
                                'height': int(m.group('height')),
                            })
                        if type_ == 'audio':
                            f['vcodec'] = 'none'
                        formats.append(f)
        return formats

    def _real_extract(self, url):
        """Extract the info dict (or an RSS playlist) for a Mediathek URL.

        :param url: a URL matching ``_VALID_URL``.
        :returns: info dict with ``id``, ``title``, ``description``,
            ``thumbnail`` and ``formats`` (plus the fields produced by
            ``_extract_media_info`` on the JSON path), or the result of the
            generic RSS extraction for ``rss`` URLs.
        :raises ExtractorError: (expected) when the page shows one of the
            known "unavailable" messages.
        """
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        document_id = None

        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            document_id = video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        webpage = self._download_webpage(url, video_id)

        ERRORS = (
            ('>Leider liegt eine Störung vor.', 'Video %s is unavailable'),
            ('>Der gewünschte Beitrag ist nicht mehr verfügbar.<',
             'Video %s is no longer available'),
        )

        for pattern, message in ERRORS:
            if pattern in webpage:
                raise ExtractorError(message % video_id, expected=True)

        if re.search(r'[\?&]rss($|[=&])', url):
            doc = compat_etree_fromstring(webpage.encode('utf-8'))
            if doc.tag == 'rss':
                return GenericIE()._extract_rss(url, video_id, doc)

        title = self._html_search_regex(
            [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
             r'<meta name="dcterms\.title" content="(.*?)"/>',
             r'<h4 class="headline">(.*?)</h4>'],
            webpage, 'title')
        description = self._html_search_meta(
            'dcterms.abstract', webpage, 'description', default=None)
        if description is None:
            description = self._html_search_meta(
                'description', webpage, 'meta description')

        # Thumbnail is sometimes not present.
        # It is in the mobile version, but that seems to use a different URL
        # structure altogether.
        thumbnail = self._og_search_thumbnail(webpage, default=None)

        media_streams = re.findall(r'''(?x)
            mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
            "([^"]+)"''', webpage)

        if media_streams:
            QUALITIES = qualities(['lo', 'hi', 'hq'])
            formats = []
            for furl in set(media_streams):
                if furl.endswith('.f4m'):
                    fid = 'f4m'
                else:
                    # Quality tag is the second-to-last dot-separated part.
                    fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
                    fid = fid_m.group(1) if fid_m else None
                formats.append({
                    'quality': QUALITIES(fid),
                    'format_id': fid,
                    'url': furl,
                })
            self._sort_formats(formats)
            info = {
                'formats': formats,
            }
        else:  # request JSON file
            if not document_id:
                video_id = self._search_regex(
                    r'/play/(?:config|media)/(\d+)', webpage, 'media id')
            info = self._extract_media_info(
                'http://www.ardmediathek.de/play/media/%s' % video_id,
                webpage, video_id)

        info.update({
            'id': video_id,
            'title': self._live_title(title) if info.get('is_live') else title,
            'description': description,
            # Prefer the OpenGraph thumbnail, but do not discard the preview
            # image from the media JSON when the page has no og:image.
            'thumbnail': thumbnail or info.get('thumbnail'),
        })

        return info
6d3d3fc0 PH |
265 | |
266 | ||
class ARDIE(InfoExtractor):
    """Extractor for video pages on daserste.de.

    Metadata and formats come from the ``~playerXml.xml`` document that is
    addressed by appending that suffix to the page URL without ``.html``.
    """

    _VALID_URL = r'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
    _TESTS = [{
        'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html',
        'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49',
        'info_dict': {
            'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video',
            'id': '102',
            'ext': 'mp4',
            'duration': 4435.0,
            'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?',
            'upload_date': '20180214',
            'thumbnail': r're:^https?://.*\.jpg$',
        },
    }, {
        'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
        'md5': 'd216c3a86493f9322545e045ddc3eb35',
        'info_dict': {
            'display_id': 'die-story-im-ersten-mission-unter-falscher-flagge',
            'id': '100',
            'ext': 'mp4',
            'duration': 2600,
            'title': 'Die Story im Ersten: Mission unter falscher Flagge',
            'upload_date': '20140804',
            'thumbnail': r're:^https?://.*\.jpg$',
        },
        'skip': 'HTTP Error 404: Not Found',
    }]

    def _real_extract(self, url):
        """Extract the info dict for a daserste.de video page.

        :param url: a URL matching ``_VALID_URL``.
        :returns: info dict with ``id``, ``display_id``, ``title``,
            ``duration``, ``upload_date``, ``thumbnail`` and ``formats``.
        :raises ExtractorError: when the player XML has no ``<title>``.
        """
        mobj = re.match(self._VALID_URL, url)
        display_id = mobj.group('display_id')

        player_url = mobj.group('mainurl') + '~playerXml.xml'
        doc = self._download_xml(player_url, display_id)
        video_node = doc.find('./video')
        upload_date = unified_strdate(xpath_text(
            video_node, './broadcastDate'))
        thumbnail = xpath_text(video_node, './/teaserImage//variant/url')

        formats = []
        for a in video_node.findall('.//asset'):
            file_name = xpath_text(a, './fileName')
            if not file_name:
                # Without a file name there is nothing to download.
                continue
            # xpath_text yields None for absent elements, so an asset that
            # lacks an optional field no longer aborts the whole extraction.
            f = {
                'format_id': a.get('type'),
                'width': int_or_none(xpath_text(a, './frameWidth')),
                'height': int_or_none(xpath_text(a, './frameHeight')),
                'vbr': int_or_none(xpath_text(a, './bitrateVideo')),
                'abr': int_or_none(xpath_text(a, './bitrateAudio')),
                'vcodec': xpath_text(a, './codecVideo'),
                'tbr': int_or_none(xpath_text(a, './totalBitrate')),
            }
            server_prefix = xpath_text(a, './serverPrefix')
            if server_prefix:
                # Server prefix plus file name: the prefix is the URL and the
                # file name goes into 'play_path', the same key the RTMP
                # formats of ARDMediathekIE use.
                f['url'] = server_prefix
                f['play_path'] = file_name
            else:
                f['url'] = file_name
            formats.append(f)
        self._sort_formats(formats)

        return {
            'id': mobj.group('id'),
            'formats': formats,
            'display_id': display_id,
            'title': xpath_text(video_node, './title', 'title', fatal=True),
            'duration': parse_duration(xpath_text(video_node, './duration')),
            'upload_date': upload_date,
            'thumbnail': thumbnail,
        }
296 | |
297 | def _real_extract(self, url): | |
298 | mobj = re.match(self._VALID_URL, url) | |
299 | display_id = mobj.group('display_id') | |
300 | ||
301 | player_url = mobj.group('mainurl') + '~playerXml.xml' | |
302 | doc = self._download_xml(player_url, display_id) | |
303 | video_node = doc.find('./video') | |
bf0ff932 PH |
304 | upload_date = unified_strdate(xpath_text( |
305 | video_node, './broadcastDate')) | |
306 | thumbnail = xpath_text(video_node, './/teaserImage//variant/url') | |
6d3d3fc0 PH |
307 | |
308 | formats = [] | |
309 | for a in video_node.findall('.//asset'): | |
310 | f = { | |
311 | 'format_id': a.attrib['type'], | |
312 | 'width': int_or_none(a.find('./frameWidth').text), | |
313 | 'height': int_or_none(a.find('./frameHeight').text), | |
314 | 'vbr': int_or_none(a.find('./bitrateVideo').text), | |
315 | 'abr': int_or_none(a.find('./bitrateAudio').text), | |
316 | 'vcodec': a.find('./codecVideo').text, | |
317 | 'tbr': int_or_none(a.find('./totalBitrate').text), | |
318 | } | |
319 | if a.find('./serverPrefix').text: | |
320 | f['url'] = a.find('./serverPrefix').text | |
321 | f['playpath'] = a.find('./fileName').text | |
322 | else: | |
323 | f['url'] = a.find('./fileName').text | |
324 | formats.append(f) | |
325 | self._sort_formats(formats) | |
326 | ||
327 | return { | |
328 | 'id': mobj.group('id'), | |
329 | 'formats': formats, | |
330 | 'display_id': display_id, | |
331 | 'title': video_node.find('./title').text, | |
332 | 'duration': parse_duration(video_node.find('./duration').text), | |
333 | 'upload_date': upload_date, | |
334 | 'thumbnail': thumbnail, | |
335 | } |