]>
Commit | Line | Data |
---|---|---|
f9b85496 PH |
1 | # coding: utf-8 |
2 | from __future__ import unicode_literals | |
3 | ||
c968f738 | 4 | import json |
d5822b96 PH |
5 | import re |
6 | ||
7 | from .common import InfoExtractor | |
3741302a | 8 | from .generic import GenericIE |
d5822b96 | 9 | from ..utils import ( |
f9b85496 | 10 | determine_ext, |
d5822b96 | 11 | ExtractorError, |
6d3d3fc0 PH |
12 | int_or_none, |
13 | parse_duration, | |
75258218 S |
14 | qualities, |
15 | str_or_none, | |
16 | try_get, | |
6d3d3fc0 | 17 | unified_strdate, |
75258218 | 18 | unified_timestamp, |
31eeab9f | 19 | update_url_query, |
3052a30d | 20 | url_or_none, |
75258218 | 21 | xpath_text, |
d5822b96 | 22 | ) |
f7854627 | 23 | from ..compat import compat_etree_fromstring |
d5822b96 | 24 | |
f9b85496 | 25 | |
c968f738 RA |
class ARDMediathekBaseIE(InfoExtractor):
    """Shared helpers for extractors that consume ARD "media info" JSON.

    The media info object is a dict using underscore-prefixed keys such as
    ``_mediaArray``, ``_duration``, ``_previewImage``, ``_isLive``,
    ``_geoblocked``, ``_subtitleUrl`` and ``_type``.
    """
    _GEO_COUNTRIES = ['DE']

    def _extract_media_info(self, media_info_url, webpage, video_id):
        """Download the media info JSON from media_info_url and parse it.

        The FSK flag handed to _parse_media_info is simply whether the
        literal text '"fsk"' occurs anywhere in webpage.
        """
        media_info = self._download_json(
            media_info_url, video_id, 'Downloading media JSON')
        return self._parse_media_info(media_info, video_id, '"fsk"' in webpage)

    def _parse_media_info(self, media_info, video_id, fsk):
        """Turn a media info dict into a partial info dict.

        Returns a dict with the keys id, duration, thumbnail, is_live,
        formats and subtitles.

        When no formats could be extracted, an ExtractorError is raised if
        fsk is truthy, otherwise raise_geo_restricted() is called if the
        media info carries a truthy '_geoblocked' value.
        """
        formats = self._extract_formats(media_info, video_id)

        if not formats:
            if fsk:
                raise ExtractorError(
                    'This video is only available after 20:00', expected=True)
            elif media_info.get('_geoblocked'):
                self.raise_geo_restricted(
                    'This video is not available due to geoblocking',
                    countries=self._GEO_COUNTRIES)

        self._sort_formats(formats)

        subtitles = {}
        subtitle_url = media_info.get('_subtitleUrl')
        if subtitle_url:
            subtitles['de'] = [{
                'ext': 'ttml',
                'url': subtitle_url,
            }]

        return {
            'id': video_id,
            'duration': int_or_none(media_info.get('_duration')),
            'thumbnail': media_info.get('_previewImage'),
            'is_live': media_info.get('_isLive') is True,
            'formats': formats,
            'subtitles': subtitles,
        }

    def _extract_formats(self, media_info, video_id):
        """Collect format dicts from media_info['_mediaArray'].

        Returns a (possibly empty) list; the caller is responsible for
        sorting it.
        """
        type_ = media_info.get('_type')
        # ``or []`` rather than a .get() default: the key may be present
        # with an explicit null value, which must not break the loops.
        media_array = media_info.get('_mediaArray') or []
        formats = []
        for num, media in enumerate(media_array):
            if not isinstance(media, dict):
                continue
            for stream in media.get('_mediaStreamArray') or []:
                if not isinstance(stream, dict):
                    continue
                stream_urls = stream.get('_stream')
                if not stream_urls:
                    continue
                # '_stream' is either a single URL or a list of URLs
                if not isinstance(stream_urls, list):
                    stream_urls = [stream_urls]
                quality = stream.get('_quality')
                server = stream.get('_server')
                for stream_url in stream_urls:
                    if not url_or_none(stream_url):
                        continue
                    ext = determine_ext(stream_url)
                    # Manifests are only followed for the 'auto' quality entry
                    if quality != 'auto' and ext in ('f4m', 'm3u8'):
                        continue
                    if ext == 'f4m':
                        formats.extend(self._extract_f4m_formats(
                            update_url_query(stream_url, {
                                'hdcore': '3.1.1',
                                'plugin': 'aasp-3.1.1.69.124'
                            }), video_id, f4m_id='hds', fatal=False))
                    elif ext == 'm3u8':
                        formats.extend(self._extract_m3u8_formats(
                            stream_url, video_id, 'mp4', 'm3u8_native',
                            m3u8_id='hls', fatal=False))
                    else:
                        if server and server.startswith('rtmp'):
                            f = {
                                'url': server,
                                'play_path': stream_url,
                                'format_id': 'a%s-rtmp-%s' % (num, quality),
                            }
                        else:
                            f = {
                                'url': stream_url,
                                'format_id': 'a%s-%s-%s' % (num, ext, quality)
                            }
                        # Resolution is encoded in file names ending in
                        # _<width>x<height>.mp4
                        m = re.search(
                            r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$',
                            stream_url)
                        if m:
                            f.update({
                                'width': int(m.group('width')),
                                'height': int(m.group('height')),
                            })
                        if type_ == 'audio':
                            f['vcodec'] = 'none'
                        formats.append(f)
        return formats
118 | ||
c968f738 RA |
119 | |
class ARDMediathekIE(ARDMediathekBaseIE):
    """Extractor for classic ARD Mediathek pages (see _VALID_URL)."""
    IE_NAME = 'ARD:mediathek'
    _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'

    _TESTS = [{
        # available till 26.07.2022
        'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822',
        'info_dict': {
            'id': '44726822',
            'ext': 'mp4',
            'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?',
            'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5',
            'duration': 1740,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        }
    }, {
        'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872',
        'only_matching': True,
    }, {
        # audio
        'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
        'only_matching': True,
    }, {
        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
        'only_matching': True,
    }, {
        # audio
        'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158',
        'only_matching': True,
    }, {
        'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # URLs that ARDBetaMediathekIE accepts are left to that extractor
        return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url)

    def _real_extract(self, url):
        # determine video id from url: a documentId query parameter wins
        # over the id captured by _VALID_URL
        m = re.match(self._VALID_URL, url)

        document_id = None

        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            document_id = video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # (marker text searched in the page, error message template)
        ERRORS = (
            ('>Leider liegt eine Störung vor.', 'Video %s is unavailable'),
            ('>Der gewünschte Beitrag ist nicht mehr verfügbar.<',
             'Video %s is no longer available'),
        )

        for pattern, message in ERRORS:
            if pattern in webpage:
                raise ExtractorError(message % video_id, expected=True)

        # URLs carrying an "rss" query parameter are parsed as XML and, if
        # the root element is <rss>, delegated to the generic RSS handling
        if re.search(r'[\?&]rss($|[=&])', url):
            doc = compat_etree_fromstring(webpage.encode('utf-8'))
            if doc.tag == 'rss':
                return GenericIE()._extract_rss(url, video_id, doc)

        title = self._html_search_regex(
            [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
             r'<meta name="dcterms\.title" content="(.*?)"/>',
             r'<h4 class="headline">(.*?)</h4>',
             r'<title[^>]*>(.*?)</title>'],
            webpage, 'title')
        # Description sources, in order of preference
        description = self._html_search_meta(
            'dcterms.abstract', webpage, 'description', default=None)
        if description is None:
            description = self._html_search_meta(
                'description', webpage, 'meta description', default=None)
        if description is None:
            description = self._html_search_regex(
                r'<p\s+class="teasertext">(.+?)</p>',
                webpage, 'teaser text', default=None)

        # Thumbnail is sometimes not present.
        # It is in the mobile version, but that seems to use a different URL
        # structure altogether.
        thumbnail = self._og_search_thumbnail(webpage, default=None)

        media_streams = re.findall(r'''(?x)
            mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
            "([^"]+)"''', webpage)

        if media_streams:
            QUALITIES = qualities(['lo', 'hi', 'hq'])
            formats = []
            for furl in set(media_streams):
                if furl.endswith('.f4m'):
                    fid = 'f4m'
                else:
                    # format id is the second-to-last dot-separated part
                    fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
                    fid = fid_m.group(1) if fid_m else None
                formats.append({
                    'quality': QUALITIES(fid),
                    'format_id': fid,
                    'url': furl,
                })
            self._sort_formats(formats)
            info = {
                'formats': formats,
            }
        else:  # request JSON file
            if not document_id:
                video_id = self._search_regex(
                    r'/play/(?:config|media)/(\d+)', webpage, 'media id')
            info = self._extract_media_info(
                'http://www.ardmediathek.de/play/media/%s' % video_id,
                webpage, video_id)

        info.update({
            'id': video_id,
            'title': self._live_title(title) if info.get('is_live') else title,
            'description': description,
            # The Open Graph thumbnail may be None; in that case keep the
            # thumbnail that the media JSON branch already put into info
            # instead of overwriting it with None.
            'thumbnail': thumbnail or info.get('thumbnail'),
        })

        return info
6d3d3fc0 PH |
249 | |
250 | ||
class ARDIE(InfoExtractor):
    """Extractor for daserste.de video pages (see _VALID_URL).

    Metadata and formats are read from the "<page url>~playerXml.xml"
    document that belongs to the page.
    """
    _VALID_URL = r'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
    _TESTS = [{
        # available till 14.02.2019
        'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html',
        'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49',
        'info_dict': {
            'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video',
            'id': '102',
            'ext': 'mp4',
            'duration': 4435.0,
            'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?',
            'upload_date': '20180214',
            'thumbnail': r're:^https?://.*\.jpg$',
        },
    }, {
        'url': 'https://www.daserste.de/information/reportage-dokumentation/erlebnis-erde/videosextern/woelfe-und-herdenschutzhunde-ungleiche-brueder-102.html',
        'only_matching': True,
    }, {
        'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        display_id = mobj.group('display_id')

        player_url = mobj.group('mainurl') + '~playerXml.xml'
        doc = self._download_xml(player_url, display_id)
        video_node = doc.find('./video')
        if video_node is None:
            # Fail with a descriptive error instead of an AttributeError
            # further down
            raise ExtractorError(
                'Unable to find video element in player XML')
        upload_date = unified_strdate(xpath_text(
            video_node, './broadcastDate'))
        thumbnail = xpath_text(video_node, './/teaserImage//variant/url')

        formats = []
        for a in video_node.findall('.//asset'):
            # xpath_text yields None for a missing child element, so an
            # asset lacking optional metadata no longer aborts extraction
            file_name = xpath_text(a, './fileName')
            if not file_name:
                continue
            f = {
                'format_id': a.attrib['type'],
                'width': int_or_none(xpath_text(a, './frameWidth')),
                'height': int_or_none(xpath_text(a, './frameHeight')),
                'vbr': int_or_none(xpath_text(a, './bitrateVideo')),
                'abr': int_or_none(xpath_text(a, './bitrateAudio')),
                'vcodec': xpath_text(a, './codecVideo'),
                'tbr': int_or_none(xpath_text(a, './totalBitrate')),
            }
            server_prefix = xpath_text(a, './serverPrefix')
            if server_prefix:
                f['url'] = server_prefix
                # 'play_path' is the key name used for the same purpose by
                # ARDMediathekBaseIE._extract_formats; the former
                # 'playpath' spelling did not match it
                f['play_path'] = file_name
            else:
                f['url'] = file_name
            formats.append(f)
        self._sort_formats(formats)

        return {
            'id': mobj.group('id'),
            'formats': formats,
            'display_id': display_id,
            'title': xpath_text(video_node, './title', 'title', fatal=True),
            'duration': parse_duration(xpath_text(video_node, './duration')),
            'upload_date': upload_date,
            'thumbnail': thumbnail,
        }
c1a37eb2 PH |
313 | |
314 | ||
class ARDBetaMediathekIE(ARDMediathekBaseIE):
    """Extractor for ardmediathek.de player/live/video pages.

    Metadata is fetched with a GraphQL query against the public gateway;
    the embedded mediaCollection is parsed with the base class helpers.
    """
    _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?:player|live|video)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)'
    _TESTS = [{
        'url': 'https://ardmediathek.de/ard/video/die-robuste-roswita/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
        'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f',
        'info_dict': {
            'display_id': 'die-robuste-roswita',
            'id': '70153354',
            'title': 'Die robuste Roswita',
            'description': r're:^Der Mord.*trüber ist als die Ilm.',
            'duration': 5316,
            'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard',
            'timestamp': 1577047500,
            'upload_date': '20191222',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
        'only_matching': True,
    }, {
        'url': 'https://ardmediathek.de/ard/video/saartalk/saartalk-gesellschaftsgift-haltung-gegen-hass/sr-fernsehen/Y3JpZDovL3NyLW9ubGluZS5kZS9TVF84MTY4MA/',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/ard/video/trailer/private-eyes-s01-e01/one/Y3JpZDovL3dkci5kZS9CZWl0cmFnLTE1MTgwYzczLWNiMTEtNGNkMS1iMjUyLTg5MGYzOWQxZmQ1YQ/',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
        'only_matching': True,
    }]

    # GraphQL query template; the two %s placeholders take the client and
    # clip id as already quoted and escaped string literals.
    _PLAYER_PAGE_QUERY = '''{
  playerPage(client:%s, clipId: %s) {
    blockedByFsk
    broadcastedOn
    maturityContentRating
    mediaCollection {
      _duration
      _geoblocked
      _isLive
      _mediaArray {
        _mediaStreamArray {
          _quality
          _server
          _stream
        }
      }
      _previewImage
      _subtitleUrl
      _type
    }
    show {
      title
    }
    synopsis
    title
    tracking {
      atiCustomVars {
        contentId
      }
    }
  }
}'''

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('video_id')
        # The optional path between the page type and the id serves as
        # display id (without its trailing slash); fall back to the id
        display_id = (mobj.group('display_id') or '').rstrip('/') or video_id

        # The client part of the URL may contain arbitrary characters
        # (anything but '/'). json.dumps() quotes and escapes both values so
        # that they cannot terminate the string literal inside the query;
        # for plain values the result equals the former '"%s"' formatting.
        query = self._PLAYER_PAGE_QUERY % (
            json.dumps(mobj.group('client')), json.dumps(video_id))

        response = self._download_json(
            'https://api.ardmediathek.de/public-gateway',
            display_id, data=json.dumps({
                'query': query,
            }).encode('utf-8'), headers={
                'Content-Type': 'application/json'
            })
        # Report a missing or null playerPage as an extractor error rather
        # than failing with a KeyError/TypeError
        player_page = try_get(
            response, lambda x: x['data']['playerPage'], dict)
        if not player_page:
            raise ExtractorError(
                'Unable to retrieve player page for %s' % video_id)

        title = player_page['title']
        content_id = str_or_none(try_get(
            player_page, lambda x: x['tracking']['atiCustomVars']['contentId']))
        media_collection = player_page.get('mediaCollection') or {}
        if not media_collection and content_id:
            # No embedded media collection: try the play/media JSON endpoint
            media_collection = self._download_json(
                'https://www.ardmediathek.de/play/media/' + content_id,
                content_id, fatal=False) or {}
        info = self._parse_media_info(
            media_collection, content_id or video_id,
            player_page.get('blockedByFsk'))
        age_limit = None
        description = player_page.get('synopsis')
        maturity_content_rating = player_page.get('maturityContentRating')
        if maturity_content_rating:
            # NOTE: lstrip() removes any leading run of the characters
            # 'F', 'S' and 'K' (a character set, not a prefix)
            age_limit = int_or_none(maturity_content_rating.lstrip('FSK'))
        if not age_limit and description:
            # Fall back to a trailing "(FSK <n>)" marker in the synopsis
            age_limit = int_or_none(self._search_regex(
                r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None))
        info.update({
            'age_limit': age_limit,
            'display_id': display_id,
            'title': title,
            'description': description,
            'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
            'series': try_get(player_page, lambda x: x['show']['title']),
        })
        return info