]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/ard.py
8ac926c917fbaac4b5c7aa00a2386d72304193f0
[yt-dlp.git] / yt_dlp / extractor / ard.py
1 import json
2 import re
3
4 from .common import InfoExtractor
5 from .generic import GenericIE
6 from ..utils import (
7 determine_ext,
8 ExtractorError,
9 int_or_none,
10 parse_duration,
11 qualities,
12 str_or_none,
13 try_get,
14 unified_strdate,
15 unified_timestamp,
16 update_url,
17 update_url_query,
18 url_or_none,
19 xpath_text,
20 )
21 from ..compat import compat_etree_fromstring
22
23
class ARDMediathekBaseIE(InfoExtractor):
    """Helpers shared by the ARD Mediathek extractors.

    Covers parsing of the ARD "media collection" dict (formats, subtitles and
    a few metadata fields) and a heuristic that derives season/episode
    information from a title.
    """
    _GEO_COUNTRIES = ['DE']

    def _extract_media_info(self, media_info_url, webpage, video_id):
        """Download the media JSON at media_info_url and parse it.

        The FSK flag handed to _parse_media_info is whether the literal text
        '"fsk"' occurs in webpage.
        """
        media_info = self._download_json(
            media_info_url, video_id, 'Downloading media JSON')
        return self._parse_media_info(media_info, video_id, '"fsk"' in webpage)

    def _parse_media_info(self, media_info, video_id, fsk):
        """Build a partial info dict from a media collection dict.

        When no formats are found, the reason is reported through
        raise_no_formats (if fsk is truthy) or raise_geo_restricted (if the
        media info is flagged '_geoblocked').

        Returns a dict with the keys id, duration, thumbnail, is_live,
        formats and subtitles.
        """
        formats = self._extract_formats(media_info, video_id)

        if not formats:
            if fsk:
                self.raise_no_formats(
                    'This video is only available after 20:00', expected=True)
            elif media_info.get('_geoblocked'):
                self.raise_geo_restricted(
                    'This video is not available due to geoblocking',
                    countries=self._GEO_COUNTRIES, metadata_available=True)

        subtitles = {}
        subtitle_url = media_info.get('_subtitleUrl')
        if subtitle_url:
            # The WebVTT variant is derived from the TTML URL by swapping the
            # path component and appending the '.vtt' suffix.
            subtitles['de'] = [{
                'ext': 'ttml',
                'url': subtitle_url,
            }, {
                'ext': 'vtt',
                'url': subtitle_url.replace('/ebutt/', '/webvtt/') + '.vtt',
            }]

        return {
            'id': video_id,
            'duration': int_or_none(media_info.get('_duration')),
            'thumbnail': media_info.get('_previewImage'),
            'is_live': media_info.get('_isLive') is True,
            'formats': formats,
            'subtitles': subtitles,
        }

    def _ARD_extract_episode_info(self, title):
        """Try to extract season/episode data from the title.

        Returns a dict that is empty for a falsy title; otherwise it always
        contains 'episode' and, when one of the patterns matched, also
        'season_number' and 'episode_number' (either may be None).
        """
        res = {}
        if not title:
            return res

        for pattern in [
            # Pattern for title like "Homo sapiens (S06/E07) - Originalversion"
            # from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw
            r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
            # E.g.: title="Fritjof aus Norwegen (2) (AD)"
            # from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/
            r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
            # E.g.: title="Folge 25/42: Symmetrie"
            # from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/
            # E.g.: title="Folge 1063 - Vertrauen"
            # from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/
            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*',
        ]:
            m = re.match(pattern, title)
            if m:
                groupdict = m.groupdict()
                res['season_number'] = int_or_none(groupdict.get('season_number'))
                res['episode_number'] = int_or_none(groupdict.get('episode_number'))
                res['episode'] = str_or_none(groupdict.get('episode'))
                # Build the episode title by removing numeric episode information:
                if groupdict.get('ep_info') and not res['episode']:
                    res['episode'] = str_or_none(
                        title.replace(groupdict.get('ep_info'), ''))
                if res['episode']:
                    res['episode'] = res['episode'].strip()
                # Only the first matching pattern is used.
                break

        # As a fallback use the whole title as the episode name:
        if not res.get('episode'):
            res['episode'] = title.strip()
        return res

    def _extract_formats(self, media_info, video_id):
        """Return the list of formats described by media_info['_mediaArray'].

        HDS (f4m) and HLS (m3u8) manifests are expanded only for streams whose
        '_quality' is 'auto'; every other stream URL becomes a single format,
        using the RTMP server as URL when '_server' starts with 'rtmp'.
        """
        type_ = media_info.get('_type')
        # A key that is present with a null value bypasses dict.get()'s
        # default, so fall back to an empty list explicitly.
        media_array = media_info.get('_mediaArray') or []
        formats = []
        for num, media in enumerate(media_array):
            for stream in media.get('_mediaStreamArray') or []:
                stream_urls = stream.get('_stream')
                if not stream_urls:
                    continue
                if not isinstance(stream_urls, list):
                    stream_urls = [stream_urls]
                quality = stream.get('_quality')
                server = stream.get('_server')
                for stream_url in stream_urls:
                    if not url_or_none(stream_url):
                        continue
                    ext = determine_ext(stream_url)
                    if quality != 'auto' and ext in ('f4m', 'm3u8'):
                        continue
                    if ext == 'f4m':
                        formats.extend(self._extract_f4m_formats(
                            update_url_query(stream_url, {
                                'hdcore': '3.1.1',
                                'plugin': 'aasp-3.1.1.69.124'
                            }), video_id, f4m_id='hds', fatal=False))
                    elif ext == 'm3u8':
                        formats.extend(self._extract_m3u8_formats(
                            stream_url, video_id, 'mp4', 'm3u8_native',
                            m3u8_id='hls', fatal=False))
                    else:
                        if server and server.startswith('rtmp'):
                            f = {
                                'url': server,
                                'play_path': stream_url,
                                'format_id': 'a%s-rtmp-%s' % (num, quality),
                            }
                        else:
                            f = {
                                'url': stream_url,
                                'format_id': 'a%s-%s-%s' % (num, ext, quality)
                            }
                        # Resolution is encoded in file names like '..._960x540.mp4'
                        m = re.search(
                            r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$',
                            stream_url)
                        if m:
                            f.update({
                                'width': int(m.group('width')),
                                'height': int(m.group('height')),
                            })
                        if type_ == 'audio':
                            f['vcodec'] = 'none'
                        formats.append(f)
        return formats
156
157
class ARDMediathekIE(ARDMediathekBaseIE):
    """Extractor for the classic ARD Mediathek pages.

    URLs that ARDBetaMediathekIE accepts are left to that extractor (see
    suitable()).
    """
    IE_NAME = 'ARD:mediathek'
    _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'

    _TESTS = [{
        # available till 26.07.2022
        'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822',
        'info_dict': {
            'id': '44726822',
            'ext': 'mp4',
            'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?',
            'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5',
            'duration': 1740,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        }
    }, {
        'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872',
        'only_matching': True,
    }, {
        # audio
        'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
        'only_matching': True,
    }, {
        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
        'only_matching': True,
    }, {
        # audio
        'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158',
        'only_matching': True,
    }, {
        'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs that ARDBetaMediathekIE can handle; otherwise defer to the base check."""
        return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url)

    def _real_extract(self, url):
        """Extract a single video/audio item from a classic Mediathek page.

        Formats are taken from inline ``mediaCollection.addMediaStream`` calls
        when the page has them, otherwise from the ``/play/media/<id>`` JSON.

        Raises ExtractorError when the page shows a known error notice or
        when no media id can be determined for the JSON request.
        """
        # determine video id from url
        m = self._match_valid_url(url)

        document_id = None

        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            document_id = video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        webpage = self._download_webpage(url, video_id)

        ERRORS = (
            ('>Leider liegt eine Störung vor.', 'Video %s is unavailable'),
            ('>Der gewünschte Beitrag ist nicht mehr verfügbar.<',
             'Video %s is no longer available'),
        )

        for pattern, message in ERRORS:
            if pattern in webpage:
                raise ExtractorError(message % video_id, expected=True)

        # URLs carrying an 'rss' query parameter may return an RSS document
        if re.search(r'[\?&]rss($|[=&])', url):
            doc = compat_etree_fromstring(webpage.encode('utf-8'))
            if doc.tag == 'rss':
                return GenericIE()._extract_rss(url, video_id, doc)

        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
            [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
             r'<meta name="dcterms\.title" content="(.*?)"/>',
             r'<h4 class="headline">(.*?)</h4>',
             r'<title[^>]*>(.*?)</title>'],
            webpage, 'title')
        description = self._og_search_description(webpage, default=None) or self._html_search_meta(
            'dcterms.abstract', webpage, 'description', default=None)
        if description is None:
            description = self._html_search_meta(
                'description', webpage, 'meta description', default=None)
        if description is None:
            description = self._html_search_regex(
                r'<p\s+class="teasertext">(.+?)</p>',
                webpage, 'teaser text', default=None)

        # Thumbnail is sometimes not present.
        # It is in the mobile version, but that seems to use a different URL
        # structure altogether.
        thumbnail = self._og_search_thumbnail(webpage, default=None)

        media_streams = re.findall(r'''(?x)
            mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
            "([^"]+)"''', webpage)

        if media_streams:
            QUALITIES = qualities(['lo', 'hi', 'hq'])
            formats = []
            for furl in set(media_streams):
                if furl.endswith('.f4m'):
                    fid = 'f4m'
                else:
                    # The format id is the second-to-last dot-separated part
                    # of the URL, e.g. 'hi' in '....hi.mp4'
                    fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
                    fid = fid_m.group(1) if fid_m else None
                formats.append({
                    'quality': QUALITIES(fid),
                    'format_id': fid,
                    'url': furl,
                })
            info = {
                'formats': formats,
            }
        else:  # request JSON file
            if not document_id:
                # Without a media id the JSON URL below cannot be built, so a
                # failed lookup has to be fatal rather than yielding None.
                video_id = self._search_regex(
                    (r'/play/(?:config|media|sola)/(\d+)', r'contentId["\']\s*:\s*(\d+)'),
                    webpage, 'media id')
            info = self._extract_media_info(
                'http://www.ardmediathek.de/play/media/%s' % video_id,
                webpage, video_id)

        info.update({
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
        })
        info.update(self._ARD_extract_episode_info(info['title']))

        return info
288
289
class ARDIE(InfoExtractor):
    """Extractor for video pages on daserste.de.

    Metadata and formats are read from the '<page>~playerXml.xml' document
    that belongs to each page.
    """
    _VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html'
    _TESTS = [{
        # available till 7.12.2023
        'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html',
        'md5': '94812e6438488fb923c361a44469614b',
        'info_dict': {
            'id': 'maischberger-video-424',
            'display_id': 'maischberger-video-424',
            'ext': 'mp4',
            'duration': 4452.0,
            'title': 'maischberger am 07.12.2022',
            'upload_date': '20221207',
            'thumbnail': r're:^https?://.*\.jpg$',
        },
    }, {
        'url': 'https://www.daserste.de/information/politik-weltgeschehen/morgenmagazin/videosextern/dominik-kahun-aus-der-nhl-direkt-zur-weltmeisterschaft-100.html',
        'only_matching': True,
    }, {
        'url': 'https://www.daserste.de/information/nachrichten-wetter/tagesthemen/videosextern/tagesthemen-17736.html',
        'only_matching': True,
    }, {
        'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/videos/diversity-tag-sanam-afrashteh100.html',
        'only_matching': True,
    }, {
        'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
        'only_matching': True,
    }, {
        'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/Drehpause-100.html',
        'only_matching': True,
    }, {
        'url': 'https://www.daserste.de/unterhaltung/film/filmmittwoch-im-ersten/videos/making-ofwendezeit-video-100.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the video described by the page's player XML.

        Raises ExtractorError when the player XML has no <video> element or
        no usable <title>.
        """
        mobj = self._match_valid_url(url)
        display_id = mobj.group('id')

        player_url = mobj.group('mainurl') + '~playerXml.xml'
        doc = self._download_xml(player_url, display_id)
        video_node = doc.find('./video')
        if video_node is None:
            # Everything below is read from this node; fail with a clear
            # message instead of an AttributeError further down.
            raise ExtractorError('Unable to find video node in player XML')
        upload_date = unified_strdate(xpath_text(
            video_node, './broadcastDate'))
        thumbnail = xpath_text(video_node, './/teaserImage//variant/url')

        formats = []
        for a in video_node.findall('.//asset'):
            file_name = xpath_text(a, './fileName', default=None)
            if not file_name:
                continue
            format_type = a.attrib.get('type')
            format_url = url_or_none(file_name)
            if format_url:
                ext = determine_ext(file_name)
                if ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        format_url, display_id, 'mp4', entry_protocol='m3u8_native',
                        m3u8_id=format_type or 'hls', fatal=False))
                    continue
                elif ext == 'f4m':
                    formats.extend(self._extract_f4m_formats(
                        update_url_query(format_url, {'hdcore': '3.7.0'}),
                        display_id, f4m_id=format_type or 'hds', fatal=False))
                    continue
            f = {
                'format_id': format_type,
                'width': int_or_none(xpath_text(a, './frameWidth')),
                'height': int_or_none(xpath_text(a, './frameHeight')),
                'vbr': int_or_none(xpath_text(a, './bitrateVideo')),
                'abr': int_or_none(xpath_text(a, './bitrateAudio')),
                'vcodec': xpath_text(a, './codecVideo'),
                'tbr': int_or_none(xpath_text(a, './totalBitrate')),
            }
            server_prefix = xpath_text(a, './serverPrefix', default=None)
            if server_prefix:
                # With a server prefix the file name is a play path on that
                # server; 'play_path' is the key ARDMediathekBaseIE uses for
                # its RTMP formats as well.
                f.update({
                    'url': server_prefix,
                    'play_path': file_name,
                })
            else:
                if not format_url:
                    continue
                f['url'] = format_url
            formats.append(f)

        _SUB_FORMATS = (
            ('./dataTimedText', 'ttml'),
            ('./dataTimedTextNoOffset', 'ttml'),
            ('./dataTimedTextVtt', 'vtt'),
        )

        subtitles = {}
        for subsel, subext in _SUB_FORMATS:
            for node in video_node.findall(subsel):
                sub_url = node.attrib.get('url')
                if not sub_url:
                    # A subtitle node without a URL is of no use; skip it
                    continue
                subtitles.setdefault('de', []).append({
                    'url': sub_url,
                    'ext': subext,
                })

        return {
            'id': xpath_text(video_node, './videoId', default=display_id),
            'formats': formats,
            'subtitles': subtitles,
            'display_id': display_id,
            'title': xpath_text(video_node, './title', 'title', fatal=True),
            'duration': parse_duration(xpath_text(video_node, './duration')),
            'upload_date': upload_date,
            'thumbnail': thumbnail,
        }
400
401
class ARDBetaMediathekIE(ARDMediathekBaseIE):
    """Extractor for ardmediathek.de videos and playlists.

    Single items (player/live/video URLs) are resolved through the
    public-gateway GraphQL endpoint; 'sendung', 'serie' and 'sammlung' URLs
    are returned as playlists of video URLs.
    """
    _VALID_URL = r'''(?x)https://
        (?:(?:beta|www)\.)?ardmediathek\.de/
        (?:(?P<client>[^/]+)/)?
        (?:player|live|video|(?P<playlist>sendung|serie|sammlung))/
        (?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
        (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
        (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''

    _TESTS = [{
        'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
        'md5': 'b6e8ab03f2bcc6e1f9e6cef25fcc03c4',
        'info_dict': {
            'display_id': 'filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen',
            'id': '12939099',
            'title': 'Liebe auf vier Pfoten',
            'description': r're:^Claudia Schmitt, Anwältin in Salzburg',
            'duration': 5222,
            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:aee7cbf8f06de976?w=960&ch=ae4d0f2ee47d8b9b',
            'timestamp': 1701343800,
            'upload_date': '20231130',
            'ext': 'mp4',
            'episode': 'Liebe auf vier Pfoten',
            'series': 'Filme im MDR'
        },
    }, {
        'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
        'md5': 'a1dc75a39c61601b980648f7c9f9f71d',
        'info_dict': {
            'display_id': 'die-robuste-roswita',
            'id': '78566716',
            'title': 'Die robuste Roswita',
            'description': r're:^Der Mord.*totgeglaubte Ehefrau Roswita',
            'duration': 5316,
            'thumbnail': 'https://img.ardmediathek.de/standard/00/78/56/67/84/575672121/16x9/960?mandant=ard',
            'timestamp': 1596658200,
            'upload_date': '20200805',
            'ext': 'mp4',
        },
        'skip': 'Error',
    }, {
        'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
        'md5': '1e73ded21cb79bac065117e80c81dc88',
        'info_dict': {
            'id': '10049223',
            'ext': 'mp4',
            'title': 'tagesschau, 20:00 Uhr',
            'timestamp': 1636398000,
            'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b',
            'upload_date': '20211108',
            'display_id': 'tagesschau-oder-tagesschau-20-00-uhr/das-erste',
            'duration': 915,
            'episode': 'tagesschau, 20:00 Uhr',
            'series': 'tagesschau',
            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49?w=960&ch=ee69108ae344f678',
        },
    }, {
        'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
        'only_matching': True,
    }, {
        'url': 'https://ardmediathek.de/ard/video/saartalk/saartalk-gesellschaftsgift-haltung-gegen-hass/sr-fernsehen/Y3JpZDovL3NyLW9ubGluZS5kZS9TVF84MTY4MA/',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/ard/video/trailer/private-eyes-s01-e01/one/Y3JpZDovL3dkci5kZS9CZWl0cmFnLTE1MTgwYzczLWNiMTEtNGNkMS1iMjUyLTg5MGYzOWQxZmQ1YQ/',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
        'only_matching': True,
    }, {
        # playlist of type 'sendung'
        'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
        'only_matching': True,
    }, {
        # playlist of type 'serie'
        'url': 'https://www.ardmediathek.de/serie/nachtstreife/staffel-1/Y3JpZDovL3N3ci5kZS9zZGIvc3RJZC8xMjQy/1',
        'only_matching': True,
    }, {
        # playlist of type 'sammlung'
        'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet',
        'only_matching': True,
    }]

    def _ARD_load_playlist_snippet(self, playlist_id, display_id, client, mode, page_number):
        """Query the ARD server for one page of playlist information.

        mode must be 'sendung', 'serie' or 'sammlung'. The returned dict has
        the same layout for all modes: it holds the 'pagination' and
        'teasers' entries of the response.
        """
        assert mode in ('sendung', 'serie', 'sammlung')
        if mode in ('sendung', 'serie'):
            graphQL = json.dumps({
                'query': '''{
                    showPage(
                        client: "%s"
                        showId: "%s"
                        pageNumber: %d
                    ) {
                        pagination {
                            pageSize
                            totalElements
                        }
                        teasers { # Array
                            mediumTitle
                            links { target { id href title } }
                            type
                        }
                    }}''' % (client, playlist_id, page_number),
            }).encode()
        else:  # mode == 'sammlung'
            graphQL = json.dumps({
                'query': '''{
                    morePage(
                        client: "%s"
                        compilationId: "%s"
                        pageNumber: %d
                    ) {
                        widget {
                            pagination {
                                pageSize
                                totalElements
                            }
                            teasers { # Array
                                mediumTitle
                                links { target { id href title } }
                                type
                            }
                        }
                    }}''' % (client, playlist_id, page_number),
            }).encode()
        # Resources for ARD graphQL debugging:
        # https://api-test.ardmediathek.de/public-gateway
        show_page = self._download_json(
            'https://api.ardmediathek.de/public-gateway',
            '[Playlist] %s' % display_id,
            data=graphQL,
            headers={'Content-Type': 'application/json'})['data']
        # align the structure of the returned data:
        if mode in ('sendung', 'serie'):
            show_page = show_page['showPage']
        else:  # mode == 'sammlung'
            show_page = show_page['morePage']['widget']
        return show_page

    def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
        """Collect all playlist entries and return them as a playlist result.

        Supports playlists of mode 'sendung', 'serie', and 'sammlung',
        as well as nested playlists. Pages are requested until the reported
        total number of elements is covered or a page comes back empty.
        """
        entries = []
        page_number = 0
        while True:  # iterate by page number
            show_page = self._ARD_load_playlist_snippet(
                playlist_id, display_id, client, mode, page_number)
            teasers = show_page['teasers'] or []
            for teaser in teasers:  # process playlist items
                target = teaser['links']['target']
                if '/compilation/' in target['href']:
                    # alternative condition: teaser['type'] == "compilation"
                    # => This is a nested compilation, e.g. like:
                    # https://www.ardmediathek.de/ard/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2/
                    link_mode = 'sammlung'
                else:
                    link_mode = 'video'

                # perform HTML quoting of episode title similar to ARD:
                title_slug = (
                    target['title'].lower()
                    .replace('ä', 'ae').replace('ö', 'oe')
                    .replace('ü', 'ue').replace('ß', 'ss'))
                # replace special chars by -
                title_slug = re.sub('[^a-zA-Z0-9]+', '-', title_slug)
                # remove '-' from begin/end
                title_slug = re.sub('^-|-$', '', title_slug)

                item_url = 'https://www.ardmediathek.de/%s/%s/%s/%s/%s' % (
                    client, link_mode, display_id, title_slug, target['id'])
                entries.append(self.url_result(
                    item_url,
                    ie=ARDBetaMediathekIE.ie_key()))

            if not teasers:
                # An empty page means there is nothing more to fetch; without
                # this check a reported page size of 0 would loop forever.
                break
            pagination = show_page['pagination']
            if pagination['pageSize'] * (page_number + 1) >= pagination['totalElements']:
                # we've processed enough pages to get all playlist entries
                break
            page_number += 1

        return self.playlist_result(entries, playlist_id, playlist_title=display_id)

    def _real_extract(self, url):
        """Extract a single item, or delegate to playlist extraction.

        For single items the player page is fetched via GraphQL; if it has no
        media collection but a content id, the '/play/media/<id>' JSON is
        tried as a non-fatal fallback.
        """
        video_id, display_id, playlist_type, client, season_number = self._match_valid_url(url).group(
            'id', 'display_id', 'playlist', 'client', 'season')
        display_id, client = display_id or video_id, client or 'ard'

        if playlist_type:
            # TODO: Extract only specified season
            return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type)

        player_page = self._download_json(
            'https://api.ardmediathek.de/public-gateway',
            display_id, data=json.dumps({
                'query': '''{
  playerPage(client:"%s", clipId: "%s") {
    blockedByFsk
    broadcastedOn
    maturityContentRating
    mediaCollection {
      _duration
      _geoblocked
      _isLive
      _mediaArray {
        _mediaStreamArray {
          _quality
          _server
          _stream
        }
      }
      _previewImage
      _subtitleUrl
      _type
    }
    show {
      title
    }
    image {
      src
    }
    synopsis
    title
    tracking {
      atiCustomVars {
        contentId
      }
    }
  }
}''' % (client, video_id),
            }).encode(), headers={
                'Content-Type': 'application/json'
            })['data']['playerPage']
        title = player_page['title']
        content_id = str_or_none(try_get(
            player_page, lambda x: x['tracking']['atiCustomVars']['contentId']))
        media_collection = player_page.get('mediaCollection') or {}
        if not media_collection and content_id:
            media_collection = self._download_json(
                'https://www.ardmediathek.de/play/media/' + content_id,
                content_id, fatal=False) or {}
        info = self._parse_media_info(
            media_collection, content_id or video_id,
            player_page.get('blockedByFsk'))
        age_limit = None
        description = player_page.get('synopsis')
        maturity_content_rating = player_page.get('maturityContentRating')
        if maturity_content_rating:
            # lstrip removes any leading 'F', 'S', 'K' characters, leaving
            # the numeric part of ratings such as 'FSK16'
            age_limit = int_or_none(maturity_content_rating.lstrip('FSK'))
        if not age_limit and description:
            # Fall back to a trailing "(FSK <n>)" note in the description
            age_limit = int_or_none(self._search_regex(
                r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None))
        info.update({
            'age_limit': age_limit,
            'display_id': display_id,
            'title': title,
            'description': description,
            'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
            'series': try_get(player_page, lambda x: x['show']['title']),
            # The HTML page is only downloaded when neither the media
            # collection nor the player page provides an image.
            'thumbnail': (media_collection.get('_previewImage')
                          or try_get(player_page, lambda x: update_url(x['image']['src'], query=None, fragment=None))
                          or self.get_thumbnail_from_html(display_id, url)),
        })
        info.update(self._ARD_extract_episode_info(info['title']))
        return info

    def get_thumbnail_from_html(self, display_id, url):
        """Return a thumbnail URL found in the HTML page at url, or None.

        The download is non-fatal; a failed download is treated like an
        empty page.
        """
        webpage = self._download_webpage(url, display_id, fatal=False) or ''
        return (
            self._og_search_thumbnail(webpage, default=None)
            or self._html_search_meta('thumbnailUrl', webpage, default=None))