]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/ard.py
Update to ytdl-commit-be008e6 (#8836)
[yt-dlp.git] / yt_dlp / extractor / ard.py
CommitLineData
c968f738 1import json
d5822b96
PH
2import re
3
4from .common import InfoExtractor
3741302a 5from .generic import GenericIE
d5822b96 6from ..utils import (
f9b85496 7 determine_ext,
d5822b96 8 ExtractorError,
6d3d3fc0
PH
9 int_or_none,
10 parse_duration,
75258218
S
11 qualities,
12 str_or_none,
13 try_get,
6d3d3fc0 14 unified_strdate,
75258218 15 unified_timestamp,
f78eb41e 16 update_url,
31eeab9f 17 update_url_query,
3052a30d 18 url_or_none,
75258218 19 xpath_text,
d5822b96 20)
f7854627 21from ..compat import compat_etree_fromstring
d5822b96 22
f9b85496 23
c968f738
RA
class ARDMediathekBaseIE(InfoExtractor):
    """Shared media-JSON handling for the ARD Mediathek extractors."""

    _GEO_COUNTRIES = ['DE']

    def _extract_media_info(self, media_info_url, webpage, video_id):
        """Download the media JSON from *media_info_url* and parse it.

        The FSK flag passed on to `_parse_media_info` is true when the
        literal text `"fsk"` occurs in *webpage*.
        """
        media_info = self._download_json(
            media_info_url, video_id, 'Downloading media JSON')
        return self._parse_media_info(media_info, video_id, '"fsk"' in webpage)

    def _parse_media_info(self, media_info, video_id, fsk):
        """Build a partial info dict (formats, subtitles, basic metadata).

        When no formats are found, the reason is reported through
        `raise_no_formats` (if *fsk* is truthy) or `raise_geo_restricted`
        (if the media JSON carries a truthy `_geoblocked`).
        """
        formats = self._extract_formats(media_info, video_id)

        if not formats:
            if fsk:
                self.raise_no_formats(
                    'This video is only available after 20:00', expected=True)
            elif media_info.get('_geoblocked'):
                self.raise_geo_restricted(
                    'This video is not available due to geoblocking',
                    countries=self._GEO_COUNTRIES, metadata_available=True)

        subtitles = {}
        subtitle_url = media_info.get('_subtitleUrl')
        if subtitle_url:
            # The WebVTT URL is derived from the TTML one by swapping the
            # path component and appending the '.vtt' suffix.
            subtitles['de'] = [{
                'ext': 'ttml',
                'url': subtitle_url,
            }, {
                'ext': 'vtt',
                'url': subtitle_url.replace('/ebutt/', '/webvtt/') + '.vtt',
            }]

        return {
            'id': video_id,
            'duration': int_or_none(media_info.get('_duration')),
            'thumbnail': media_info.get('_previewImage'),
            'is_live': media_info.get('_isLive') is True,
            'formats': formats,
            'subtitles': subtitles,
        }

    def _ARD_extract_episode_info(self, title):
        """Try to extract season/episode data from the title.

        Returns a dict that always contains 'episode' when *title* is
        non-empty; 'season_number' and 'episode_number' are set (possibly to
        None) only when one of the known title patterns matched.
        """
        res = {}
        if not title:
            return res

        for pattern in [
            # Pattern for title like "Homo sapiens (S06/E07) - Originalversion"
            # from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw
            r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
            # E.g.: title="Fritjof aus Norwegen (2) (AD)"
            # from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/
            r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
            # E.g.: title="Folge 25/42: Symmetrie"
            # from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/
            # E.g.: title="Folge 1063 - Vertrauen"
            # from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/
            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*',
        ]:
            m = re.match(pattern, title)
            if m:
                groupdict = m.groupdict()
                res['season_number'] = int_or_none(groupdict.get('season_number'))
                res['episode_number'] = int_or_none(groupdict.get('episode_number'))
                res['episode'] = str_or_none(groupdict.get('episode'))
                # Build the episode title by removing numeric episode information:
                if groupdict.get('ep_info') and not res['episode']:
                    res['episode'] = str_or_none(
                        title.replace(groupdict.get('ep_info'), ''))
                if res['episode']:
                    res['episode'] = res['episode'].strip()
                # Only the first matching pattern is used.
                break

        # As a fallback use the whole title as the episode name:
        if not res.get('episode'):
            res['episode'] = title.strip()
        return res

    def _extract_formats(self, media_info, video_id):
        """Collect format dicts from the `_mediaArray` of *media_info*.

        HDS (f4m) and HLS (m3u8) manifests are only expanded for streams whose
        `_quality` is 'auto'; manifest URLs of any other quality are skipped.
        All remaining URLs become single formats (RTMP when `_server` starts
        with 'rtmp', direct URL otherwise).
        """
        type_ = media_info.get('_type')
        # `or []` (rather than a .get() default) also covers keys that are
        # present but explicitly null in the JSON.
        media_array = media_info.get('_mediaArray') or []
        formats = []
        for num, media in enumerate(media_array):
            for stream in media.get('_mediaStreamArray') or []:
                stream_urls = stream.get('_stream')
                if not stream_urls:
                    continue
                # `_stream` may be a single URL or a list of URLs.
                if not isinstance(stream_urls, list):
                    stream_urls = [stream_urls]
                quality = stream.get('_quality')
                server = stream.get('_server')
                for stream_url in stream_urls:
                    if not url_or_none(stream_url):
                        continue
                    ext = determine_ext(stream_url)
                    if quality != 'auto' and ext in ('f4m', 'm3u8'):
                        continue
                    if ext == 'f4m':
                        formats.extend(self._extract_f4m_formats(
                            update_url_query(stream_url, {
                                'hdcore': '3.1.1',
                                'plugin': 'aasp-3.1.1.69.124'
                            }), video_id, f4m_id='hds', fatal=False))
                    elif ext == 'm3u8':
                        formats.extend(self._extract_m3u8_formats(
                            stream_url, video_id, 'mp4', 'm3u8_native',
                            m3u8_id='hls', fatal=False))
                    else:
                        if server and server.startswith('rtmp'):
                            f = {
                                'url': server,
                                'play_path': stream_url,
                                'format_id': 'a%s-rtmp-%s' % (num, quality),
                            }
                        else:
                            f = {
                                'url': stream_url,
                                'format_id': 'a%s-%s-%s' % (num, ext, quality)
                            }
                        # Resolution is encoded in file names ending in
                        # "_<width>x<height>.mp4".
                        m = re.search(
                            r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$',
                            stream_url)
                        if m:
                            f.update({
                                'width': int(m.group('width')),
                                'height': int(m.group('height')),
                            })
                        if type_ == 'audio':
                            f['vcodec'] = 'none'
                        formats.append(f)
        return formats
156
c968f738
RA
157
class ARDMediathekIE(ARDMediathekBaseIE):
    """Extractor for classic ARD Mediathek pages (ardmediathek.de and mirrors)."""

    IE_NAME = 'ARD:mediathek'
    _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'

    _TESTS = [{
        # available till 26.07.2022
        'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822',
        'info_dict': {
            'id': '44726822',
            'ext': 'mp4',
            'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?',
            'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5',
            'duration': 1740,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        }
    }, {
        'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872',
        'only_matching': True,
    }, {
        # audio
        'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
        'only_matching': True,
    }, {
        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
        'only_matching': True,
    }, {
        # audio
        'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158',
        'only_matching': True,
    }, {
        'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs that ARDBetaMediathekIE accepts; otherwise defer to the default check."""
        return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url)

    def _real_extract(self, url):
        """Extract a single video/audio item from a classic Mediathek page.

        Formats come either from `mediaCollection.addMediaStream(...)` calls
        embedded in the page or, when there are none, from the media JSON at
        `/play/media/<id>`.
        """
        # determine video id from url
        m = self._match_valid_url(url)

        document_id = None

        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            document_id = video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        webpage = self._download_webpage(url, video_id)

        ERRORS = (
            ('>Leider liegt eine Störung vor.', 'Video %s is unavailable'),
            ('>Der gewünschte Beitrag ist nicht mehr verfügbar.<',
             'Video %s is no longer available'),
        )

        for pattern, message in ERRORS:
            if pattern in webpage:
                raise ExtractorError(message % video_id, expected=True)

        # URLs carrying an "rss" query parameter are handed to the generic
        # RSS extraction when the document root really is <rss>.
        if re.search(r'[\?&]rss($|[=&])', url):
            doc = compat_etree_fromstring(webpage.encode('utf-8'))
            if doc.tag == 'rss':
                return GenericIE()._extract_rss(url, video_id, doc)

        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
            [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
             r'<meta name="dcterms\.title" content="(.*?)"/>',
             r'<h4 class="headline">(.*?)</h4>',
             r'<title[^>]*>(.*?)</title>'],
            webpage, 'title')
        description = self._og_search_description(webpage, default=None) or self._html_search_meta(
            'dcterms.abstract', webpage, 'description', default=None)
        if description is None:
            description = self._html_search_meta(
                'description', webpage, 'meta description', default=None)
        if description is None:
            description = self._html_search_regex(
                r'<p\s+class="teasertext">(.+?)</p>',
                webpage, 'teaser text', default=None)

        # Thumbnail is sometimes not present.
        # It is in the mobile version, but that seems to use a different URL
        # structure altogether.
        thumbnail = self._og_search_thumbnail(webpage, default=None)

        media_streams = re.findall(r'''(?x)
            mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
            "([^"]+)"''', webpage)

        if media_streams:
            QUALITIES = qualities(['lo', 'hi', 'hq'])
            formats = []
            for furl in set(media_streams):
                if furl.endswith('.f4m'):
                    fid = 'f4m'
                else:
                    # The format id is the second-to-last dot-separated
                    # component of the URL (e.g. "....hq.mp4" -> "hq").
                    fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
                    fid = fid_m.group(1) if fid_m else None
                formats.append({
                    'quality': QUALITIES(fid),
                    'format_id': fid,
                    'url': furl,
                })
            info = {
                'formats': formats,
            }
        else:  # request JSON file
            if not document_id:
                # Prefer the media id embedded in the page, but keep the id
                # taken from the URL when the page does not expose one, so
                # that we never request ".../play/media/None".
                video_id = self._search_regex(
                    (r'/play/(?:config|media|sola)/(\d+)', r'contentId["\']\s*:\s*(\d+)'),
                    webpage, 'media id', default=None) or video_id
            info = self._extract_media_info(
                'http://www.ardmediathek.de/play/media/%s' % video_id,
                webpage, video_id)

        info.update({
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
        })
        info.update(self._ARD_extract_episode_info(info['title']))

        return info
6d3d3fc0
PH
288
289
class ARDIE(InfoExtractor):
    """Extractor for daserste.de pages, based on their `~playerXml.xml` document."""

    _VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html'
    _TESTS = [{
        # available till 7.12.2023
        'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html',
        'md5': '94812e6438488fb923c361a44469614b',
        'info_dict': {
            'id': 'maischberger-video-424',
            'display_id': 'maischberger-video-424',
            'ext': 'mp4',
            'duration': 4452.0,
            'title': 'maischberger am 07.12.2022',
            'upload_date': '20221207',
            'thumbnail': r're:^https?://.*\.jpg$',
        },
    }, {
        'url': 'https://www.daserste.de/information/politik-weltgeschehen/morgenmagazin/videosextern/dominik-kahun-aus-der-nhl-direkt-zur-weltmeisterschaft-100.html',
        'only_matching': True,
    }, {
        'url': 'https://www.daserste.de/information/nachrichten-wetter/tagesthemen/videosextern/tagesthemen-17736.html',
        'only_matching': True,
    }, {
        'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/videos/diversity-tag-sanam-afrashteh100.html',
        'only_matching': True,
    }, {
        'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
        'only_matching': True,
    }, {
        'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/Drehpause-100.html',
        'only_matching': True,
    }, {
        'url': 'https://www.daserste.de/unterhaltung/film/filmmittwoch-im-ersten/videos/making-ofwendezeit-video-100.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract formats, subtitles and metadata from the player XML.

        The player XML URL is the page URL with '.html' replaced by
        '~playerXml.xml'.
        """
        mobj = self._match_valid_url(url)
        display_id = mobj.group('id')

        player_url = mobj.group('mainurl') + '~playerXml.xml'
        doc = self._download_xml(player_url, display_id)
        video_node = doc.find('./video')
        if video_node is None:
            # Element.find() returns None for a missing child; fail with a
            # readable error instead of an AttributeError further down.
            raise ExtractorError('Unable to find video element in player XML')
        upload_date = unified_strdate(xpath_text(
            video_node, './broadcastDate'))
        thumbnail = xpath_text(video_node, './/teaserImage//variant/url')

        formats = []
        for a in video_node.findall('.//asset'):
            file_name = xpath_text(a, './fileName', default=None)
            if not file_name:
                continue
            format_type = a.attrib.get('type')
            format_url = url_or_none(file_name)
            if format_url:
                ext = determine_ext(file_name)
                if ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        format_url, display_id, 'mp4', entry_protocol='m3u8_native',
                        m3u8_id=format_type or 'hls', fatal=False))
                    continue
                elif ext == 'f4m':
                    formats.extend(self._extract_f4m_formats(
                        update_url_query(format_url, {'hdcore': '3.7.0'}),
                        display_id, f4m_id=format_type or 'hds', fatal=False))
                    continue
            f = {
                'format_id': format_type,
                'width': int_or_none(xpath_text(a, './frameWidth')),
                'height': int_or_none(xpath_text(a, './frameHeight')),
                'vbr': int_or_none(xpath_text(a, './bitrateVideo')),
                'abr': int_or_none(xpath_text(a, './bitrateAudio')),
                'vcodec': xpath_text(a, './codecVideo'),
                'tbr': int_or_none(xpath_text(a, './totalBitrate')),
            }
            server_prefix = xpath_text(a, './serverPrefix', default=None)
            if server_prefix:
                # Same key as the RTMP formats built by ARDMediathekBaseIE
                # ('play_path'); the previous 'playpath' spelling did not
                # match it.
                f.update({
                    'url': server_prefix,
                    'play_path': file_name,
                })
            else:
                if not format_url:
                    continue
                f['url'] = format_url
            formats.append(f)

        _SUB_FORMATS = (
            ('./dataTimedText', 'ttml'),
            ('./dataTimedTextNoOffset', 'ttml'),
            ('./dataTimedTextVtt', 'vtt'),
        )

        subtitles = {}
        for subsel, subext in _SUB_FORMATS:
            for node in video_node.findall(subsel):
                sub_url = node.attrib.get('url')
                if not sub_url:
                    # Skip subtitle nodes that carry no URL attribute.
                    continue
                subtitles.setdefault('de', []).append({
                    'url': sub_url,
                    'ext': subext,
                })

        return {
            'id': xpath_text(video_node, './videoId', default=display_id),
            'formats': formats,
            'subtitles': subtitles,
            'display_id': display_id,
            # The title is mandatory; a missing duration just yields None.
            'title': xpath_text(video_node, './title', 'title', fatal=True),
            'duration': parse_duration(xpath_text(video_node, './duration')),
            'upload_date': upload_date,
            'thumbnail': thumbnail,
        }
c1a37eb2
PH
400
401
class ARDBetaMediathekIE(ARDMediathekBaseIE):
    """Extractor for current ardmediathek.de video and playlist URLs (GraphQL API)."""

    _VALID_URL = r'''(?x)https://
        (?:(?:beta|www)\.)?ardmediathek\.de/
        (?:(?P<client>[^/]+)/)?
        (?:player|live|video|(?P<playlist>sendung|serie|sammlung))/
        (?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
        (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
        (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''

    _TESTS = [{
        'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
        'md5': 'b6e8ab03f2bcc6e1f9e6cef25fcc03c4',
        'info_dict': {
            'display_id': 'filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen',
            'id': '12939099',
            'title': 'Liebe auf vier Pfoten',
            'description': r're:^Claudia Schmitt, Anwältin in Salzburg',
            'duration': 5222,
            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:aee7cbf8f06de976?w=960&ch=ae4d0f2ee47d8b9b',
            'timestamp': 1701343800,
            'upload_date': '20231130',
            'ext': 'mp4',
            'episode': 'Liebe auf vier Pfoten',
            'series': 'Filme im MDR'
        },
    }, {
        'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
        'md5': 'a1dc75a39c61601b980648f7c9f9f71d',
        'info_dict': {
            'display_id': 'die-robuste-roswita',
            'id': '78566716',
            'title': 'Die robuste Roswita',
            'description': r're:^Der Mord.*totgeglaubte Ehefrau Roswita',
            'duration': 5316,
            'thumbnail': 'https://img.ardmediathek.de/standard/00/78/56/67/84/575672121/16x9/960?mandant=ard',
            'timestamp': 1596658200,
            'upload_date': '20200805',
            'ext': 'mp4',
        },
        'skip': 'Error',
    }, {
        'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
        'md5': '1e73ded21cb79bac065117e80c81dc88',
        'info_dict': {
            'id': '10049223',
            'ext': 'mp4',
            'title': 'tagesschau, 20:00 Uhr',
            'timestamp': 1636398000,
            'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b',
            'upload_date': '20211108',
            'display_id': 'tagesschau-oder-tagesschau-20-00-uhr/das-erste',
            'duration': 915,
            'episode': 'tagesschau, 20:00 Uhr',
            'series': 'tagesschau',
            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49?w=960&ch=ee69108ae344f678',
        },
    }, {
        'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
        'only_matching': True,
    }, {
        'url': 'https://ardmediathek.de/ard/video/saartalk/saartalk-gesellschaftsgift-haltung-gegen-hass/sr-fernsehen/Y3JpZDovL3NyLW9ubGluZS5kZS9TVF84MTY4MA/',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/ard/video/trailer/private-eyes-s01-e01/one/Y3JpZDovL3dkci5kZS9CZWl0cmFnLTE1MTgwYzczLWNiMTEtNGNkMS1iMjUyLTg5MGYzOWQxZmQ1YQ/',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
        'only_matching': True,
    }, {
        # playlist of type 'sendung'
        'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
        'only_matching': True,
    }, {
        # playlist of type 'serie'
        'url': 'https://www.ardmediathek.de/serie/nachtstreife/staffel-1/Y3JpZDovL3N3ci5kZS9zZGIvc3RJZC8xMjQy/1',
        'only_matching': True,
    }, {
        # playlist of type 'sammlung'
        'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet',
        'only_matching': True,
    }]

    def _ARD_load_playlist_snippet(self, playlist_id, display_id, client, mode, page_number):
        """Query the ARD server for one page of playlist information.

        Returns the raw page data, normalised so that 'pagination' and
        'teasers' sit at the top level for every supported *mode*.
        """
        assert mode in ('sendung', 'serie', 'sammlung')
        if mode in ('sendung', 'serie'):
            graphQL = json.dumps({
                'query': '''{
                    showPage(
                        client: "%s"
                        showId: "%s"
                        pageNumber: %d
                    ) {
                        pagination {
                            pageSize
                            totalElements
                        }
                        teasers { # Array
                            mediumTitle
                            links { target { id href title } }
                            type
                        }
                    }}''' % (client, playlist_id, page_number),
            }).encode()
        else:  # mode == 'sammlung'
            graphQL = json.dumps({
                'query': '''{
                    morePage(
                        client: "%s"
                        compilationId: "%s"
                        pageNumber: %d
                    ) {
                        widget {
                            pagination {
                                pageSize
                                totalElements
                            }
                            teasers { # Array
                                mediumTitle
                                links { target { id href title } }
                                type
                            }
                        }
                    }}''' % (client, playlist_id, page_number),
            }).encode()
        # Resources for ARD graphQL debugging:
        # https://api-test.ardmediathek.de/public-gateway
        show_page = self._download_json(
            'https://api.ardmediathek.de/public-gateway',
            '[Playlist] %s' % display_id,
            data=graphQL,
            headers={'Content-Type': 'application/json'})['data']
        # align the structure of the returned data:
        if mode in ('sendung', 'serie'):
            show_page = show_page['showPage']
        else:  # mode == 'sammlung'
            show_page = show_page['morePage']['widget']
        return show_page

    def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
        """Collect all playlist entries and return them as a playlist result.

        Supports playlists of mode 'sendung', 'serie', and 'sammlung',
        as well as nested playlists.
        """
        entries = []
        page_number = 0
        while True:  # iterate by page number
            show_page = self._ARD_load_playlist_snippet(
                playlist_id, display_id, client, mode, page_number)
            teasers = show_page['teasers']
            for teaser in teasers:  # process playlist items
                if '/compilation/' in teaser['links']['target']['href']:
                    # alternative condition: teaser['type'] == "compilation"
                    # => This is a nested compilation, e.g. like:
                    # https://www.ardmediathek.de/ard/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2/
                    link_mode = 'sammlung'
                else:
                    link_mode = 'video'

                item_url = 'https://www.ardmediathek.de/%s/%s/%s/%s/%s' % (
                    client, link_mode, display_id,
                    # turn the episode title into a URL slug similar to ARD:
                    re.sub('^-|-$', '',  # remove '-' from begin/end
                           re.sub('[^a-zA-Z0-9]+', '-',  # replace special chars by -
                                  teaser['links']['target']['title'].lower()
                                  .replace('ä', 'ae').replace('ö', 'oe')
                                  .replace('ü', 'ue').replace('ß', 'ss'))),
                    teaser['links']['target']['id'])
                entries.append(self.url_result(
                    item_url,
                    ie=ARDBetaMediathekIE.ie_key()))

            if not teasers:
                # An empty page means there is nothing more to fetch; stop
                # here so inconsistent pagination data (e.g. pageSize 0)
                # cannot keep the loop running forever.
                break
            if (show_page['pagination']['pageSize'] * (page_number + 1)
                    >= show_page['pagination']['totalElements']):
                # we've processed enough pages to get all playlist entries
                break
            page_number += 1

        return self.playlist_result(entries, playlist_id, playlist_title=display_id)

    def _real_extract(self, url):
        """Extract a single clip, or delegate to playlist extraction.

        Playlist URLs ('sendung', 'serie', 'sammlung') are handed to
        `_ARD_extract_playlist`. For clips the GraphQL `playerPage` is
        queried; when it carries no media collection, the media JSON at
        `/play/media/<contentId>` is used instead.
        """
        video_id, display_id, playlist_type, client, season_number = self._match_valid_url(url).group(
            'id', 'display_id', 'playlist', 'client', 'season')
        display_id, client = display_id or video_id, client or 'ard'

        if playlist_type:
            # TODO: Extract only specified season
            return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type)

        player_page = self._download_json(
            'https://api.ardmediathek.de/public-gateway',
            display_id, data=json.dumps({
                'query': '''{
  playerPage(client:"%s", clipId: "%s") {
    blockedByFsk
    broadcastedOn
    maturityContentRating
    mediaCollection {
      _duration
      _geoblocked
      _isLive
      _mediaArray {
        _mediaStreamArray {
          _quality
          _server
          _stream
        }
      }
      _previewImage
      _subtitleUrl
      _type
    }
    show {
      title
    }
    image {
      src
    }
    synopsis
    title
    tracking {
      atiCustomVars {
        contentId
      }
    }
  }
}''' % (client, video_id),
            }).encode(), headers={
                'Content-Type': 'application/json'
            })['data']['playerPage']
        if not player_page:
            # A null/empty playerPage would otherwise surface as a TypeError
            # on the first subscript below.
            raise ExtractorError('Unable to retrieve player page data for %s' % video_id)
        title = player_page['title']
        content_id = str_or_none(try_get(
            player_page, lambda x: x['tracking']['atiCustomVars']['contentId']))
        media_collection = player_page.get('mediaCollection') or {}
        if not media_collection and content_id:
            media_collection = self._download_json(
                'https://www.ardmediathek.de/play/media/' + content_id,
                content_id, fatal=False) or {}
        info = self._parse_media_info(
            media_collection, content_id or video_id,
            player_page.get('blockedByFsk'))
        age_limit = None
        description = player_page.get('synopsis')
        maturity_content_rating = player_page.get('maturityContentRating')
        if maturity_content_rating:
            # Drop the leading 'F', 'S', 'K' characters (e.g. 'FSK16' -> '16').
            age_limit = int_or_none(maturity_content_rating.lstrip('FSK'))
        if not age_limit and description:
            # Fall back to a trailing "(FSK <n>)" marker in the synopsis.
            age_limit = int_or_none(self._search_regex(
                r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None))
        info.update({
            'age_limit': age_limit,
            'display_id': display_id,
            'title': title,
            'description': description,
            'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
            'series': try_get(player_page, lambda x: x['show']['title']),
            # Thumbnail sources in order of preference: media collection,
            # playerPage image (query and fragment stripped), page HTML.
            'thumbnail': (media_collection.get('_previewImage')
                          or try_get(player_page, lambda x: update_url(x['image']['src'], query=None, fragment=None))
                          or self.get_thumbnail_from_html(display_id, url)),
        })
        info.update(self._ARD_extract_episode_info(info['title']))
        return info

    def get_thumbnail_from_html(self, display_id, url):
        """Return a thumbnail URL from the page's OpenGraph or 'thumbnailUrl' meta data, if any.

        The page download is non-fatal; a failed download is treated as an
        empty page.
        """
        webpage = self._download_webpage(url, display_id, fatal=False) or ''
        return (
            self._og_search_thumbnail(webpage, default=None)
            or self._html_search_meta('thumbnailUrl', webpage, default=None))