yt_dlp/extractor/ard.py

   1 import json
   2 import re
   3
   4 from .common import InfoExtractor
   5 from .generic import GenericIE
   6 from ..utils import (
   7     determine_ext,
   8     ExtractorError,
   9     int_or_none,
  10     parse_duration,
  11     qualities,
  12     str_or_none,
  13     try_get,
  14     unified_strdate,
  15     unified_timestamp,
  16     update_url,
  17     update_url_query,
  18     url_or_none,
  19     xpath_text,
  20 )
  21 from ..compat import compat_etree_fromstring
  22
  23
  24 class ARDMediathekBaseIE(InfoExtractor):
  25     _GEO_COUNTRIES = ['DE']
  26
  27     def _extract_media_info(self, media_info_url, webpage, video_id):
  28         media_info = self._download_json(
  29             media_info_url, video_id, 'Downloading media JSON')
  30         return self._parse_media_info(media_info, video_id, '"fsk"' in webpage)
  31
  32     def _parse_media_info(self, media_info, video_id, fsk):
  33         formats = self._extract_formats(media_info, video_id)
  34
  35         if not formats:
  36             if fsk:
  37                 self.raise_no_formats(
  38                     'This video is only available after 20:00', expected=True)
  39             elif media_info.get('_geoblocked'):
  40                 self.raise_geo_restricted(
  41                     'This video is not available due to geoblocking',
  42                     countries=self._GEO_COUNTRIES, metadata_available=True)
  43
  44         subtitles = {}
  45         subtitle_url = media_info.get('_subtitleUrl')
  46         if subtitle_url:
  47             subtitles['de'] = [{
  48                 'ext': 'ttml',
  49                 'url': subtitle_url,
  50             }, {
  51                 'ext': 'vtt',
  52                 'url': subtitle_url.replace('/ebutt/', '/webvtt/') + '.vtt',
  53             }]
  54
  55         return {
  56             'id': video_id,
  57             'duration': int_or_none(media_info.get('_duration')),
  58             'thumbnail': media_info.get('_previewImage'),
  59             'is_live': media_info.get('_isLive') is True,
  60             'formats': formats,
  61             'subtitles': subtitles,
  62         }
  63
  64     def _ARD_extract_episode_info(self, title):
  65         """Try to extract season/episode data from the title."""
  66         res = {}
  67         if not title:
  68             return res
  69
  70         for pattern in [
  71             # Pattern for title like "Homo sapiens (S06/E07) - Originalversion"
  72             # from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw
  73             r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
  74             # E.g.: title="Fritjof aus Norwegen (2) (AD)"
  75             # from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/
  76             r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
  77             r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
  78             # E.g.: title="Folge 25/42: Symmetrie"
  79             # from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/
  80             # E.g.: title="Folge 1063 - Vertrauen"
  81             # from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/
  82             r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*',
  83         ]:
  84             m = re.match(pattern, title)
  85             if m:
  86                 groupdict = m.groupdict()
  87                 res['season_number'] = int_or_none(groupdict.get('season_number'))
  88                 res['episode_number'] = int_or_none(groupdict.get('episode_number'))
  89                 res['episode'] = str_or_none(groupdict.get('episode'))
  90                 # Build the episode title by removing numeric episode information:
  91                 if groupdict.get('ep_info') and not res['episode']:
  92                     res['episode'] = str_or_none(
  93                         title.replace(groupdict.get('ep_info'), ''))
  94                 if res['episode']:
  95                     res['episode'] = res['episode'].strip()
  96                 break
  97
  98         # As a fallback use the whole title as the episode name:
  99         if not res.get('episode'):
 100             res['episode'] = title.strip()
 101         return res
 102
 103     def _extract_formats(self, media_info, video_id):
 104         type_ = media_info.get('_type')
 105         media_array = media_info.get('_mediaArray', [])
 106         formats = []
 107         for num, media in enumerate(media_array):
 108             for stream in media.get('_mediaStreamArray', []):
 109                 stream_urls = stream.get('_stream')
 110                 if not stream_urls:
 111                     continue
 112                 if not isinstance(stream_urls, list):
 113                     stream_urls = [stream_urls]
 114                 quality = stream.get('_quality')
 115                 server = stream.get('_server')
 116                 for stream_url in stream_urls:
 117                     if not url_or_none(stream_url):
 118                         continue
 119                     ext = determine_ext(stream_url)
 120                     if quality != 'auto' and ext in ('f4m', 'm3u8'):
 121                         continue
 122                     if ext == 'f4m':
 123                         formats.extend(self._extract_f4m_formats(
 124                             update_url_query(stream_url, {
 125                                 'hdcore': '3.1.1',
 126                                 'plugin': 'aasp-3.1.1.69.124'
 127                             }), video_id, f4m_id='hds', fatal=False))
 128                     elif ext == 'm3u8':
 129                         formats.extend(self._extract_m3u8_formats(
 130                             stream_url, video_id, 'mp4', 'm3u8_native',
 131                             m3u8_id='hls', fatal=False))
 132                     else:
 133                         if server and server.startswith('rtmp'):
 134                             f = {
 135                                 'url': server,
 136                                 'play_path': stream_url,
 137                                 'format_id': 'a%s-rtmp-%s' % (num, quality),
 138                             }
 139                         else:
 140                             f = {
 141                                 'url': stream_url,
 142                                 'format_id': 'a%s-%s-%s' % (num, ext, quality)
 143                             }
 144                         m = re.search(
 145                             r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$',
 146                             stream_url)
 147                         if m:
 148                             f.update({
 149                                 'width': int(m.group('width')),
 150                                 'height': int(m.group('height')),
 151                             })
 152                         if type_ == 'audio':
 153                             f['vcodec'] = 'none'
 154                         formats.append(f)
 155         return formats
 156
 157
 158 class ARDMediathekIE(ARDMediathekBaseIE):
 159     IE_NAME = 'ARD:mediathek'
 160     _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
 161
 162     _TESTS = [{
 163         # available till 26.07.2022
 164         'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822',
 165         'info_dict': {
 166             'id': '44726822',
 167             'ext': 'mp4',
 168             'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?',
 169             'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5',
 170             'duration': 1740,
 171         },
 172         'params': {
 173             # m3u8 download
 174             'skip_download': True,
 175         }
 176     }, {
 177         'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872',
 178         'only_matching': True,
 179     }, {
 180         # audio
 181         'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
 182         'only_matching': True,
 183     }, {
 184         'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
 185         'only_matching': True,
 186     }, {
 187         # audio
 188         'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158',
 189         'only_matching': True,
 190     }, {
 191         'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698',
 192         'only_matching': True,
 193     }]
 194
 195     @classmethod
 196     def suitable(cls, url):
 197         return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url)
 198
 199     def _real_extract(self, url):
 200         # determine video id from url
 201         m = self._match_valid_url(url)
 202
 203         document_id = None
 204
 205         numid = re.search(r'documentId=([0-9]+)', url)
 206         if numid:
 207             document_id = video_id = numid.group(1)
 208         else:
 209             video_id = m.group('video_id')
 210
 211         webpage = self._download_webpage(url, video_id)
 212
 213         ERRORS = (
 214             ('>Leider liegt eine Störung vor.', 'Video %s is unavailable'),
 215             ('>Der gewünschte Beitrag ist nicht mehr verfügbar.<',
 216              'Video %s is no longer available'),
 217         )
 218
 219         for pattern, message in ERRORS:
 220             if pattern in webpage:
 221                 raise ExtractorError(message % video_id, expected=True)
 222
 223         if re.search(r'[\?&]rss($|[=&])', url):
 224             doc = compat_etree_fromstring(webpage.encode('utf-8'))
 225             if doc.tag == 'rss':
 226                 return GenericIE()._extract_rss(url, video_id, doc)
 227
 228         title = self._og_search_title(webpage, default=None) or self._html_search_regex(
 229             [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
 230              r'<meta name="dcterms\.title" content="(.*?)"/>',
 231              r'<h4 class="headline">(.*?)</h4>',
 232              r'<title[^>]*>(.*?)</title>'],
 233             webpage, 'title')
 234         description = self._og_search_description(webpage, default=None) or self._html_search_meta(
 235             'dcterms.abstract', webpage, 'description', default=None)
 236         if description is None:
 237             description = self._html_search_meta(
 238                 'description', webpage, 'meta description', default=None)
 239         if description is None:
 240             description = self._html_search_regex(
 241                 r'<p\s+class="teasertext">(.+?)</p>',
 242                 webpage, 'teaser text', default=None)
 243
 244         # Thumbnail is sometimes not present.
 245         # It is in the mobile version, but that seems to use a different URL
 246         # structure altogether.
 247         thumbnail = self._og_search_thumbnail(webpage, default=None)
 248
 249         media_streams = re.findall(r'''(?x)
 250             mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
 251             "([^"]+)"''', webpage)
 252
 253         if media_streams:
 254             QUALITIES = qualities(['lo', 'hi', 'hq'])
 255             formats = []
 256             for furl in set(media_streams):
 257                 if furl.endswith('.f4m'):
 258                     fid = 'f4m'
 259                 else:
 260                     fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
 261                     fid = fid_m.group(1) if fid_m else None
 262                 formats.append({
 263                     'quality': QUALITIES(fid),
 264                     'format_id': fid,
 265                     'url': furl,
 266                 })
 267             info = {
 268                 'formats': formats,
 269             }
 270         else:  # request JSON file
 271             if not document_id:
 272                 video_id = self._search_regex(
 273                     (r'/play/(?:config|media|sola)/(\d+)', r'contentId["\']\s*:\s*(\d+)'),
 274                     webpage, 'media id', default=None)
 275             info = self._extract_media_info(
 276                 'http://www.ardmediathek.de/play/media/%s' % video_id,
 277                 webpage, video_id)
 278
 279         info.update({
 280             'id': video_id,
 281             'title': title,
 282             'description': description,
 283             'thumbnail': thumbnail,
 284         })
 285         info.update(self._ARD_extract_episode_info(info['title']))
 286
 287         return info
 288
 289
 290 class ARDIE(InfoExtractor):
 291     _VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html'
 292     _TESTS = [{
 293         # available till 7.12.2023
 294         'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html',
 295         'md5': '94812e6438488fb923c361a44469614b',
 296         'info_dict': {
 297             'id': 'maischberger-video-424',
 298             'display_id': 'maischberger-video-424',
 299             'ext': 'mp4',
 300             'duration': 4452.0,
 301             'title': 'maischberger am 07.12.2022',
 302             'upload_date': '20221207',
 303             'thumbnail': r're:^https?://.*\.jpg$',
 304         },
 305     }, {
 306         'url': 'https://www.daserste.de/information/politik-weltgeschehen/morgenmagazin/videosextern/dominik-kahun-aus-der-nhl-direkt-zur-weltmeisterschaft-100.html',
 307         'only_matching': True,
 308     }, {
 309         'url': 'https://www.daserste.de/information/nachrichten-wetter/tagesthemen/videosextern/tagesthemen-17736.html',
 310         'only_matching': True,
 311     }, {
 312         'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/videos/diversity-tag-sanam-afrashteh100.html',
 313         'only_matching': True,
 314     }, {
 315         'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
 316         'only_matching': True,
 317     }, {
 318         'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/Drehpause-100.html',
 319         'only_matching': True,
 320     }, {
 321         'url': 'https://www.daserste.de/unterhaltung/film/filmmittwoch-im-ersten/videos/making-ofwendezeit-video-100.html',
 322         'only_matching': True,
 323     }]
 324
 325     def _real_extract(self, url):
 326         mobj = self._match_valid_url(url)
 327         display_id = mobj.group('id')
 328
 329         player_url = mobj.group('mainurl') + '~playerXml.xml'
 330         doc = self._download_xml(player_url, display_id)
 331         video_node = doc.find('./video')
 332         upload_date = unified_strdate(xpath_text(
 333             video_node, './broadcastDate'))
 334         thumbnail = xpath_text(video_node, './/teaserImage//variant/url')
 335
 336         formats = []
 337         for a in video_node.findall('.//asset'):
 338             file_name = xpath_text(a, './fileName', default=None)
 339             if not file_name:
 340                 continue
 341             format_type = a.attrib.get('type')
 342             format_url = url_or_none(file_name)
 343             if format_url:
 344                 ext = determine_ext(file_name)
 345                 if ext == 'm3u8':
 346                     formats.extend(self._extract_m3u8_formats(
 347                         format_url, display_id, 'mp4', entry_protocol='m3u8_native',
 348                         m3u8_id=format_type or 'hls', fatal=False))
 349                     continue
 350                 elif ext == 'f4m':
 351                     formats.extend(self._extract_f4m_formats(
 352                         update_url_query(format_url, {'hdcore': '3.7.0'}),
 353                         display_id, f4m_id=format_type or 'hds', fatal=False))
 354                     continue
 355             f = {
 356                 'format_id': format_type,
 357                 'width': int_or_none(xpath_text(a, './frameWidth')),
 358                 'height': int_or_none(xpath_text(a, './frameHeight')),
 359                 'vbr': int_or_none(xpath_text(a, './bitrateVideo')),
 360                 'abr': int_or_none(xpath_text(a, './bitrateAudio')),
 361                 'vcodec': xpath_text(a, './codecVideo'),
 362                 'tbr': int_or_none(xpath_text(a, './totalBitrate')),
 363             }
 364             server_prefix = xpath_text(a, './serverPrefix', default=None)
 365             if server_prefix:
 366                 f.update({
 367                     'url': server_prefix,
 368                     'playpath': file_name,
 369                 })
 370             else:
 371                 if not format_url:
 372                     continue
 373                 f['url'] = format_url
 374             formats.append(f)
 375
 376         _SUB_FORMATS = (
 377             ('./dataTimedText', 'ttml'),
 378             ('./dataTimedTextNoOffset', 'ttml'),
 379             ('./dataTimedTextVtt', 'vtt'),
 380         )
 381
 382         subtitles = {}
 383         for subsel, subext in _SUB_FORMATS:
 384             for node in video_node.findall(subsel):
 385                 subtitles.setdefault('de', []).append({
 386                     'url': node.attrib['url'],
 387                     'ext': subext,
 388                 })
 389
 390         return {
 391             'id': xpath_text(video_node, './videoId', default=display_id),
 392             'formats': formats,
 393             'subtitles': subtitles,
 394             'display_id': display_id,
 395             'title': video_node.find('./title').text,
 396             'duration': parse_duration(video_node.find('./duration').text),
 397             'upload_date': upload_date,
 398             'thumbnail': thumbnail,
 399         }
 400
 401
 402 class ARDBetaMediathekIE(ARDMediathekBaseIE):
 403     _VALID_URL = r'''(?x)https://
 404         (?:(?:beta|www)\.)?ardmediathek\.de/
 405         (?:(?P<client>[^/]+)/)?
 406         (?:player|live|video|(?P<playlist>sendung|serie|sammlung))/
 407         (?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
 408         (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
 409         (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''
 410
 411     _TESTS = [{
 412         'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
 413         'md5': 'b6e8ab03f2bcc6e1f9e6cef25fcc03c4',
 414         'info_dict': {
 415             'display_id': 'filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen',
 416             'id': '12939099',
 417             'title': 'Liebe auf vier Pfoten',
 418             'description': r're:^Claudia Schmitt, Anwältin in Salzburg',
 419             'duration': 5222,
 420             'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:aee7cbf8f06de976?w=960&ch=ae4d0f2ee47d8b9b',
 421             'timestamp': 1701343800,
 422             'upload_date': '20231130',
 423             'ext': 'mp4',
 424             'episode': 'Liebe auf vier Pfoten',
 425             'series': 'Filme im MDR'
 426         },
 427     }, {
 428         'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
 429         'md5': 'a1dc75a39c61601b980648f7c9f9f71d',
 430         'info_dict': {
 431             'display_id': 'die-robuste-roswita',
 432             'id': '78566716',
 433             'title': 'Die robuste Roswita',
 434             'description': r're:^Der Mord.*totgeglaubte Ehefrau Roswita',
 435             'duration': 5316,
 436             'thumbnail': 'https://img.ardmediathek.de/standard/00/78/56/67/84/575672121/16x9/960?mandant=ard',
 437             'timestamp': 1596658200,
 438             'upload_date': '20200805',
 439             'ext': 'mp4',
 440         },
 441         'skip': 'Error',
 442     }, {
 443         'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
 444         'md5': '1e73ded21cb79bac065117e80c81dc88',
 445         'info_dict': {
 446             'id': '10049223',
 447             'ext': 'mp4',
 448             'title': 'tagesschau, 20:00 Uhr',
 449             'timestamp': 1636398000,
 450             'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b',
 451             'upload_date': '20211108',
 452             'display_id': 'tagesschau-oder-tagesschau-20-00-uhr/das-erste',
 453             'duration': 915,
 454             'episode': 'tagesschau, 20:00 Uhr',
 455             'series': 'tagesschau',
 456             'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49?w=960&ch=ee69108ae344f678',
 457         },
 458     }, {
 459         'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
 460         'only_matching': True,
 461     }, {
 462         'url': 'https://ardmediathek.de/ard/video/saartalk/saartalk-gesellschaftsgift-haltung-gegen-hass/sr-fernsehen/Y3JpZDovL3NyLW9ubGluZS5kZS9TVF84MTY4MA/',
 463         'only_matching': True,
 464     }, {
 465         'url': 'https://www.ardmediathek.de/ard/video/trailer/private-eyes-s01-e01/one/Y3JpZDovL3dkci5kZS9CZWl0cmFnLTE1MTgwYzczLWNiMTEtNGNkMS1iMjUyLTg5MGYzOWQxZmQ1YQ/',
 466         'only_matching': True,
 467     }, {
 468         'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/',
 469         'only_matching': True,
 470     }, {
 471         'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
 472         'only_matching': True,
 473     }, {
 474         # playlist of type 'sendung'
 475         'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
 476         'only_matching': True,
 477     }, {
 478         # playlist of type 'serie'
 479         'url': 'https://www.ardmediathek.de/serie/nachtstreife/staffel-1/Y3JpZDovL3N3ci5kZS9zZGIvc3RJZC8xMjQy/1',
 480         'only_matching': True,
 481     }, {
 482         # playlist of type 'sammlung'
 483         'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
 484         'only_matching': True,
 485     }, {
 486         'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
 487         'only_matching': True,
 488     }, {
 489         'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet',
 490         'only_matching': True,
 491     }]
 492
 493     def _ARD_load_playlist_snippet(self, playlist_id, display_id, client, mode, page_number):
 494         """ Query the ARD server for playlist information
 495         and returns the data in "raw" format """
 496         assert mode in ('sendung', 'serie', 'sammlung')
 497         if mode in ('sendung', 'serie'):
 498             graphQL = json.dumps({
 499                 'query': '''{
 500                     showPage(
 501                         client: "%s"
 502                         showId: "%s"
 503                         pageNumber: %d
 504                     ) {
 505                         pagination {
 506                             pageSize
 507                             totalElements
 508                         }
 509                         teasers {        # Array
 510                             mediumTitle
 511                             links { target { id href title } }
 512                             type
 513                         }
 514                     }}''' % (client, playlist_id, page_number),
 515             }).encode()
 516         else:  # mode == 'sammlung'
 517             graphQL = json.dumps({
 518                 'query': '''{
 519                     morePage(
 520                         client: "%s"
 521                         compilationId: "%s"
 522                         pageNumber: %d
 523                     ) {
 524                         widget {
 525                             pagination {
 526                                 pageSize
 527                                 totalElements
 528                             }
 529                             teasers {        # Array
 530                                 mediumTitle
 531                                 links { target { id href title } }
 532                                 type
 533                             }
 534                         }
 535                     }}''' % (client, playlist_id, page_number),
 536             }).encode()
 537         # Ressources for ARD graphQL debugging:
 538         # https://api-test.ardmediathek.de/public-gateway
 539         show_page = self._download_json(
 540             'https://api.ardmediathek.de/public-gateway',
 541             '[Playlist] %s' % display_id,
 542             data=graphQL,
 543             headers={'Content-Type': 'application/json'})['data']
 544         # align the structure of the returned data:
 545         if mode in ('sendung', 'serie'):
 546             show_page = show_page['showPage']
 547         else:  # mode == 'sammlung'
 548             show_page = show_page['morePage']['widget']
 549         return show_page
 550
 551     def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
 552         """ Collects all playlist entries and returns them as info dict.
 553         Supports playlists of mode 'sendung', 'serie', and 'sammlung',
 554         as well as nested playlists. """
 555         entries = []
 556         pageNumber = 0
 557         while True:  # iterate by pageNumber
 558             show_page = self._ARD_load_playlist_snippet(
 559                 playlist_id, display_id, client, mode, pageNumber)
 560             for teaser in show_page['teasers']:  # process playlist items
 561                 if '/compilation/' in teaser['links']['target']['href']:
 562                     # alternativ cond.: teaser['type'] == "compilation"
 563                     # => This is an nested compilation, e.g. like:
 564                     # https://www.ardmediathek.de/ard/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2/
 565                     link_mode = 'sammlung'
 566                 else:
 567                     link_mode = 'video'
 568
 569                 item_url = 'https://www.ardmediathek.de/%s/%s/%s/%s/%s' % (
 570                     client, link_mode, display_id,
 571                     # perform HTLM quoting of episode title similar to ARD:
 572                     re.sub('^-|-$', '',  # remove '-' from begin/end
 573                            re.sub('[^a-zA-Z0-9]+', '-',  # replace special chars by -
 574                                   teaser['links']['target']['title'].lower()
 575                                   .replace('ä', 'ae').replace('ö', 'oe')
 576                                   .replace('ü', 'ue').replace('ß', 'ss'))),
 577                     teaser['links']['target']['id'])
 578                 entries.append(self.url_result(
 579                     item_url,
 580                     ie=ARDBetaMediathekIE.ie_key()))
 581
 582             if (show_page['pagination']['pageSize'] * (pageNumber + 1)
 583                >= show_page['pagination']['totalElements']):
 584                 # we've processed enough pages to get all playlist entries
 585                 break
 586             pageNumber = pageNumber + 1
 587
 588         return self.playlist_result(entries, playlist_id, playlist_title=display_id)
 589
 590     def _real_extract(self, url):
 591         video_id, display_id, playlist_type, client, season_number = self._match_valid_url(url).group(
 592             'id', 'display_id', 'playlist', 'client', 'season')
 593         display_id, client = display_id or video_id, client or 'ard'
 594
 595         if playlist_type:
 596             # TODO: Extract only specified season
 597             return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type)
 598
 599         player_page = self._download_json(
 600             'https://api.ardmediathek.de/public-gateway',
 601             display_id, data=json.dumps({
 602                 'query': '''{
 603   playerPage(client:"%s", clipId: "%s") {
 604     blockedByFsk
 605     broadcastedOn
 606     maturityContentRating
 607     mediaCollection {
 608       _duration
 609       _geoblocked
 610       _isLive
 611       _mediaArray {
 612         _mediaStreamArray {
 613           _quality
 614           _server
 615           _stream
 616         }
 617       }
 618       _previewImage
 619       _subtitleUrl
 620       _type
 621     }
 622     show {
 623       title
 624     }
 625     image {
 626       src
 627     }
 628     synopsis
 629     title
 630     tracking {
 631       atiCustomVars {
 632         contentId
 633       }
 634     }
 635   }
 636 }''' % (client, video_id),
 637             }).encode(), headers={
 638                 'Content-Type': 'application/json'
 639             })['data']['playerPage']
 640         title = player_page['title']
 641         content_id = str_or_none(try_get(
 642             player_page, lambda x: x['tracking']['atiCustomVars']['contentId']))
 643         media_collection = player_page.get('mediaCollection') or {}
 644         if not media_collection and content_id:
 645             media_collection = self._download_json(
 646                 'https://www.ardmediathek.de/play/media/' + content_id,
 647                 content_id, fatal=False) or {}
 648         info = self._parse_media_info(
 649             media_collection, content_id or video_id,
 650             player_page.get('blockedByFsk'))
 651         age_limit = None
 652         description = player_page.get('synopsis')
 653         maturity_content_rating = player_page.get('maturityContentRating')
 654         if maturity_content_rating:
 655             age_limit = int_or_none(maturity_content_rating.lstrip('FSK'))
 656         if not age_limit and description:
 657             age_limit = int_or_none(self._search_regex(
 658                 r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None))
 659         info.update({
 660             'age_limit': age_limit,
 661             'display_id': display_id,
 662             'title': title,
 663             'description': description,
 664             'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
 665             'series': try_get(player_page, lambda x: x['show']['title']),
 666             'thumbnail': (media_collection.get('_previewImage')
 667                           or try_get(player_page, lambda x: update_url(x['image']['src'], query=None, fragment=None))
 668                           or self.get_thumbnail_from_html(display_id, url)),
 669         })
 670         info.update(self._ARD_extract_episode_info(info['title']))
 671         return info
 672
 673     def get_thumbnail_from_html(self, display_id, url):
 674         webpage = self._download_webpage(url, display_id, fatal=False) or ''
 675         return (
 676             self._og_search_thumbnail(webpage, default=None)
 677             or self._html_search_meta('thumbnailUrl', webpage, default=None))