yt_dlp/extractor/zdf.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     NO_DEFAULT,
   6     ExtractorError,
   7     determine_ext,
   8     extract_attributes,
   9     float_or_none,
  10     int_or_none,
  11     join_nonempty,
  12     merge_dicts,
  13     parse_codecs,
  14     qualities,
  15     traverse_obj,
  16     try_get,
  17     unified_timestamp,
  18     update_url_query,
  19     url_or_none,
  20     urljoin,
  21 )
  22
  23
  24 class ZDFBaseIE(InfoExtractor):
  25     _GEO_COUNTRIES = ['DE']
  26     _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd', 'fhd', 'uhd')
  27
  28     def _call_api(self, url, video_id, item, api_token=None, referrer=None):
  29         headers = {}
  30         if api_token:
  31             headers['Api-Auth'] = f'Bearer {api_token}'
  32         if referrer:
  33             headers['Referer'] = referrer
  34         return self._download_json(
  35             url, video_id, f'Downloading JSON {item}', headers=headers)
  36
  37     @staticmethod
  38     def _extract_subtitles(src):
  39         subtitles = {}
  40         for caption in try_get(src, lambda x: x['captions'], list) or []:
  41             subtitle_url = url_or_none(caption.get('uri'))
  42             if subtitle_url:
  43                 lang = caption.get('language', 'deu')
  44                 subtitles.setdefault(lang, []).append({
  45                     'url': subtitle_url,
  46                 })
  47         return subtitles
  48
  49     def _extract_format(self, video_id, formats, format_urls, meta):
  50         format_url = url_or_none(meta.get('url'))
  51         if not format_url or format_url in format_urls:
  52             return
  53         format_urls.add(format_url)
  54
  55         mime_type, ext = meta.get('mimeType'), determine_ext(format_url)
  56         if mime_type == 'application/x-mpegURL' or ext == 'm3u8':
  57             new_formats = self._extract_m3u8_formats(
  58                 format_url, video_id, 'mp4', m3u8_id='hls',
  59                 entry_protocol='m3u8_native', fatal=False)
  60         elif mime_type == 'application/f4m+xml' or ext == 'f4m':
  61             new_formats = self._extract_f4m_formats(
  62                 update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False)
  63         elif ext == 'mpd':
  64             new_formats = self._extract_mpd_formats(
  65                 format_url, video_id, mpd_id='dash', fatal=False)
  66         else:
  67             f = parse_codecs(meta.get('mimeCodec'))
  68             if not f and meta.get('type'):
  69                 data = meta['type'].split('_')
  70                 if try_get(data, lambda x: x[2]) == ext:
  71                     f = {'vcodec': data[0], 'acodec': data[1]}
  72             f.update({
  73                 'url': format_url,
  74                 'format_id': join_nonempty('http', meta.get('type'), meta.get('quality')),
  75                 'tbr': int_or_none(self._search_regex(r'_(\d+)k_', format_url, 'tbr', default=None)),
  76             })
  77             new_formats = [f]
  78         formats.extend(merge_dicts(f, {
  79             'format_note': join_nonempty('quality', 'class', from_dict=meta, delim=', '),
  80             'language': meta.get('language'),
  81             'language_preference': 10 if meta.get('class') == 'main' else -10 if meta.get('class') == 'ad' else -1,
  82             'quality': qualities(self._QUALITIES)(meta.get('quality')),
  83         }) for f in new_formats)
  84
  85     def _extract_ptmd(self, ptmd_url, video_id, api_token, referrer):
  86         ptmd = self._call_api(
  87             ptmd_url, video_id, 'metadata', api_token, referrer)
  88
  89         content_id = ptmd.get('basename') or ptmd_url.split('/')[-1]
  90
  91         formats = []
  92         track_uris = set()
  93         for p in ptmd['priorityList']:
  94             formitaeten = p.get('formitaeten')
  95             if not isinstance(formitaeten, list):
  96                 continue
  97             for f in formitaeten:
  98                 f_qualities = f.get('qualities')
  99                 if not isinstance(f_qualities, list):
 100                     continue
 101                 for quality in f_qualities:
 102                     tracks = try_get(quality, lambda x: x['audio']['tracks'], list)
 103                     if not tracks:
 104                         continue
 105                     for track in tracks:
 106                         self._extract_format(
 107                             content_id, formats, track_uris, {
 108                                 'url': track.get('uri'),
 109                                 'type': f.get('type'),
 110                                 'mimeType': f.get('mimeType'),
 111                                 'quality': quality.get('quality'),
 112                                 'class': track.get('class'),
 113                                 'language': track.get('language'),
 114                             })
 115
 116         duration = float_or_none(try_get(
 117             ptmd, lambda x: x['attributes']['duration']['value']), scale=1000)
 118
 119         return {
 120             'extractor_key': ZDFIE.ie_key(),
 121             'id': content_id,
 122             'duration': duration,
 123             'formats': formats,
 124             'subtitles': self._extract_subtitles(ptmd),
 125             '_format_sort_fields': ('tbr', 'res', 'quality', 'language_preference'),
 126         }
 127
 128     def _extract_player(self, webpage, video_id, fatal=True):
 129         return self._parse_json(
 130             self._search_regex(
 131                 r'(?s)data-zdfplayer-jsb=(["\'])(?P<json>{.+?})\1', webpage,
 132                 'player JSON', default='{}' if not fatal else NO_DEFAULT,
 133                 group='json'),
 134             video_id)
 135
 136
 137 class ZDFIE(ZDFBaseIE):
 138     _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html'
 139     _TESTS = [{
 140         # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html
 141         'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html',
 142         'md5': '34ec321e7eb34231fd88616c65c92db0',
 143         'info_dict': {
 144             'id': '210222_phx_nachgehakt_corona_protest',
 145             'ext': 'mp4',
 146             'title': 'Wohin führt der Protest in der Pandemie?',
 147             'description': 'md5:7d643fe7f565e53a24aac036b2122fbd',
 148             'duration': 1691,
 149             'timestamp': 1613948400,
 150             'upload_date': '20210221',
 151         },
 152         'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"',
 153     }, {
 154         # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html
 155         'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html',
 156         'md5': '0aff3e7bc72c8813f5e0fae333316a1d',
 157         'info_dict': {
 158             'id': '141007_ab18_10wochensommer_film',
 159             'ext': 'mp4',
 160             'title': 'Ab 18! - 10 Wochen Sommer',
 161             'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26',
 162             'duration': 2660,
 163             'timestamp': 1608604200,
 164             'upload_date': '20201222',
 165         },
 166         'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"',
 167     }, {
 168         'url': 'https://www.zdf.de/nachrichten/heute-journal/heute-journal-vom-30-12-2021-100.html',
 169         'info_dict': {
 170             'id': '211230_sendung_hjo',
 171             'ext': 'mp4',
 172             'description': 'md5:47dff85977bde9fb8cba9e9c9b929839',
 173             'duration': 1890.0,
 174             'upload_date': '20211230',
 175             'chapters': list,
 176             'thumbnail': 'md5:e65f459f741be5455c952cd820eb188e',
 177             'title': 'heute journal vom 30.12.2021',
 178             'timestamp': 1640897100,
 179         },
 180         'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"',
 181     }, {
 182         'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html',
 183         'info_dict': {
 184             'id': '151025_magie_farben2_tex',
 185             'ext': 'mp4',
 186             'title': 'Die Magie der Farben (2/2)',
 187             'description': 'md5:a89da10c928c6235401066b60a6d5c1a',
 188             'duration': 2615,
 189             'timestamp': 1465021200,
 190             'upload_date': '20160604',
 191             'thumbnail': 'https://www.zdf.de/assets/mauve-im-labor-100~768x432?cb=1464909117806',
 192         },
 193     }, {
 194         'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html',
 195         'md5': '57af4423db0455a3975d2dc4578536bc',
 196         'info_dict': {
 197             'ext': 'mp4',
 198             'id': 'video_funk_1770473',
 199             'duration': 1278,
 200             'description': 'Die Neue an der Schule verdreht Ismail den Kopf.',
 201             'title': 'Alles ist verzaubert',
 202             'timestamp': 1635520560,
 203             'upload_date': '20211029',
 204             'thumbnail': 'https://www.zdf.de/assets/teaser-funk-alles-ist-verzaubert-102~1920x1080?cb=1663848412907',
 205         },
 206     }, {
 207         # Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche
 208         'url': 'https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html',
 209         'only_matching': True,
 210     }, {
 211         # Same as https://www.3sat.de/film/spielfilm/der-hauptmann-100.html
 212         'url': 'https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html',
 213         'only_matching': True,
 214     }, {
 215         # Same as https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html, equal media ids
 216         'url': 'https://www.zdf.de/wissen/nano/nano-21-mai-2019-102.html',
 217         'only_matching': True,
 218     }, {
 219         'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html',
 220         'only_matching': True,
 221     }, {
 222         'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html',
 223         'only_matching': True,
 224     }, {
 225         'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html',
 226         'only_matching': True,
 227     }, {
 228         'url': 'https://www.zdf.de/arte/todliche-flucht/page-video-artede-toedliche-flucht-16-100.html',
 229         'info_dict': {
 230             'id': 'video_artede_083871-001-A',
 231             'ext': 'mp4',
 232             'title': 'Tödliche Flucht (1/6)',
 233             'description': 'md5:e34f96a9a5f8abd839ccfcebad3d5315',
 234             'duration': 3193.0,
 235             'timestamp': 1641355200,
 236             'upload_date': '20220105',
 237         },
 238         'skip': 'No longer available "Diese Seite wurde leider nicht gefunden"',
 239     }, {
 240         'url': 'https://www.zdf.de/serien/soko-stuttgart/das-geld-anderer-leute-100.html',
 241         'info_dict': {
 242             'id': '191205_1800_sendung_sok8',
 243             'ext': 'mp4',
 244             'title': 'Das Geld anderer Leute',
 245             'description': 'md5:cb6f660850dc5eb7d1ab776ea094959d',
 246             'duration': 2581.0,
 247             'timestamp': 1675160100,
 248             'upload_date': '20230131',
 249             'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/e2d7e55a-09f0-424e-ac73-6cac4dd65f35?layout=2400x1350',
 250         },
 251     }, {
 252         'url': 'https://www.zdf.de/dokumentation/terra-x/unser-gruener-planet-wuesten-doku-100.html',
 253         'info_dict': {
 254             'id': '220605_dk_gruener_planet_wuesten_tex',
 255             'ext': 'mp4',
 256             'title': 'Unser grüner Planet - Wüsten',
 257             'description': 'md5:4fc647b6f9c3796eea66f4a0baea2862',
 258             'duration': 2613.0,
 259             'timestamp': 1654450200,
 260             'upload_date': '20220605',
 261             'format_note': 'uhd, main',
 262             'thumbnail': 'https://www.zdf.de/assets/saguaro-kakteen-102~3840x2160?cb=1655910690796',
 263         },
 264     }]
 265
 266     def _extract_entry(self, url, player, content, video_id):
 267         title = content.get('title') or content['teaserHeadline']
 268
 269         t = content['mainVideoContent']['http://zdf.de/rels/target']
 270         ptmd_path = traverse_obj(t, (
 271             (('streams', 'default'), None),
 272             ('http://zdf.de/rels/streams/ptmd', 'http://zdf.de/rels/streams/ptmd-template'),
 273         ), get_all=False)
 274         if not ptmd_path:
 275             raise ExtractorError('Could not extract ptmd_path')
 276
 277         info = self._extract_ptmd(
 278             urljoin(url, ptmd_path.replace('{playerId}', 'android_native_5')), video_id, player['apiToken'], url)
 279
 280         thumbnails = []
 281         layouts = try_get(
 282             content, lambda x: x['teaserImageRef']['layouts'], dict)
 283         if layouts:
 284             for layout_key, layout_url in layouts.items():
 285                 layout_url = url_or_none(layout_url)
 286                 if not layout_url:
 287                     continue
 288                 thumbnail = {
 289                     'url': layout_url,
 290                     'format_id': layout_key,
 291                 }
 292                 mobj = re.search(r'(?P<width>\d+)x(?P<height>\d+)', layout_key)
 293                 if mobj:
 294                     thumbnail.update({
 295                         'width': int(mobj.group('width')),
 296                         'height': int(mobj.group('height')),
 297                     })
 298                 thumbnails.append(thumbnail)
 299
 300         chapter_marks = t.get('streamAnchorTag') or []
 301         chapter_marks.append({'anchorOffset': int_or_none(t.get('duration'))})
 302         chapters = [{
 303             'start_time': chap.get('anchorOffset'),
 304             'end_time': next_chap.get('anchorOffset'),
 305             'title': chap.get('anchorLabel'),
 306         } for chap, next_chap in zip(chapter_marks, chapter_marks[1:])]
 307
 308         return merge_dicts(info, {
 309             'title': title,
 310             'description': content.get('leadParagraph') or content.get('teasertext'),
 311             'duration': int_or_none(t.get('duration')),
 312             'timestamp': unified_timestamp(content.get('editorialDate')),
 313             'thumbnails': thumbnails,
 314             'chapters': chapters or None,
 315         })
 316
 317     def _extract_regular(self, url, player, video_id):
 318         content = self._call_api(
 319             player['content'], video_id, 'content', player['apiToken'], url)
 320         return self._extract_entry(player['content'], player, content, video_id)
 321
 322     def _extract_mobile(self, video_id):
 323         video = self._download_json(
 324             f'https://zdf-cdn.live.cellular.de/mediathekV2/document/{video_id}',
 325             video_id)
 326
 327         formats = []
 328         formitaeten = try_get(video, lambda x: x['document']['formitaeten'], list)
 329         document = formitaeten and video['document']
 330         if formitaeten:
 331             title = document['titel']
 332             content_id = document['basename']
 333
 334             format_urls = set()
 335             for f in formitaeten or []:
 336                 self._extract_format(content_id, formats, format_urls, f)
 337
 338         thumbnails = []
 339         teaser_bild = document.get('teaserBild')
 340         if isinstance(teaser_bild, dict):
 341             for thumbnail_key, thumbnail in teaser_bild.items():
 342                 thumbnail_url = try_get(
 343                     thumbnail, lambda x: x['url'], str)
 344                 if thumbnail_url:
 345                     thumbnails.append({
 346                         'url': thumbnail_url,
 347                         'id': thumbnail_key,
 348                         'width': int_or_none(thumbnail.get('width')),
 349                         'height': int_or_none(thumbnail.get('height')),
 350                     })
 351
 352         return {
 353             'id': content_id,
 354             'title': title,
 355             'description': document.get('beschreibung'),
 356             'duration': int_or_none(document.get('length')),
 357             'timestamp': unified_timestamp(document.get('date')) or unified_timestamp(
 358                 try_get(video, lambda x: x['meta']['editorialDate'], str)),
 359             'thumbnails': thumbnails,
 360             'subtitles': self._extract_subtitles(document),
 361             'formats': formats,
 362         }
 363
 364     def _real_extract(self, url):
 365         video_id = self._match_id(url)
 366
 367         webpage = self._download_webpage(url, video_id, fatal=False)
 368         if webpage:
 369             player = self._extract_player(webpage, url, fatal=False)
 370             if player:
 371                 return self._extract_regular(url, player, video_id)
 372
 373         return self._extract_mobile(video_id)
 374
 375
 376 class ZDFChannelIE(ZDFBaseIE):
 377     _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)'
 378     _TESTS = [{
 379         'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio',
 380         'info_dict': {
 381             'id': 'das-aktuelle-sportstudio',
 382             'title': 'das aktuelle sportstudio',
 383         },
 384         'playlist_mincount': 18,
 385     }, {
 386         'url': 'https://www.zdf.de/dokumentation/planet-e',
 387         'info_dict': {
 388             'id': 'planet-e',
 389             'title': 'planet e.',
 390         },
 391         'playlist_mincount': 50,
 392     }, {
 393         'url': 'https://www.zdf.de/gesellschaft/aktenzeichen-xy-ungeloest',
 394         'info_dict': {
 395             'id': 'aktenzeichen-xy-ungeloest',
 396             'title': 'Aktenzeichen XY... ungelöst',
 397             'entries': "lambda x: not any('xy580-fall1-kindermoerder-gesucht-100' in e['url'] for e in x)",
 398         },
 399         'playlist_mincount': 2,
 400     }, {
 401         'url': 'https://www.zdf.de/filme/taunuskrimi/',
 402         'only_matching': True,
 403     }]
 404
 405     @classmethod
 406     def suitable(cls, url):
 407         return False if ZDFIE.suitable(url) else super().suitable(url)
 408
 409     def _og_search_title(self, webpage, fatal=False):
 410         title = super()._og_search_title(webpage, fatal=fatal)
 411         return re.split(r'\s+[-|]\s+ZDF(?:mediathek)?$', title or '')[0] or None
 412
 413     def _real_extract(self, url):
 414         channel_id = self._match_id(url)
 415
 416         webpage = self._download_webpage(url, channel_id)
 417
 418         matches = re.finditer(
 419             rf'''<div\b[^>]*?\sdata-plusbar-id\s*=\s*(["'])(?P<p_id>[\w-]+)\1[^>]*?\sdata-plusbar-url=\1(?P<url>{ZDFIE._VALID_URL})\1''',
 420             webpage)
 421
 422         if self._downloader.params.get('noplaylist', False):
 423             entry = next(
 424                 (self.url_result(m.group('url'), ie=ZDFIE.ie_key()) for m in matches),
 425                 None)
 426             self.to_screen('Downloading just the main video because of --no-playlist')
 427             if entry:
 428                 return entry
 429         else:
 430             self.to_screen(f'Downloading playlist {channel_id} - add --no-playlist to download just the main video')
 431
 432         def check_video(m):
 433             v_ref = self._search_regex(
 434                 r'''(<a\b[^>]*?\shref\s*=[^>]+?\sdata-target-id\s*=\s*(["']){}\2[^>]*>)'''.format(m.group('p_id')),
 435                 webpage, 'check id', default='')
 436             v_ref = extract_attributes(v_ref)
 437             return v_ref.get('data-target-video-type') != 'novideo'
 438
 439         return self.playlist_from_matches(
 440             (m.group('url') for m in matches if check_video(m)),
 441             channel_id, self._og_search_title(webpage, fatal=False))