yt_dlp/extractor/zdf.py

   1 # coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
    determine_ext,
    ExtractorError,
    float_or_none,
    int_or_none,
    join_nonempty,
    merge_dicts,
    NO_DEFAULT,
    orderedSet,
    parse_codecs,
    qualities,
    try_get,
    unified_timestamp,
    update_url_query,
    url_or_none,
    urljoin,
)
  24
  25
  26 class ZDFBaseIE(InfoExtractor):
  27     _GEO_COUNTRIES = ['DE']
  28     _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd')
  29
  30     def _call_api(self, url, video_id, item, api_token=None, referrer=None):
  31         headers = {}
  32         if api_token:
  33             headers['Api-Auth'] = 'Bearer %s' % api_token
  34         if referrer:
  35             headers['Referer'] = referrer
  36         return self._download_json(
  37             url, video_id, 'Downloading JSON %s' % item, headers=headers)
  38
  39     @staticmethod
  40     def _extract_subtitles(src):
  41         subtitles = {}
  42         for caption in try_get(src, lambda x: x['captions'], list) or []:
  43             subtitle_url = url_or_none(caption.get('uri'))
  44             if subtitle_url:
  45                 lang = caption.get('language', 'deu')
  46                 subtitles.setdefault(lang, []).append({
  47                     'url': subtitle_url,
  48                 })
  49         return subtitles
  50
  51     def _extract_format(self, video_id, formats, format_urls, meta):
  52         format_url = url_or_none(meta.get('url'))
  53         if not format_url or format_url in format_urls:
  54             return
  55         format_urls.add(format_url)
  56
  57         mime_type, ext = meta.get('mimeType'), determine_ext(format_url)
  58         if mime_type == 'application/x-mpegURL' or ext == 'm3u8':
  59             new_formats = self._extract_m3u8_formats(
  60                 format_url, video_id, 'mp4', m3u8_id='hls',
  61                 entry_protocol='m3u8_native', fatal=False)
  62         elif mime_type == 'application/f4m+xml' or ext == 'f4m':
  63             new_formats = self._extract_f4m_formats(
  64                 update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False)
  65         else:
  66             f = parse_codecs(meta.get('mimeCodec'))
  67             if not f and meta.get('type'):
  68                 data = meta['type'].split('_')
  69                 if try_get(data, lambda x: x[2]) == ext:
  70                     f = {'vcodec': data[0], 'acodec': data[1]}
  71             f.update({
  72                 'url': format_url,
  73                 'format_id': join_nonempty('http', meta.get('type'), meta.get('quality')),
  74             })
  75             new_formats = [f]
  76         formats.extend(merge_dicts(f, {
  77             'format_note': join_nonempty('quality', 'class', from_dict=meta, delim=', '),
  78             'language': meta.get('language'),
  79             'language_preference': 10 if meta.get('class') == 'main' else -10 if meta.get('class') == 'ad' else -1,
  80             'quality': qualities(self._QUALITIES)(meta.get('quality')),
  81         }) for f in new_formats)
  82
  83     def _extract_ptmd(self, ptmd_url, video_id, api_token, referrer):
  84         ptmd = self._call_api(
  85             ptmd_url, video_id, 'metadata', api_token, referrer)
  86
  87         content_id = ptmd.get('basename') or ptmd_url.split('/')[-1]
  88
  89         formats = []
  90         track_uris = set()
  91         for p in ptmd['priorityList']:
  92             formitaeten = p.get('formitaeten')
  93             if not isinstance(formitaeten, list):
  94                 continue
  95             for f in formitaeten:
  96                 f_qualities = f.get('qualities')
  97                 if not isinstance(f_qualities, list):
  98                     continue
  99                 for quality in f_qualities:
 100                     tracks = try_get(quality, lambda x: x['audio']['tracks'], list)
 101                     if not tracks:
 102                         continue
 103                     for track in tracks:
 104                         self._extract_format(
 105                             content_id, formats, track_uris, {
 106                                 'url': track.get('uri'),
 107                                 'type': f.get('type'),
 108                                 'mimeType': f.get('mimeType'),
 109                                 'quality': quality.get('quality'),
 110                                 'class': track.get('class'),
 111                                 'language': track.get('language'),
 112                             })
 113         self._sort_formats(formats, ('hasaud', 'res', 'quality', 'language_preference'))
 114
 115         duration = float_or_none(try_get(
 116             ptmd, lambda x: x['attributes']['duration']['value']), scale=1000)
 117
 118         return {
 119             'extractor_key': ZDFIE.ie_key(),
 120             'id': content_id,
 121             'duration': duration,
 122             'formats': formats,
 123             'subtitles': self._extract_subtitles(ptmd),
 124         }
 125
 126     def _extract_player(self, webpage, video_id, fatal=True):
 127         return self._parse_json(
 128             self._search_regex(
 129                 r'(?s)data-zdfplayer-jsb=(["\'])(?P<json>{.+?})\1', webpage,
 130                 'player JSON', default='{}' if not fatal else NO_DEFAULT,
 131                 group='json'),
 132             video_id)
 133
 134
 135 class ZDFIE(ZDFBaseIE):
 136     _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html'
 137     _TESTS = [{
 138         # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html
 139         'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html',
 140         'md5': '34ec321e7eb34231fd88616c65c92db0',
 141         'info_dict': {
 142             'id': '210222_phx_nachgehakt_corona_protest',
 143             'ext': 'mp4',
 144             'title': 'Wohin führt der Protest in der Pandemie?',
 145             'description': 'md5:7d643fe7f565e53a24aac036b2122fbd',
 146             'duration': 1691,
 147             'timestamp': 1613948400,
 148             'upload_date': '20210221',
 149         },
 150     }, {
 151         # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html
 152         'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html',
 153         'md5': '0aff3e7bc72c8813f5e0fae333316a1d',
 154         'info_dict': {
 155             'id': '141007_ab18_10wochensommer_film',
 156             'ext': 'mp4',
 157             'title': 'Ab 18! - 10 Wochen Sommer',
 158             'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26',
 159             'duration': 2660,
 160             'timestamp': 1608604200,
 161             'upload_date': '20201222',
 162         },
 163     }, {
 164         'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html',
 165         'info_dict': {
 166             'id': '151025_magie_farben2_tex',
 167             'ext': 'mp4',
 168             'title': 'Die Magie der Farben (2/2)',
 169             'description': 'md5:a89da10c928c6235401066b60a6d5c1a',
 170             'duration': 2615,
 171             'timestamp': 1465021200,
 172             'upload_date': '20160604',
 173         },
 174     }, {
 175         # Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche
 176         'url': 'https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html',
 177         'only_matching': True,
 178     }, {
 179         # Same as https://www.3sat.de/film/spielfilm/der-hauptmann-100.html
 180         'url': 'https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html',
 181         'only_matching': True,
 182     }, {
 183         # Same as https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html, equal media ids
 184         'url': 'https://www.zdf.de/wissen/nano/nano-21-mai-2019-102.html',
 185         'only_matching': True,
 186     }, {
 187         'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html',
 188         'only_matching': True,
 189     }, {
 190         'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html',
 191         'only_matching': True,
 192     }, {
 193         'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html',
 194         'only_matching': True,
 195     }]
 196
 197     def _extract_entry(self, url, player, content, video_id):
 198         title = content.get('title') or content['teaserHeadline']
 199
 200         t = content['mainVideoContent']['http://zdf.de/rels/target']
 201
 202         ptmd_path = t.get('http://zdf.de/rels/streams/ptmd')
 203
 204         if not ptmd_path:
 205             ptmd_path = t[
 206                 'http://zdf.de/rels/streams/ptmd-template'].replace(
 207                 '{playerId}', 'ngplayer_2_4')
 208
 209         info = self._extract_ptmd(
 210             urljoin(url, ptmd_path), video_id, player['apiToken'], url)
 211
 212         thumbnails = []
 213         layouts = try_get(
 214             content, lambda x: x['teaserImageRef']['layouts'], dict)
 215         if layouts:
 216             for layout_key, layout_url in layouts.items():
 217                 layout_url = url_or_none(layout_url)
 218                 if not layout_url:
 219                     continue
 220                 thumbnail = {
 221                     'url': layout_url,
 222                     'format_id': layout_key,
 223                 }
 224                 mobj = re.search(r'(?P<width>\d+)x(?P<height>\d+)', layout_key)
 225                 if mobj:
 226                     thumbnail.update({
 227                         'width': int(mobj.group('width')),
 228                         'height': int(mobj.group('height')),
 229                     })
 230                 thumbnails.append(thumbnail)
 231
 232         return merge_dicts(info, {
 233             'title': title,
 234             'description': content.get('leadParagraph') or content.get('teasertext'),
 235             'duration': int_or_none(t.get('duration')),
 236             'timestamp': unified_timestamp(content.get('editorialDate')),
 237             'thumbnails': thumbnails,
 238         })
 239
 240     def _extract_regular(self, url, player, video_id):
 241         content = self._call_api(
 242             player['content'], video_id, 'content', player['apiToken'], url)
 243         return self._extract_entry(player['content'], player, content, video_id)
 244
 245     def _extract_mobile(self, video_id):
 246         video = self._download_json(
 247             'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id,
 248             video_id)
 249
 250         document = video['document']
 251
 252         title = document['titel']
 253         content_id = document['basename']
 254
 255         formats = []
 256         format_urls = set()
 257         for f in document['formitaeten']:
 258             self._extract_format(content_id, formats, format_urls, f)
 259         self._sort_formats(formats)
 260
 261         thumbnails = []
 262         teaser_bild = document.get('teaserBild')
 263         if isinstance(teaser_bild, dict):
 264             for thumbnail_key, thumbnail in teaser_bild.items():
 265                 thumbnail_url = try_get(
 266                     thumbnail, lambda x: x['url'], compat_str)
 267                 if thumbnail_url:
 268                     thumbnails.append({
 269                         'url': thumbnail_url,
 270                         'id': thumbnail_key,
 271                         'width': int_or_none(thumbnail.get('width')),
 272                         'height': int_or_none(thumbnail.get('height')),
 273                     })
 274
 275         return {
 276             'id': content_id,
 277             'title': title,
 278             'description': document.get('beschreibung'),
 279             'duration': int_or_none(document.get('length')),
 280             'timestamp': unified_timestamp(document.get('date')) or unified_timestamp(
 281                 try_get(video, lambda x: x['meta']['editorialDate'], compat_str)),
 282             'thumbnails': thumbnails,
 283             'subtitles': self._extract_subtitles(document),
 284             'formats': formats,
 285         }
 286
 287     def _real_extract(self, url):
 288         video_id = self._match_id(url)
 289
 290         webpage = self._download_webpage(url, video_id, fatal=False)
 291         if webpage:
 292             player = self._extract_player(webpage, url, fatal=False)
 293             if player:
 294                 return self._extract_regular(url, player, video_id)
 295
 296         return self._extract_mobile(video_id)
 297
 298
 299 class ZDFChannelIE(ZDFBaseIE):
 300     _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)'
 301     _TESTS = [{
 302         'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio',
 303         'info_dict': {
 304             'id': 'das-aktuelle-sportstudio',
 305             'title': 'das aktuelle sportstudio | ZDF',
 306         },
 307         'playlist_mincount': 23,
 308     }, {
 309         'url': 'https://www.zdf.de/dokumentation/planet-e',
 310         'info_dict': {
 311             'id': 'planet-e',
 312             'title': 'planet e.',
 313         },
 314         'playlist_mincount': 50,
 315     }, {
 316         'url': 'https://www.zdf.de/filme/taunuskrimi/',
 317         'only_matching': True,
 318     }]
 319
 320     @classmethod
 321     def suitable(cls, url):
 322         return False if ZDFIE.suitable(url) else super(ZDFChannelIE, cls).suitable(url)
 323
 324     def _real_extract(self, url):
 325         channel_id = self._match_id(url)
 326
 327         webpage = self._download_webpage(url, channel_id)
 328
 329         entries = [
 330             self.url_result(item_url, ie=ZDFIE.ie_key())
 331             for item_url in orderedSet(re.findall(
 332                 r'data-plusbar-url=["\'](http.+?\.html)', webpage))]
 333
 334         return self.playlist_result(
 335             entries, channel_id, self._og_search_title(webpage, fatal=False))
 336
 337         r"""
 338         player = self._extract_player(webpage, channel_id)
 339
 340         channel_id = self._search_regex(
 341             r'docId\s*:\s*(["\'])(?P<id>(?!\1).+?)\1', webpage,
 342             'channel id', group='id')
 343
 344         channel = self._call_api(
 345             'https://api.zdf.de/content/documents/%s.json' % channel_id,
 346             player, url, channel_id)
 347
 348         items = []
 349         for module in channel['module']:
 350             for teaser in try_get(module, lambda x: x['teaser'], list) or []:
 351                 t = try_get(
 352                     teaser, lambda x: x['http://zdf.de/rels/target'], dict)
 353                 if not t:
 354                     continue
 355                 items.extend(try_get(
 356                     t,
 357                     lambda x: x['resultsWithVideo']['http://zdf.de/rels/search/results'],
 358                     list) or [])
 359             items.extend(try_get(
 360                 module,
 361                 lambda x: x['filterRef']['resultsWithVideo']['http://zdf.de/rels/search/results'],
 362                 list) or [])
 363
 364         entries = []
 365         entry_urls = set()
 366         for item in items:
 367             t = try_get(item, lambda x: x['http://zdf.de/rels/target'], dict)
 368             if not t:
 369                 continue
 370             sharing_url = t.get('http://zdf.de/rels/sharing-url')
 371             if not sharing_url or not isinstance(sharing_url, compat_str):
 372                 continue
 373             if sharing_url in entry_urls:
 374                 continue
 375             entry_urls.add(sharing_url)
 376             entries.append(self.url_result(
 377                 sharing_url, ie=ZDFIE.ie_key(), video_id=t.get('id')))
 378
 379         return self.playlist_result(entries, channel_id, channel.get('title'))
 380         """