jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/zdf.py
[extractor/huya] Fix stream extraction (#4798)
[yt-dlp.git] / yt_dlp / extractor / zdf.py
import re

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
    determine_ext,
    ExtractorError,
    float_or_none,
    int_or_none,
    join_nonempty,
    merge_dicts,
    NO_DEFAULT,
    orderedSet,
    parse_codecs,
    qualities,
    traverse_obj,
    try_get,
    unified_timestamp,
    update_url_query,
    url_or_none,
    urljoin,
)
22
23
class ZDFBaseIE(InfoExtractor):
    """Helpers shared by the ZDF extractors: API calls, format and PTMD parsing."""

    _GEO_COUNTRIES = ['DE']
    _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd')

    def _call_api(self, url, video_id, item, api_token=None, referrer=None):
        """Download the JSON document at *url* and return it.

        *item* only appears in the progress note. *api_token* and *referrer*,
        when truthy, are sent as the ``Api-Auth`` and ``Referer`` headers.
        """
        request_headers = {}
        if api_token:
            request_headers['Api-Auth'] = 'Bearer %s' % api_token
        if referrer:
            request_headers['Referer'] = referrer
        note = 'Downloading JSON %s' % item
        return self._download_json(url, video_id, note, headers=request_headers)

    @staticmethod
    def _extract_subtitles(src):
        """Build a ``{language: [{'url': ...}]}`` mapping from ``src['captions']``.

        Captions without a valid ``uri`` are skipped; a caption without a
        ``language`` key is filed under ``'deu'``.
        """
        subtitles = {}
        captions = try_get(src, lambda x: x['captions'], list) or []
        for caption in captions:
            subtitle_url = url_or_none(caption.get('uri'))
            if not subtitle_url:
                continue
            language = caption.get('language', 'deu')
            subtitles.setdefault(language, []).append({'url': subtitle_url})
        return subtitles

    def _extract_format(self, video_id, formats, format_urls, meta):
        """Append the format(s) described by *meta* to *formats*.

        *format_urls* is the set of URLs already processed; it is updated in
        place so that the same URL is never expanded twice. HLS and HDS
        manifests are expanded non-fatally, anything else is treated as a
        single direct HTTP format.
        """
        media_url = url_or_none(meta.get('url'))
        if not media_url:
            return
        if media_url in format_urls:
            return
        format_urls.add(media_url)

        mime_type = meta.get('mimeType')
        ext = determine_ext(media_url)
        if mime_type == 'application/x-mpegURL' or ext == 'm3u8':
            new_formats = self._extract_m3u8_formats(
                media_url, video_id, 'mp4', m3u8_id='hls',
                entry_protocol='m3u8_native', fatal=False)
        elif mime_type == 'application/f4m+xml' or ext == 'f4m':
            manifest_url = update_url_query(media_url, {'hdcore': '3.7.0'})
            new_formats = self._extract_f4m_formats(
                manifest_url, video_id, f4m_id='hds', fatal=False)
        else:
            fmt = parse_codecs(meta.get('mimeCodec'))
            if not fmt and meta.get('type'):
                # Fall back to the "<vcodec>_<acodec>_<ext>..." pattern of the
                # type string, but only when its third part matches the URL's
                # extension.
                type_parts = meta['type'].split('_')
                if try_get(type_parts, lambda x: x[2]) == ext:
                    fmt = {'vcodec': type_parts[0], 'acodec': type_parts[1]}
            bitrate = self._search_regex(r'_(\d+)k_', media_url, 'tbr', default=None)
            fmt.update({
                'url': media_url,
                'format_id': join_nonempty('http', meta.get('type'), meta.get('quality')),
                'tbr': int_or_none(bitrate),
            })
            new_formats = [fmt]

        track_class = meta.get('class')
        if track_class == 'main':
            language_preference = 10
        elif track_class == 'ad':
            language_preference = -10
        else:
            language_preference = -1

        # Fields derived from *meta* are identical for every expanded format;
        # merge_dicts() lets values already present on a format take priority.
        shared_fields = {
            'format_note': join_nonempty('quality', 'class', from_dict=meta, delim=', '),
            'language': meta.get('language'),
            'language_preference': language_preference,
            'quality': qualities(self._QUALITIES)(meta.get('quality')),
        }
        for new_format in new_formats:
            formats.append(merge_dicts(new_format, shared_fields))

    def _extract_ptmd(self, ptmd_url, video_id, api_token, referrer):
        """Download the PTMD document at *ptmd_url* and return a partial info dict.

        The result carries ``id``, ``duration``, ``formats``, ``subtitles`` and
        an ``extractor_key`` fixed to that of ``ZDFIE``.
        """
        ptmd = self._call_api(ptmd_url, video_id, 'metadata', api_token, referrer)

        content_id = ptmd.get('basename') or ptmd_url.rsplit('/', 1)[-1]

        formats = []
        seen_track_uris = set()
        for priority in ptmd['priorityList']:
            formitaeten = priority.get('formitaeten')
            if not isinstance(formitaeten, list):
                continue
            for formitaet in formitaeten:
                quality_entries = formitaet.get('qualities')
                if not isinstance(quality_entries, list):
                    continue
                for quality_entry in quality_entries:
                    tracks = try_get(
                        quality_entry, lambda x: x['audio']['tracks'], list) or []
                    for track in tracks:
                        self._extract_format(content_id, formats, seen_track_uris, {
                            'url': track.get('uri'),
                            'type': formitaet.get('type'),
                            'mimeType': formitaet.get('mimeType'),
                            'quality': quality_entry.get('quality'),
                            'class': track.get('class'),
                            'language': track.get('language'),
                        })
        self._sort_formats(formats, ('tbr', 'res', 'quality', 'language_preference'))

        # scale=1000 divides the raw value; presumably it is in milliseconds.
        raw_duration = try_get(ptmd, lambda x: x['attributes']['duration']['value'])
        duration = float_or_none(raw_duration, scale=1000)

        return {
            'extractor_key': ZDFIE.ie_key(),
            'id': content_id,
            'duration': duration,
            'formats': formats,
            'subtitles': self._extract_subtitles(ptmd),
        }

    def _extract_player(self, webpage, video_id, fatal=True):
        """Parse the ``data-zdfplayer-jsb`` JSON attribute found in *webpage*.

        With ``fatal=False`` a missing attribute yields an empty dict instead
        of an error from the regex search.
        """
        fallback = NO_DEFAULT if fatal else '{}'
        player_json = self._search_regex(
            r'(?s)data-zdfplayer-jsb=(["\'])(?P<json>{.+?})\1', webpage,
            'player JSON', default=fallback, group='json')
        return self._parse_json(player_json, video_id)
132
133
class ZDFIE(ZDFBaseIE):
    """Extractor for single video pages (``*.html``) on www.zdf.de."""

    _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html'
    _TESTS = [{
        # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html
        'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html',
        'md5': '34ec321e7eb34231fd88616c65c92db0',
        'info_dict': {
            'id': '210222_phx_nachgehakt_corona_protest',
            'ext': 'mp4',
            'title': 'Wohin führt der Protest in der Pandemie?',
            'description': 'md5:7d643fe7f565e53a24aac036b2122fbd',
            'duration': 1691,
            'timestamp': 1613948400,
            'upload_date': '20210221',
        },
        'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"',
    }, {
        # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html
        'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html',
        'md5': '0aff3e7bc72c8813f5e0fae333316a1d',
        'info_dict': {
            'id': '141007_ab18_10wochensommer_film',
            'ext': 'mp4',
            'title': 'Ab 18! - 10 Wochen Sommer',
            'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26',
            'duration': 2660,
            'timestamp': 1608604200,
            'upload_date': '20201222',
        },
        'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"',
    }, {
        'url': 'https://www.zdf.de/nachrichten/heute-journal/heute-journal-vom-30-12-2021-100.html',
        'info_dict': {
            'id': '211230_sendung_hjo',
            'ext': 'mp4',
            'description': 'md5:47dff85977bde9fb8cba9e9c9b929839',
            'duration': 1890.0,
            'upload_date': '20211230',
            'chapters': list,
            'thumbnail': 'md5:e65f459f741be5455c952cd820eb188e',
            'title': 'heute journal vom 30.12.2021',
            'timestamp': 1640897100,
        }
    }, {
        'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html',
        'info_dict': {
            'id': '151025_magie_farben2_tex',
            'ext': 'mp4',
            'title': 'Die Magie der Farben (2/2)',
            'description': 'md5:a89da10c928c6235401066b60a6d5c1a',
            'duration': 2615,
            'timestamp': 1465021200,
            'upload_date': '20160604',
            'thumbnail': 'https://www.zdf.de/assets/mauve-im-labor-100~768x432?cb=1464909117806',
        },
    }, {
        'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html',
        'md5': '57af4423db0455a3975d2dc4578536bc',
        'info_dict': {
            'ext': 'mp4',
            'id': 'video_funk_1770473',
            'duration': 1278,
            'description': 'Die Neue an der Schule verdreht Ismail den Kopf.',
            'title': 'Alles ist verzaubert',
            'timestamp': 1635520560,
            'upload_date': '20211029',
            'thumbnail': 'https://www.zdf.de/assets/teaser-funk-alles-ist-verzaubert-100~1920x1080?cb=1636466431799',
        },
    }, {
        # Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche
        'url': 'https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html',
        'only_matching': True,
    }, {
        # Same as https://www.3sat.de/film/spielfilm/der-hauptmann-100.html
        'url': 'https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html',
        'only_matching': True,
    }, {
        # Same as https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html, equal media ids
        'url': 'https://www.zdf.de/wissen/nano/nano-21-mai-2019-102.html',
        'only_matching': True,
    }, {
        'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html',
        'only_matching': True,
    }, {
        'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html',
        'only_matching': True,
    }, {
        'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html',
        'only_matching': True,
    }, {
        'url': 'https://www.zdf.de/arte/todliche-flucht/page-video-artede-toedliche-flucht-16-100.html',
        'info_dict': {
            'id': 'video_artede_083871-001-A',
            'ext': 'mp4',
            'title': 'Tödliche Flucht (1/6)',
            'description': 'md5:e34f96a9a5f8abd839ccfcebad3d5315',
            'duration': 3193.0,
            'timestamp': 1641355200,
            'upload_date': '20220105',
        },
        'skip': 'No longer available "Diese Seite wurde leider nicht gefunden"'
    }, {
        'url': 'https://www.zdf.de/serien/soko-stuttgart/das-geld-anderer-leute-100.html',
        'info_dict': {
            'id': '191205_1800_sendung_sok8',
            'ext': 'mp4',
            'title': 'Das Geld anderer Leute',
            'description': 'md5:cb6f660850dc5eb7d1ab776ea094959d',
            'duration': 2581.0,
            'timestamp': 1654790700,
            'upload_date': '20220609',
            'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/e2d7e55a-09f0-424e-ac73-6cac4dd65f35?layout=2400x1350',
        },
    }]

    def _extract_entry(self, url, player, content, video_id):
        """Build the info dict for one video from its content document.

        *url* is the base against which the PTMD path is resolved and is also
        sent as the referrer; *player* must provide ``apiToken``; *content* is
        the content JSON document.

        Raises ExtractorError when *content* offers neither a PTMD path nor a
        PTMD template.
        """
        title = content.get('title') or content['teaserHeadline']

        t = content['mainVideoContent']['http://zdf.de/rels/target']

        ptmd_path = t.get('http://zdf.de/rels/streams/ptmd')
        if not ptmd_path:
            ptmd_template = traverse_obj(
                t, ('streams', 'default', 'http://zdf.de/rels/streams/ptmd-template'),
                'http://zdf.de/rels/streams/ptmd-template', expected_type=compat_str)
            # Fail with a clear message instead of an AttributeError from
            # calling .replace() on None when no template is present.
            if not ptmd_template:
                raise ExtractorError('Could not extract ptmd_path', video_id=video_id)
            ptmd_path = ptmd_template.replace('{playerId}', 'ngplayer_2_4')

        info = self._extract_ptmd(
            urljoin(url, ptmd_path), video_id, player['apiToken'], url)

        thumbnails = []
        layouts = try_get(
            content, lambda x: x['teaserImageRef']['layouts'], dict)
        if layouts:
            for layout_key, layout_url in layouts.items():
                layout_url = url_or_none(layout_url)
                if not layout_url:
                    continue
                thumbnail = {
                    'url': layout_url,
                    'format_id': layout_key,
                }
                # Layout keys may embed the dimensions as "<width>x<height>".
                mobj = re.search(r'(?P<width>\d+)x(?P<height>\d+)', layout_key)
                if mobj:
                    thumbnail.update({
                        'width': int(mobj.group('width')),
                        'height': int(mobj.group('height')),
                    })
                thumbnails.append(thumbnail)

        duration = int_or_none(t.get('duration'))

        # Build a new list rather than appending to the one owned by the API
        # response. The trailing sentinel supplies the end time of the last
        # real chapter.
        anchors = try_get(t, lambda x: x['streamAnchorTag'], list) or []
        chapter_marks = [*anchors, {'anchorOffset': duration}]
        chapters = [{
            'start_time': chap.get('anchorOffset'),
            'end_time': next_chap.get('anchorOffset'),
            'title': chap.get('anchorLabel')
        } for chap, next_chap in zip(chapter_marks, chapter_marks[1:])]

        return merge_dicts(info, {
            'title': title,
            'description': content.get('leadParagraph') or content.get('teasertext'),
            'duration': duration,
            'timestamp': unified_timestamp(content.get('editorialDate')),
            'thumbnails': thumbnails,
            'chapters': chapters or None
        })

    def _extract_regular(self, url, player, video_id):
        """Extract via the content API referenced by the page's player config.

        *player* must provide the ``content`` URL and the ``apiToken``; *url*
        is sent as the referrer of the content request.
        """
        content = self._call_api(
            player['content'], video_id, 'content', player['apiToken'], url)
        return self._extract_entry(player['content'], player, content, video_id)

    def _extract_mobile(self, video_id):
        """Extract via the mobile (mediathekV2) document endpoint.

        Used as a fallback when the web page or its player config is
        unavailable.
        """
        video = self._download_json(
            'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id,
            video_id)

        document = video['document']

        title = document['titel']
        content_id = document['basename']

        formats = []
        format_urls = set()
        for f in document['formitaeten']:
            self._extract_format(content_id, formats, format_urls, f)
        self._sort_formats(formats)

        thumbnails = []
        teaser_bild = document.get('teaserBild')
        if isinstance(teaser_bild, dict):
            for thumbnail_key, thumbnail in teaser_bild.items():
                thumbnail_url = try_get(
                    thumbnail, lambda x: x['url'], compat_str)
                if thumbnail_url:
                    thumbnails.append({
                        'url': thumbnail_url,
                        'id': thumbnail_key,
                        'width': int_or_none(thumbnail.get('width')),
                        'height': int_or_none(thumbnail.get('height')),
                    })

        return {
            'id': content_id,
            'title': title,
            'description': document.get('beschreibung'),
            'duration': int_or_none(document.get('length')),
            'timestamp': unified_timestamp(document.get('date')) or unified_timestamp(
                try_get(video, lambda x: x['meta']['editorialDate'], compat_str)),
            'thumbnails': thumbnails,
            'subtitles': self._extract_subtitles(document),
            'formats': formats,
        }

    def _real_extract(self, url):
        """Extract a video, preferring the web player and falling back to the mobile API."""
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id, fatal=False)
        if webpage:
            # Pass the video id (not the page URL) so that parse errors are
            # reported against the right identifier.
            player = self._extract_player(webpage, video_id, fatal=False)
            if player:
                return self._extract_regular(url, player, video_id)

        return self._extract_mobile(video_id)
359
360
class ZDFChannelIE(ZDFBaseIE):
    """Extractor for channel/overview pages on www.zdf.de, returned as playlists."""

    _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio',
        'info_dict': {
            'id': 'das-aktuelle-sportstudio',
            'title': 'das aktuelle sportstudio | ZDF',
        },
        'playlist_mincount': 23,
    }, {
        'url': 'https://www.zdf.de/dokumentation/planet-e',
        'info_dict': {
            'id': 'planet-e',
            'title': 'planet e.',
        },
        'playlist_mincount': 50,
    }, {
        'url': 'https://www.zdf.de/filme/taunuskrimi/',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return whether this extractor handles *url*.

        The channel pattern is a superset of the single-video pattern, so any
        URL that ZDFIE accepts is left to ZDFIE.
        """
        if ZDFIE.suitable(url):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Return a playlist of the video pages linked from the channel page.

        Entries are collected from the ``data-plusbar-url`` attributes in the
        page markup, de-duplicated in order of first appearance, and delegated
        to ZDFIE.
        """
        channel_id = self._match_id(url)

        webpage = self._download_webpage(url, channel_id)

        item_urls = orderedSet(re.findall(
            r'data-plusbar-url=["\'](http.+?\.html)', webpage))
        entries = [
            self.url_result(item_url, ie=ZDFIE.ie_key())
            for item_url in item_urls]

        return self.playlist_result(
            entries, channel_id, self._og_search_title(webpage, fatal=False))