]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/phoenix.py
[ie/orf:on] Improve extraction (#9677)
[yt-dlp.git] / yt_dlp / extractor / phoenix.py
CommitLineData
ec5e77c5 1import re
8cc3eba7 2
ec5e77c5 3from .youtube import YoutubeIE
4from .zdf import ZDFBaseIE
5from ..compat import compat_str
6from ..utils import (
7 int_or_none,
8 merge_dicts,
b73612a2 9 try_get,
ec5e77c5 10 unified_timestamp,
b73612a2 11 urljoin,
ec5e77c5 12)
c15c8e25 13
ec5e77c5 14
class PhoenixIE(ZDFBaseIE):
    # Extractor for phoenix.de article pages whose URL ends in "-a-<digits>.html".
    # The numeric suffix is captured as the article id and is used to query the
    # site's JSON endpoints in _real_extract below.
    IE_NAME = 'phoenix.de'
    _VALID_URL = r'https?://(?:www\.)?phoenix\.de/(?:[^/]+/)*[^/?#&]*-a-(?P<id>\d+)\.html'
    _TESTS = [{
        # Same as https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html
        'url': 'https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html',
        'md5': '34ec321e7eb34231fd88616c65c92db0',
        'info_dict': {
            'id': '210222_phx_nachgehakt_corona_protest',
            'ext': 'mp4',
            'title': 'Wohin führt der Protest in der Pandemie?',
            'description': 'md5:7d643fe7f565e53a24aac036b2122fbd',
            'duration': 1691,
            'timestamp': 1613902500,
            'upload_date': '20210221',
            'uploader': 'Phoenix',
            'series': 'corona nachgehakt',
            'episode': 'Wohin führt der Protest in der Pandemie?',
        },
    }, {
        # Youtube embed
        'url': 'https://www.phoenix.de/sendungen/gespraeche/phoenix-streitgut-brennglas-corona-a-1965505.html',
        'info_dict': {
            'id': 'hMQtqFYjomk',
            'ext': 'mp4',
            'title': 'phoenix streitgut: Brennglas Corona - Wie gerecht ist unsere Gesellschaft?',
            'description': 'md5:ac7a02e2eb3cb17600bc372e4ab28fdd',
            'duration': 3509,
            'upload_date': '20201219',
            'uploader': 'phoenix',
            'uploader_id': 'phoenix',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.phoenix.de/entwicklungen-in-russland-a-2044720.html',
        'only_matching': True,
    }, {
        # no media
        'url': 'https://www.phoenix.de/sendungen/dokumentationen/mit-dem-jumbo-durch-die-nacht-a-89625.html',
        'only_matching': True,
    }, {
        # Same as https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html
        'url': 'https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the media referenced by the first 'absaetze' entry of an article.

        The article JSON is fetched by the id matched from ``url``. If its first
        'absaetze' entry has ``typ == 'video-youtube'``, the result is a
        ``url_result`` handing the entry's 'id' to ``YoutubeIE``. Otherwise the
        details JSON is fetched, formats are obtained through ``_extract_ptmd``
        and the metadata collected here is merged with that result.

        Missing mandatory keys ('absaetze', the YouTube 'id', the Nielsen
        'assetid', or 'title' when the article supplies none) are accessed by
        plain indexing and therefore propagate as lookup errors.
        """
        article_id = self._match_id(url)

        article_url = 'https://www.phoenix.de/response/id/%s' % article_id
        article = self._download_json(
            article_url, article_id, 'Downloading article JSON')

        # Only the first entry of 'absaetze' is considered.
        first_block = article['absaetze'][0]
        title = first_block.get('titel') or article.get('subtitel')

        if first_block.get('typ') == 'video-youtube':
            youtube_id = first_block['id']
            return self.url_result(
                youtube_id, ie=YoutubeIE.ie_key(), video_id=youtube_id,
                video_title=title)

        # Prefer 'basename'; fall back to 'content'. The value is stringified
        # as-is before being sent as the 'id' query parameter.
        video_id = compat_str(first_block.get('basename') or first_block.get('content'))

        details_query = {
            'ak': 'web',
            'ptmd': 'true',
            'id': video_id,
            'profile': 'player2',
        }
        details = self._download_json(
            'https://www.phoenix.de/php/mediaplayer/data/beitrags_details.php',
            video_id, 'Downloading details JSON', query=details_query)

        if not title:
            title = details['title']
        content_id = details['tracking']['nielsen']['content']['assetid']

        ptmd_url = 'https://tmd.phoenix.de/tmd/2/ngplayer_2_3/vod/ptmd/phoenix/%s' % content_id
        info = self._extract_ptmd(ptmd_url, content_id, None, url)

        def nielsen_content(field):
            # Build a try_get accessor for one field of the Nielsen content record.
            return lambda data: data['tracking']['nielsen']['content'][field]

        duration = int_or_none(try_get(details, nielsen_content('length')))
        series = try_get(details, nielsen_content('program'), compat_str)
        timestamp = unified_timestamp(details.get('editorialDate'))

        # The title doubles as the episode name only for 'episode' content.
        episode = None
        if details.get('contentType') == 'episode':
            episode = title

        layouts = try_get(
            details, lambda data: data['teaserImageRef']['layouts'], dict) or {}

        thumbnails = []
        for layout_name, raw_url in layouts.items():
            absolute_url = urljoin(url, raw_url)
            if absolute_url:
                entry = {
                    'url': absolute_url,
                }
                # Layout keys shaped like "<width>x<height>" carry the dimensions.
                size = re.match('^([0-9]+)x([0-9]+)$', layout_name)
                if size:
                    width, height = size.groups()
                    entry['width'] = int(width)
                    entry['height'] = int(height)
                thumbnails.append(entry)

        metadata = {
            'id': content_id,
            'title': title,
            'description': details.get('leadParagraph'),
            'duration': duration,
            'thumbnails': thumbnails,
            'timestamp': timestamp,
            'uploader': details.get('tvService'),
            'series': series,
            'episode': episode,
        }
        return merge_dicts(info, metadata)
b73612a2 127 'uploader': details.get('tvService'),
128 'series': series,
129 'episode': episode,
ec5e77c5 130 })