jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/tagesschau.py
[cleanup] Misc (#8598)
[yt-dlp.git] / yt_dlp / extractor / tagesschau.py
1 import re
2
3 from .common import InfoExtractor
4 from ..utils import (
5 UnsupportedError,
6 extract_attributes,
7 int_or_none,
8 js_to_json,
9 parse_iso8601,
10 try_get,
11 )
12
13
class TagesschauIE(InfoExtractor):
    # Extractor for tagesschau.de ``.html`` pages. Media is discovered through
    # the JSON player configuration stored in the ``data-config`` attribute of
    # ``<div>`` tags on the page.
    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'

    _TESTS = [{
        'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
        'md5': 'ccb9359bf8c4795836e43759f3408a93',
        'info_dict': {
            'id': 'video-102143-1',
            'ext': 'mp4',
            'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
            'duration': 138,
        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
        'md5': '5c15e8f3da049e48829ec9786d835536',
        'info_dict': {
            'id': 'ts-5727-1',
            'ext': 'mp4',
            'title': 'Ganze Sendung',
            'duration': 932,
        },
    }, {
        # exclusive audio
        'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
        'md5': '4bff8f23504df56a0d86ed312d654182',
        'info_dict': {
            'id': 'audio-29417-1',
            'ext': 'mp3',
            'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet',
        },
    }, {
        'url': 'http://www.tagesschau.de/inland/bnd-303.html',
        'md5': 'f049fa1698d7564e9ca4c3325108f034',
        'info_dict': {
            'id': 'bnd-303-1',
            'ext': 'mp3',
            'title': 'Das Siegel des Bundesnachrichtendienstes | dpa',
        },
    }, {
        'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
        'info_dict': {
            'id': 'afd-parteitag-135',
            'title': 'AfD',
        },
        'playlist_mincount': 15,
    }, {
        'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
        'info_dict': {
            'id': 'audio-29417-1',
            'ext': 'mp3',
            'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet',
        },
    }, {
        'url': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-327.html',
        'info_dict': {
            'id': 'podcast-11km-327',
            'ext': 'mp3',
            'title': 'Gewalt in der Kita – Wenn Erzieher:innen schweigen',
            'upload_date': '20230322',
            'timestamp': 1679482808,
            'thumbnail': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-329~_v-original.jpg',
            'description': 'md5:dad059931fe4b3693e3656e93a249848',
        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/100sekunden/index.html',
        'only_matching': True,
    }, {
        # playlist article with collapsing sections
        'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the media embedded in a tagesschau.de page.

        Every ``<div>`` tag carrying a ``data-config`` attribute is parsed as
        a player configuration. Each stream in it that ends in
        ``master.m3u8`` (HLS) or ``.mp3`` (audio only) produces one entry.

        Returns a playlist result when more than one entry was collected,
        otherwise a single info dict enriched with page-level metadata.
        Raises UnsupportedError when no usable stream is found.
        """
        match = self._match_valid_url(url)
        # Fall back to the whole path when the ``id`` group is empty.
        video_id = match.group('id') or match.group('path')
        display_id = video_id.lstrip('-')

        webpage = self._download_webpage(url, display_id)

        # Prefer the on-page headline; otherwise use the OpenGraph title.
        title = self._html_search_regex(
            r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
            webpage, 'title', default=None)
        if not title:
            title = self._og_search_title(webpage, fatal=False)

        entries = []
        player_count = 0
        for div_tag in re.findall(r'<div[^>]+>', webpage):
            raw_config = extract_attributes(div_tag).get('data-config')
            if not raw_config:
                continue
            config = self._parse_json(
                raw_config, video_id, transform_source=js_to_json, fatal=False)
            streams = try_get(
                config, lambda x: x['mc']['_mediaArray'][0]['_mediaStreamArray'])
            if not streams:
                continue

            # Only players that expose a stream array are counted; the
            # counter becomes the numeric suffix of the entry id.
            player_count += 1
            entry_id = '%s-%d' % (display_id, player_count)
            entry_title = try_get(config, lambda x: x['mc']['_title'])
            entry_duration = int_or_none(
                try_get(config, lambda x: x['mc']['_duration']))

            for stream in streams:
                stream_url = stream.get('_stream') or ''
                if stream_url.endswith('master.m3u8'):
                    formats = self._extract_m3u8_formats(
                        stream_url, video_id, 'mp4', m3u8_id='hls')
                elif stream_url.endswith('.mp3'):
                    formats = [{
                        'url': stream_url,
                        'vcodec': 'none',
                    }]
                else:
                    formats = []

                # Streams of any other kind yield no entry.
                if formats:
                    entries.append({
                        'id': entry_id,
                        'title': entry_title,
                        'duration': entry_duration,
                        'formats': formats
                    })

        if not entries:
            raise UnsupportedError(url)

        if len(entries) > 1:
            return self.playlist_result(entries, display_id, title)

        # Exactly one entry: report it under the page's id and title, with
        # page-level metadata added.
        sole_entry = entries[0]
        return {
            'id': display_id,
            'title': title,
            'thumbnail': self._og_search_thumbnail(webpage),
            'formats': sole_entry['formats'],
            'timestamp': parse_iso8601(self._html_search_meta('date', webpage)),
            'description': self._og_search_description(webpage),
            'duration': sole_entry['duration'],
        }