]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/tagesschau.py
[ie/generic] Improve direct video link ext detection (#8340)
[yt-dlp.git] / yt_dlp / extractor / tagesschau.py
CommitLineData
0e3ae924 1import re
0e3ae924 2
3from .common import InfoExtractor
1a2b377c 4from ..utils import (
af7585c8 5 UnsupportedError,
0f6e60bb 6 extract_attributes,
0f6e60bb 7 int_or_none,
af7585c8
M
8 js_to_json,
9 parse_iso8601,
10 try_get,
1a2b377c 11)
0e3ae924 12
13
class TagesschauIE(InfoExtractor):
    """Extractor for media embedded in tagesschau.de article and player pages.

    Every ``<div>`` carrying a ``data-config`` attribute is treated as one
    embedded player. Pages with exactly one usable player are returned as a
    single media item; pages with several are returned as a playlist.
    """

    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'

    _TESTS = [{
        'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
        'md5': 'ccb9359bf8c4795836e43759f3408a93',
        'info_dict': {
            'id': 'video-102143-1',
            'ext': 'mp4',
            'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
            'duration': 138,
        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
        'md5': '5c15e8f3da049e48829ec9786d835536',
        'info_dict': {
            'id': 'ts-5727-1',
            'ext': 'mp4',
            'title': 'Ganze Sendung',
            'duration': 932,
        },
    }, {
        # exclusive audio
        'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
        'md5': '4bff8f23504df56a0d86ed312d654182',
        'info_dict': {
            'id': 'audio-29417-1',
            'ext': 'mp3',
            'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet',
        },
    }, {
        'url': 'http://www.tagesschau.de/inland/bnd-303.html',
        'md5': 'f049fa1698d7564e9ca4c3325108f034',
        'info_dict': {
            'id': 'bnd-303-1',
            'ext': 'mp3',
            'title': 'Das Siegel des Bundesnachrichtendienstes | dpa',
        },
    }, {
        'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
        'info_dict': {
            'id': 'afd-parteitag-135',
            'title': 'AfD',
        },
        'playlist_mincount': 15,
    }, {
        'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
        'info_dict': {
            'id': 'audio-29417-1',
            'ext': 'mp3',
            'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet',
        },
    }, {
        'url': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-327.html',
        'info_dict': {
            'id': 'podcast-11km-327',
            'ext': 'mp3',
            'title': 'Gewalt in der Kita – Wenn Erzieher:innen schweigen',
            'upload_date': '20230322',
            'timestamp': 1679482808,
            'thumbnail': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-329~_v-original.jpg',
            'description': 'md5:dad059931fe4b3693e3656e93a249848',
        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/100sekunden/index.html',
        'only_matching': True,
    }, {
        # playlist article with collapsing sections
        'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html',
        'only_matching': True,
    }]

    def _extract_stream_formats(self, streams, video_id):
        """Return the formats found in one player's ``_mediaStreamArray``.

        Only HLS master playlists (URLs ending in ``master.m3u8``) and direct
        ``.mp3`` links are used; every other stream URL is ignored. Items that
        are not dicts and ``_stream`` values that are not strings are skipped
        instead of raising.
        """
        formats = []
        for stream in streams:
            if not isinstance(stream, dict):
                continue
            stream_urls = stream.get('_stream')
            # Defensive: accept a list of URLs as well as a single URL string.
            if not isinstance(stream_urls, list):
                stream_urls = [stream_urls]
            for media_url in stream_urls:
                if not isinstance(media_url, str):
                    continue
                if media_url.endswith('master.m3u8'):
                    formats.extend(self._extract_m3u8_formats(
                        media_url, video_id, 'mp4', m3u8_id='hls'))
                elif media_url.endswith('.mp3'):
                    formats.append({
                        'url': media_url,
                        'vcodec': 'none',
                    })
        return formats

    def _real_extract(self, url):
        """Extract the media item(s) embedded in the page at ``url``.

        Returns a playlist result when more than one player yields formats,
        otherwise a single info dict enriched with page-level metadata.
        Raises ``UnsupportedError`` when no player yields any format.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id') or mobj.group('path')
        display_id = video_id.lstrip('-')

        webpage = self._download_webpage(url, display_id)

        title = self._html_search_regex(
            r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
            webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False)

        entries = []
        # Counts players that expose a stream array; used as the id suffix.
        num = 0
        for div_tag in re.findall(r'<div[^>]+>', webpage):
            config = extract_attributes(div_tag).get('data-config')
            if not config:
                continue
            video = self._parse_json(config, video_id, transform_source=js_to_json, fatal=False)
            streams = try_get(
                video, lambda x: x['mc']['_mediaArray'][0]['_mediaStreamArray'], list)
            if not streams:
                continue
            num += 1
            # One entry per player: all of its usable streams are merged into a
            # single formats list so that no two entries share the same id.
            formats = self._extract_stream_formats(streams, video_id)
            if not formats:
                continue
            entries.append({
                'id': '%s-%d' % (display_id, num),
                'title': try_get(video, lambda x: x['mc']['_title']),
                'duration': int_or_none(try_get(video, lambda x: x['mc']['_duration'])),
                'formats': formats,
            })

        if not entries:
            raise UnsupportedError(url)

        if len(entries) > 1:
            return self.playlist_result(entries, display_id, title)

        # Single media item: use the page-level title and metadata.
        entry = entries[0]
        return {
            'id': display_id,
            'title': title,
            'thumbnail': self._og_search_thumbnail(webpage),
            'formats': entry['formats'],
            'timestamp': parse_iso8601(self._html_search_meta('date', webpage)),
            'description': self._og_search_description(webpage),
            'duration': entry['duration'],
        }