]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/tagesschau.py
[ie/youtube] Suppress "Unavailable videos are hidden" warning (#10159)
[yt-dlp.git] / yt_dlp / extractor / tagesschau.py
CommitLineData
0e3ae924 1import re
0e3ae924 2
3from .common import InfoExtractor
1a2b377c 4from ..utils import (
af7585c8 5 UnsupportedError,
0f6e60bb 6 extract_attributes,
0f6e60bb 7 int_or_none,
af7585c8
M
8 js_to_json,
9 parse_iso8601,
10 try_get,
1a2b377c 11)
0e3ae924 12
13
class TagesschauIE(InfoExtractor):
    # Extractor for tagesschau.de pages; every media item is read from the
    # ``data-config`` attribute of a ``<div>`` tag found in the page HTML.
    # The class is explicitly flagged with ``_WORKING = False``.
    _WORKING = False
    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'

    _TESTS = [{
        'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
        'md5': 'ccb9359bf8c4795836e43759f3408a93',
        'info_dict': {
            'id': 'video-102143-1',
            'ext': 'mp4',
            'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
            'duration': 138,
        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
        'md5': '5c15e8f3da049e48829ec9786d835536',
        'info_dict': {
            'id': 'ts-5727-1',
            'ext': 'mp4',
            'title': 'Ganze Sendung',
            'duration': 932,
        },
    }, {
        # exclusive audio
        'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
        'md5': '4bff8f23504df56a0d86ed312d654182',
        'info_dict': {
            'id': 'audio-29417-1',
            'ext': 'mp3',
            'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet',
        },
    }, {
        'url': 'http://www.tagesschau.de/inland/bnd-303.html',
        'md5': 'f049fa1698d7564e9ca4c3325108f034',
        'info_dict': {
            'id': 'bnd-303-1',
            'ext': 'mp3',
            'title': 'Das Siegel des Bundesnachrichtendienstes | dpa',
        },
    }, {
        'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
        'info_dict': {
            'id': 'afd-parteitag-135',
            'title': 'AfD',
        },
        'playlist_mincount': 15,
    }, {
        'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
        'info_dict': {
            'id': 'audio-29417-1',
            'ext': 'mp3',
            'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet',
        },
    }, {
        'url': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-327.html',
        'info_dict': {
            'id': 'podcast-11km-327',
            'ext': 'mp3',
            'title': 'Gewalt in der Kita – Wenn Erzieher:innen schweigen',
            'upload_date': '20230322',
            'timestamp': 1679482808,
            'thumbnail': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-329~_v-original.jpg',
            'description': 'md5:dad059931fe4b3693e3656e93a249848',
        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/100sekunden/index.html',
        'only_matching': True,
    }, {
        # playlist article with collapsing sections
        'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the media embedded in the page at *url*.

        Every ``<div ...>`` opening tag is inspected for a ``data-config``
        attribute whose JSON lists media streams.  Each stream that ends in
        ``master.m3u8`` or ``.mp3`` becomes one entry.

        Returns a playlist result when more than one entry was collected,
        otherwise a single info dict built from the only entry plus page
        metadata.  Raises ``UnsupportedError`` when no entry was found.
        """
        match = self._match_valid_url(url)
        video_id = match.group('id') or match.group('path')
        display_id = video_id.lstrip('-')

        webpage = self._download_webpage(url, display_id)

        # Prefer the on-page headline span; fall back to the OpenGraph title.
        title = self._html_search_regex(
            r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
            webpage, 'title', default=None)
        if not title:
            title = self._og_search_title(webpage, fatal=False)

        entries = []
        # Counts the configs that carry streams; used as the entry id suffix.
        num = 0
        for div_tag in re.findall(r'<div[^>]+>', webpage):
            raw_config = extract_attributes(div_tag).get('data-config')
            if not raw_config:
                continue
            config = self._parse_json(
                raw_config, video_id, transform_source=js_to_json, fatal=False)
            streams = try_get(
                config, lambda x: x['mc']['_mediaArray'][0]['_mediaStreamArray'])
            if not streams:
                continue
            num += 1
            for stream in streams:
                media_url = stream.get('_stream') or ''
                if media_url.endswith('master.m3u8'):
                    formats = self._extract_m3u8_formats(
                        media_url, video_id, 'mp4', m3u8_id='hls')
                elif media_url.endswith('.mp3'):
                    formats = [{
                        'url': media_url,
                        'vcodec': 'none',
                    }]
                else:
                    formats = []
                if not formats:
                    continue
                # One entry per usable stream; streams of the same config
                # share the same numbered id.
                entries.append({
                    'id': f'{display_id}-{num}',
                    'title': try_get(config, lambda x: x['mc']['_title']),
                    'duration': int_or_none(try_get(config, lambda x: x['mc']['_duration'])),
                    'formats': formats,
                })

        if not entries:
            raise UnsupportedError(url)

        if len(entries) > 1:
            return self.playlist_result(entries, display_id, title)

        only_entry = entries[0]
        return {
            'id': display_id,
            'title': title,
            'thumbnail': self._og_search_thumbnail(webpage),
            'formats': only_entry['formats'],
            'timestamp': parse_iso8601(self._html_search_meta('date', webpage)),
            'description': self._og_search_description(webpage),
            'duration': only_entry['duration'],
        }