]>
Commit | Line | Data |
---|---|---|
import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    extract_attributes,
    int_or_none,
    js_to_json,
    try_get,
)
0e3ae924 | 10 | |
11 | ||
class TagesschauIE(InfoExtractor):
    # Extractor for tagesschau.de article and multimedia pages.  Media is
    # discovered through the ``data-config`` attribute of ``<div>`` tags in
    # the downloaded page.
    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'

    _TESTS = [{
        'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
        'md5': '7a7287612fa881a1ae1d087df45c2fd6',
        'info_dict': {
            'id': 'video-102143-1',
            'ext': 'mp4',
            'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
        'md5': '3c54c1f6243d279b706bde660ceec633',
        'info_dict': {
            'id': 'ts-5727-1',
            'ext': 'mp4',
            'title': 'Ganze Sendung',
        },
    }, {
        # exclusive audio
        'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
        'md5': '4cf22023c285f35e99c24d290ba58cc9',
        'info_dict': {
            'id': 'audio-29417-1',
            'ext': 'mp3',
            'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt',
        },
    }, {
        'url': 'http://www.tagesschau.de/inland/bnd-303.html',
        'md5': '12cfb212d9325b5ba0d52b625f1aa61c',
        'info_dict': {
            'id': 'bnd-303-1',
            'ext': 'mp4',
            'title': 'SPD-Gruppenbild mit Bärbel Bas nach der Fraktionssitzung | dpa',
        },
    }, {
        'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
        'info_dict': {
            'id': 'afd-parteitag-135',
            'title': 'AfD',
        },
        'playlist_count': 20,
    }, {
        'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
        'info_dict': {
            'id': 'audio-29417-1',
            'ext': 'mp3',
            'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt',
        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/100sekunden/index.html',
        'only_matching': True,
    }, {
        # playlist article with collapsing sections
        'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the media embedded in a tagesschau.de page.

        Every ``<div>`` tag carrying a ``data-config`` attribute is parsed as
        a player configuration.  Each stream in its first media array that
        is either an HLS master playlist or a ``.hi.mp3`` download becomes
        one entry.

        Returns a playlist result when more than one entry was found,
        otherwise a single info dict built from the only entry plus page
        level metadata (headline/OpenGraph/JSON-LD).

        Raises ExtractorError when the page yields no entry at all.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id') or mobj.group('path')
        display_id = video_id.lstrip('-')

        webpage = self._download_webpage(url, display_id)

        title = self._html_search_regex(
            r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
            webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False)

        entries = []
        # Counts player configs that actually carry streams; used as the
        # numeric suffix of the entry ids.
        num = 0
        for div_tag in re.findall(r'<div[^>]+>', webpage):
            raw_config = extract_attributes(div_tag).get('data-config')
            if not raw_config:
                continue
            # fatal=False: a malformed config yields None, which try_get
            # below turns into "no streams" instead of an exception.
            config = self._parse_json(
                raw_config, video_id, transform_source=js_to_json, fatal=False)
            streams = try_get(
                config, lambda x: x['mc']['_mediaArray'][0]['_mediaStreamArray'])
            if not streams:
                continue
            num += 1
            for stream in streams:
                media_url = stream.get('_stream') or ''
                formats = []
                if media_url.endswith('master.m3u8'):
                    formats = self._extract_m3u8_formats(
                        media_url, video_id, 'mp4', m3u8_id='hls')
                elif media_url.endswith('.hi.mp3') and media_url.startswith('https://download'):
                    formats = [{
                        'url': media_url,
                        'vcodec': 'none',
                    }]
                if not formats:
                    continue
                entries.append({
                    'id': '%s-%d' % (display_id, num),
                    'title': try_get(config, lambda x: x['mc']['_title']),
                    'duration': int_or_none(try_get(config, lambda x: x['mc']['_duration'])),
                    'formats': formats
                })

        if not entries:
            # Previously this fell through to entries[0] and crashed with a
            # bare IndexError; report a proper extraction error instead.
            raise ExtractorError(
                'No video or audio found on this page', expected=True)

        if len(entries) > 1:
            return self.playlist_result(entries, display_id, title)

        formats = entries[0]['formats']
        # JSON-LD only supplies optional metadata here, so its absence must
        # not abort an otherwise successful extraction.
        video_info = self._search_json_ld(webpage, video_id, default={})
        description = video_info.get('description')
        thumbnail = self._og_search_thumbnail(webpage) or video_info.get('thumbnail')
        timestamp = video_info.get('timestamp')
        title = title or video_info.get('description')

        return {
            'id': display_id,
            'title': title,
            'thumbnail': thumbnail,
            'formats': formats,
            'timestamp': timestamp,
            'description': description,
        }