]>
Commit | Line | Data |
---|---|---|
dcdb292f | 1 | # coding: utf-8 |
0e3ae924 | 2 | from __future__ import unicode_literals |
3 | ||
4 | import re | |
0e3ae924 | 5 | |
6 | from .common import InfoExtractor | |
1a2b377c | 7 | from ..utils import ( |
6a0f9a24 | 8 | js_to_json, |
0f6e60bb | 9 | extract_attributes, |
10 | try_get, | |
11 | int_or_none, | |
1a2b377c | 12 | ) |
0e3ae924 | 13 | |
14 | ||
class TagesschauIE(InfoExtractor):
    """Extract video and audio media embedded in tagesschau.de pages.

    Media is discovered through the ``data-config`` attribute of ``<div>``
    tags; a page with several usable players is returned as a playlist.
    """

    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'

    _TESTS = [{
        'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
        'md5': '7a7287612fa881a1ae1d087df45c2fd6',
        'info_dict': {
            'id': 'video-102143-1',
            'ext': 'mp4',
            'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
        'md5': '3c54c1f6243d279b706bde660ceec633',
        'info_dict': {
            'id': 'ts-5727-1',
            'ext': 'mp4',
            'title': 'Ganze Sendung',
        },
    }, {
        # exclusive audio
        'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
        'md5': '4cf22023c285f35e99c24d290ba58cc9',
        'info_dict': {
            'id': 'audio-29417-1',
            'ext': 'mp3',
            'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt',
        },
    }, {
        'url': 'http://www.tagesschau.de/inland/bnd-303.html',
        'md5': '12cfb212d9325b5ba0d52b625f1aa61c',
        'info_dict': {
            'id': 'bnd-303-1',
            'ext': 'mp4',
            'title': 'SPD-Gruppenbild mit Bärbel Bas nach der Fraktionssitzung | dpa',
        },
    }, {
        'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
        'info_dict': {
            'id': 'afd-parteitag-135',
            'title': 'AfD',
        },
        'playlist_count': 20,
    }, {
        'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
        'info_dict': {
            'id': 'audio-29417-1',
            'ext': 'mp3',
            'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt',
        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/100sekunden/index.html',
        'only_matching': True,
    }, {
        # playlist article with collapsing sections
        'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html',
        'only_matching': True,
    }]

    def _extract_entries(self, webpage, video_id, display_id):
        """Return one info dict per embedded player that has usable formats.

        Every ``<div>`` tag carrying a ``data-config`` attribute is parsed as
        (JavaScript-flavoured) JSON. Entries are numbered in page order as
        ``<display_id>-<n>``; the counter advances for every player that
        declares a stream array, even if none of its streams is usable.
        """
        entries = []
        num = 0
        for div_tag in re.findall(r'<div[^>]+>', webpage):
            config = extract_attributes(div_tag).get('data-config')
            if not config:
                continue
            video = self._parse_json(
                config, video_id, transform_source=js_to_json, fatal=False)
            streams = try_get(
                video, lambda x: x['mc']['_mediaArray'][0]['_mediaStreamArray'])
            if not streams:
                continue
            num += 1

            # Collect the formats of ALL streams of this player. Resetting the
            # list per stream would either drop earlier streams or emit several
            # entries sharing the same id.
            formats = []
            for stream in streams:
                media_url = stream.get('_stream') or ''
                if media_url.endswith('master.m3u8'):
                    formats.extend(self._extract_m3u8_formats(
                        media_url, video_id, 'mp4', m3u8_id='hls'))
                elif media_url.endswith('.hi.mp3') and media_url.startswith('https://download'):
                    formats.append({
                        'url': media_url,
                        'vcodec': 'none',
                    })
            if not formats:
                continue
            self._sort_formats(formats)

            entries.append({
                'id': '%s-%d' % (display_id, num),
                'title': try_get(video, lambda x: x['mc']['_title']),
                'duration': int_or_none(try_get(video, lambda x: x['mc']['_duration'])),
                'formats': formats,
            })
        return entries

    def _real_extract(self, url):
        """Extract the media of a tagesschau.de page.

        Returns a playlist result when more than one player with usable
        formats is found, otherwise a single info dict enriched with the
        page-level metadata (Open Graph / JSON-LD).

        Raises ExtractorError (expected) when the page has no usable media.
        """
        # Imported locally so this block does not depend on the file's
        # top-level import list being edited.
        from ..utils import ExtractorError

        mobj = self._match_valid_url(url)
        video_id = mobj.group('id') or mobj.group('path')
        display_id = video_id.lstrip('-')

        webpage = self._download_webpage(url, display_id)

        title = self._html_search_regex(
            r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
            webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False)

        entries = self._extract_entries(webpage, video_id, display_id)
        if not entries:
            # Previously this surfaced as a bare IndexError on entries[0].
            raise ExtractorError('No media found on this page', expected=True)
        if len(entries) > 1:
            return self.playlist_result(entries, display_id, title)

        entry = entries[0]
        # JSON-LD only supplies optional metadata; its absence must not abort
        # an extraction that has already found formats.
        video_info = self._search_json_ld(webpage, video_id, default={})
        description = video_info.get('description')
        thumbnail = self._og_search_thumbnail(webpage) or video_info.get('thumbnail')
        timestamp = video_info.get('timestamp')
        # Last-resort title: fall back to the JSON-LD description.
        title = title or video_info.get('description')

        return {
            'id': display_id,
            'title': title,
            'thumbnail': thumbnail,
            'formats': entry['formats'],
            'timestamp': timestamp,
            'description': description,
            'duration': entry.get('duration'),
        }