]>
Commit | Line | Data |
---|---|---|
0e3ae924 | 1 | # -*- coding: utf-8 -*- |
2 | from __future__ import unicode_literals | |
3 | ||
4 | import re | |
0e3ae924 | 5 | |
6 | from .common import InfoExtractor | |
ca4456ed | 7 | from ..utils import parse_filesize |
0e3ae924 | 8 | |
9 | ||
class TagesschauIE(InfoExtractor):
    """Extractor for multimedia pages on tagesschau.de.

    Two page layouts are handled by ``_real_extract``:

    * pages exposing a ``twitter:player`` meta tag, where the media URLs are
      scraped from the referenced player page, and
    * pages without that tag, where the formats are read from the list of
      download buttons embedded in the page itself.
    """

    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:[^/]+/)*?[^/#?]+?(?P<id>-?[0-9]+)(?:~_[^/#?]+?)?\.html'

    _TESTS = [{
        'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
        'md5': '917a228bc7df7850783bc47979673a09',
        'info_dict': {
            'id': '102143',
            'ext': 'mp4',
            'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
            'description': 'md5:171feccd9d9b3dd54d05d501568f6359',
            # Raw string: "\." is a regex escape, not a string escape.
            'thumbnail': r're:^https?:.*\.jpg$',
        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
        'md5': '3c54c1f6243d279b706bde660ceec633',
        'info_dict': {
            'id': '5727',
            'ext': 'mp4',
            'description': 'md5:695c01bfd98b7e313c501386327aea59',
            'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr',
            'thumbnail': r're:^https?:.*\.jpg$',
        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/politikimradio/audio-18407.html',
        'md5': 'aef45de271c4bf0a5db834aa40bf774c',
        'info_dict': {
            'id': '18407',
            'ext': 'mp3',
            'title': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich',
            'description': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich',
            'thumbnail': r're:^https?:.*\.jpg$',
        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
        'only_matching': True,
    }]

    # Extra format fields merged into a player-page format, keyed by the
    # "quality" code found next to the media URL.
    _FORMATS = {
        's': {'width': 256, 'height': 144, 'quality': 1},
        'm': {'width': 512, 'height': 288, 'quality': 2},
        'l': {'width': 960, 'height': 544, 'quality': 3},
    }

    def _extract_player_formats(self, playerpage):
        """Return the format dicts for every media entry found in *playerpage*."""
        formats = []
        for media in re.finditer(
                r'''(?x)
                    (?P<q_url>["\'])(?P<url>http://media.+?)(?P=q_url)
                    ,\s*type:(?P<q_type>["\'])(?P<type>video|audio)/(?P<ext>.+?)(?P=q_type)
                    (?:,\s*quality:(?P<q_quality>["\'])(?P<quality>.+?)(?P=q_quality))?
                    ''', playerpage):
            ext = media.group('ext')
            # The quality part of the pattern is optional, so this may be None.
            res = media.group('quality')
            fmt = {
                'format_id': '%s_%s' % (res, ext) if res else ext,
                'url': media.group('url'),
                'ext': ext,
                'vcodec': 'none' if media.group('type') == 'audio' else None,
            }
            # Unknown (or missing) quality codes simply add nothing.
            fmt.update(self._FORMATS.get(res, {}))
            formats.append(fmt)
        return formats

    def _extract_download_formats(self, webpage):
        """Return the format dicts described by the download buttons in *webpage*."""
        download_text = self._search_regex(
            r'(?s)<p>Wir bieten dieses Video in folgenden Formaten zum Download an:</p>\s*<div class="controls">(.*?)</div>\s*<p>',
            webpage, 'download links')
        formats = []
        for link in re.finditer(
                r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>',
                download_text):
            link_url = link.group('url')
            # The format ID is the second-to-last dot-separated component of
            # the file name in the link URL.
            format_id = self._search_regex(
                r'.*/[^/.]+\.([^/]+)\.[^/.]+', link_url, 'format ID')
            fmt = {
                'format_id': format_id,
                'url': link_url,
                'format_name': link.group('name'),
            }
            # The button's title attribute carries the technical details,
            # separated by "&#10;" entities; they are optional metadata, so a
            # non-matching title just leaves the basic format entry.
            details = re.match(
                r'''(?x)
                    Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;
                    (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;
                    (?P<vbr>[0-9]+)kbps&\#10;
                    Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;
                    Größe:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''',
                link.group('title'))
            if details:
                fmt.update({
                    'format_note': details.group('audio_desc'),
                    'vcodec': details.group('vcodec'),
                    'width': int(details.group('width')),
                    'height': int(details.group('height')),
                    'abr': int(details.group('abr')),
                    'vbr': int(details.group('vbr')),
                    'filesize_approx': parse_filesize(details.group('filesize_approx')),
                })
            formats.append(fmt)
        return formats

    def _real_extract(self, url):
        """Extract id, title, thumbnail, description and formats for *url*.

        Returns a dict with the keys ``id``, ``title``, ``thumbnail``,
        ``formats`` and ``description``.
        """
        video_id = self._match_id(url)
        # The ID group of _VALID_URL may capture a leading "-"; drop it.
        display_id = video_id.lstrip('-')
        webpage = self._download_webpage(url, display_id)

        player_url = self._html_search_meta(
            'twitter:player', webpage, 'player URL', default=None)
        if player_url:
            playerpage = self._download_webpage(
                player_url, display_id, 'Downloading player page')
            formats = self._extract_player_formats(playerpage)
            thumbnail = self._og_search_thumbnail(playerpage)
            title = self._og_search_title(webpage).strip()
            description = self._og_search_description(webpage)
            # NOTE(review): the description lookup is assumed to be able to
            # return None when the tag is missing; guard so that .strip()
            # cannot raise AttributeError in that case.
            if description is not None:
                description = description.strip()
        else:
            formats = self._extract_download_formats(webpage)
            thumbnail = self._og_search_thumbnail(webpage)
            description = self._html_search_regex(
                r'(?s)<p class="teasertext">(.*?)</p>',
                webpage, 'description', default=None)
            title = self._html_search_regex(
                r'<span class="headline".*?>(.*?)</span>', webpage, 'title')

        self._sort_formats(formats)

        return {
            'id': display_id,
            'title': title,
            'thumbnail': thumbnail,
            'formats': formats,
            'description': description,
        }