]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/ard.py
[ard:mediathek] Add audio test
[yt-dlp.git] / youtube_dl / extractor / ard.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from .generic import GenericIE
8 from ..utils import (
9 determine_ext,
10 ExtractorError,
11 get_element_by_attribute,
12 qualities,
13 int_or_none,
14 parse_duration,
15 unified_strdate,
16 xpath_text,
17 parse_xml,
18 )
19
20
class ARDMediathekIE(InfoExtractor):
    """Extractor for ARD Mediathek pages.

    Handles URLs on ardmediathek.de and mediathek.daserste.de. Formats are
    taken either from ``mediaCollection.addMediaStream(...)`` calls embedded
    in the page or, when none are present, from the media JSON document at
    ``/play/media/<id>``.
    """
    IE_NAME = 'ARD:mediathek'
    _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'

    _TESTS = [{
        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
        'only_matching': True,
    }, {
        'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916',
        'info_dict': {
            'id': '22490580',
            'ext': 'mp4',
            'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)',
            'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.',
        },
        'skip': 'Blocked outside of Germany',
    }, {
        # audio
        'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
        'md5': '219d94d8980b4f538c7fcb0865eb7f2c',
        'info_dict': {
            'id': '28488308',
            'ext': 'mp3',
            'title': 'Tod eines Fußballers',
            'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef',
            'duration': 3240,
        },
    }]

    def _extract_media_info(self, media_info_url, webpage, video_id):
        """Download the media JSON and turn it into a partial info dict.

        The returned dict carries 'id', 'duration', 'thumbnail', 'formats'
        and 'subtitles'; callers are expected to add title/description.

        Raises ExtractorError (expected) when no formats were found and
        either the page contains the '"fsk"' marker or the JSON sets
        '_geoblocked'.
        """
        media_info = self._download_json(
            media_info_url, video_id, 'Downloading media JSON')

        formats = self._extract_formats(media_info, video_id)

        if not formats:
            if '"fsk"' in webpage:
                raise ExtractorError(
                    'This video is only available after 20:00', expected=True)
            elif media_info.get('_geoblocked'):
                raise ExtractorError('This video is not available due to geo restriction', expected=True)

        self._sort_formats(formats)

        duration = int_or_none(media_info.get('_duration'))
        thumbnail = media_info.get('_previewImage')

        subtitles = {}
        subtitle_url = media_info.get('_subtitleUrl')
        if subtitle_url:
            subtitles['de'] = [{
                'ext': 'srt',
                'url': subtitle_url,
            }]

        return {
            'id': video_id,
            'duration': duration,
            'thumbnail': thumbnail,
            'formats': formats,
            'subtitles': subtitles,
        }

    def _extract_formats(self, media_info, video_id):
        """Build the (unsorted) list of format dicts from the media JSON.

        Walks '_mediaArray' -> '_mediaStreamArray' -> '_stream'. f4m and
        m3u8 manifests are expanded through the corresponding helpers;
        anything else becomes an RTMP format (when '_server' is an rtmp
        URL) or a plain HTTP format. Streams that are neither are skipped.
        """
        type_ = media_info.get('_type')
        # ``or []`` also covers keys that are present but null in the JSON.
        media_array = media_info.get('_mediaArray') or []
        formats = []
        for num, media in enumerate(media_array):
            for stream in media.get('_mediaStreamArray') or []:
                stream_urls = stream.get('_stream')
                if not stream_urls:
                    continue
                # '_stream' is either a single URL or a list of URLs.
                if not isinstance(stream_urls, list):
                    stream_urls = [stream_urls]
                quality = stream.get('_quality')
                server = stream.get('_server')
                for stream_url in stream_urls:
                    ext = determine_ext(stream_url)
                    if ext == 'f4m':
                        formats.extend(self._extract_f4m_formats(
                            stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
                            video_id, preference=-1, f4m_id='hds'))
                    elif ext == 'm3u8':
                        formats.extend(self._extract_m3u8_formats(
                            stream_url, video_id, 'mp4', preference=1, m3u8_id='hls'))
                    else:
                        if server and server.startswith('rtmp'):
                            f = {
                                'url': server,
                                'play_path': stream_url,
                                'format_id': 'a%s-rtmp-%s' % (num, quality),
                            }
                        elif stream_url.startswith('http'):
                            f = {
                                'url': stream_url,
                                'format_id': 'a%s-%s-%s' % (num, ext, quality)
                            }
                        else:
                            continue
                        # Resolution is encoded in some file names as
                        # ..._<width>x<height>.mp4
                        m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', stream_url)
                        if m:
                            f.update({
                                'width': int(m.group('width')),
                                'height': int(m.group('height')),
                            })
                        if type_ == 'audio':
                            f['vcodec'] = 'none'
                        formats.append(f)
        return formats

    def _real_extract(self, url):
        """Extract the info dict for a Mediathek page URL.

        Raises ExtractorError (expected) when the page states that the item
        is no longer available or is restricted to the 20:00-06:00 window.
        URLs carrying an ``rss`` query parameter whose document root is
        <rss> are delegated to GenericIE's RSS handling.
        """
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # A documentId query parameter takes precedence over the path part.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        webpage = self._download_webpage(url, video_id)

        if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage:
            raise ExtractorError('Video %s is no longer available' % video_id, expected=True)

        if 'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.' in webpage:
            raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True)

        if re.search(r'[\?&]rss($|[=&])', url):
            doc = parse_xml(webpage)
            if doc.tag == 'rss':
                return GenericIE()._extract_rss(url, video_id, doc)

        title = self._html_search_regex(
            [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
             r'<meta name="dcterms.title" content="(.*?)"/>',
             r'<h4 class="headline">(.*?)</h4>'],
            webpage, 'title')
        description = self._html_search_meta(
            'dcterms.abstract', webpage, 'description', default=None)
        if description is None:
            description = self._html_search_meta(
                'description', webpage, 'meta description')

        # Thumbnail is sometimes not present.
        # It is in the mobile version, but that seems to use a different URL
        # structure altogether.
        thumbnail = self._og_search_thumbnail(webpage, default=None)

        media_streams = re.findall(r'''(?x)
            mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
            "([^"]+)"''', webpage)

        if media_streams:
            QUALITIES = qualities(['lo', 'hi', 'hq'])
            formats = []
            for furl in set(media_streams):
                if furl.endswith('.f4m'):
                    fid = 'f4m'
                else:
                    # The format id is the second-to-last dot-separated
                    # component of the URL (e.g. '....hq.mp4' -> 'hq').
                    fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
                    fid = fid_m.group(1) if fid_m else None
                formats.append({
                    'quality': QUALITIES(fid),
                    'format_id': fid,
                    'url': furl,
                })
            self._sort_formats(formats)
            info = {
                'formats': formats,
            }
        else:  # request JSON file
            info = self._extract_media_info(
                'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id)

        info.update({
            'id': video_id,
            'title': title,
            'description': description,
            # Prefer the og:image thumbnail, but do not discard the one
            # obtained from the media JSON when the page provides none.
            'thumbnail': thumbnail or info.get('thumbnail'),
        })

        return info
204
205
class ARDIE(InfoExtractor):
    """Extractor for daserste.de video pages (``.../videos/<name>-<id>.html``).

    Metadata and formats are read from the ``~playerXml.xml`` document that
    sits next to the HTML page.
    """
    _VALID_URL = r'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
    _TEST = {
        'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
        'md5': 'd216c3a86493f9322545e045ddc3eb35',
        'info_dict': {
            'display_id': 'die-story-im-ersten-mission-unter-falscher-flagge',
            'id': '100',
            'ext': 'mp4',
            'duration': 2600,
            'title': 'Die Story im Ersten: Mission unter falscher Flagge',
            'upload_date': '20140804',
            'thumbnail': r're:^https?://.*\.jpg$',
        }
    }

    def _real_extract(self, url):
        """Extract the info dict for a daserste.de video page URL."""
        mobj = re.match(self._VALID_URL, url)
        display_id = mobj.group('display_id')

        player_url = mobj.group('mainurl') + '~playerXml.xml'
        doc = self._download_xml(player_url, display_id)
        video_node = doc.find('./video')
        upload_date = unified_strdate(xpath_text(
            video_node, './broadcastDate'))
        thumbnail = xpath_text(video_node, './/teaserImage//variant/url')

        formats = []
        for a in video_node.findall('.//asset'):
            # xpath_text is used (as for broadcastDate above) so that a
            # missing optional child yields None instead of raising
            # AttributeError on ``.text``.
            file_name = xpath_text(a, './fileName')
            if not file_name:
                # Nothing playable can be built without a file name.
                continue
            f = {
                'format_id': a.attrib['type'],
                'width': int_or_none(xpath_text(a, './frameWidth')),
                'height': int_or_none(xpath_text(a, './frameHeight')),
                'vbr': int_or_none(xpath_text(a, './bitrateVideo')),
                'abr': int_or_none(xpath_text(a, './bitrateAudio')),
                'vcodec': xpath_text(a, './codecVideo'),
                'tbr': int_or_none(xpath_text(a, './totalBitrate')),
            }
            server_prefix = xpath_text(a, './serverPrefix')
            if server_prefix:
                # Server prefix + file name pair; the key is 'play_path',
                # matching the RTMP formats built by ARDMediathekIE.
                f['url'] = server_prefix
                f['play_path'] = file_name
            else:
                f['url'] = file_name
            formats.append(f)
        self._sort_formats(formats)

        return {
            'id': mobj.group('id'),
            'formats': formats,
            'display_id': display_id,
            'title': video_node.find('./title').text,
            'duration': parse_duration(xpath_text(video_node, './duration')),
            'upload_date': upload_date,
            'thumbnail': thumbnail,
        }
261
262
class SportschauIE(ARDMediathekIE):
    """Extractor for sportschau.de video pages.

    Reuses ARDMediathekIE._extract_media_info; the media JSON URL is the
    page URL without '.html' plus '-mc_defaultQuality-h.json'.
    """
    IE_NAME = 'Sportschau'
    _VALID_URL = r'(?P<baseurl>https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P<id>[^/#?]+))\.html'
    _TESTS = [{
        'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html',
        'info_dict': {
            'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100',
            'ext': 'mp4',
            'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Extract the info dict for a sportschau.de video page URL."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        base_url = mobj.group('baseurl')

        webpage = self._download_webpage(url, video_id)
        # Title is the content of the element with class="headline".
        title = get_element_by_attribute('class', 'headline', webpage)
        description = self._html_search_meta('description', webpage, 'description')

        info = self._extract_media_info(
            base_url + '-mc_defaultQuality-h.json', webpage, video_id)

        info.update({
            'title': title,
            'description': description,
        })

        return info