]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/ard.py
[ard] Extract all formats
[yt-dlp.git] / youtube_dl / extractor / ard.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from .generic import GenericIE
8 from ..utils import (
9 determine_ext,
10 ExtractorError,
11 qualities,
12 int_or_none,
13 parse_duration,
14 unified_strdate,
15 xpath_text,
16 parse_xml,
17 )
18
19
class ARDMediathekIE(InfoExtractor):
    """Extractor for ardmediathek.de and mediathek.daserste.de video pages."""

    IE_NAME = 'ARD:mediathek'
    _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'

    _TESTS = [{
        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
        'only_matching': True,
    }, {
        'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916',
        'info_dict': {
            'id': '22490580',
            'ext': 'mp4',
            'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)',
            'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.',
        },
        'skip': 'Blocked outside of Germany',
    }]

    def _extract_media_info(self, media_info_url, webpage, video_id):
        """Download the media JSON and build a partial info dict from it.

        Returns a dict with 'id', 'duration', 'thumbnail', 'formats' and
        'subtitles'. Raises ExtractorError (expected) when no formats were
        found and either the page contains an "fsk" marker or the JSON
        flags the media as geo-blocked.
        """
        media_info = self._download_json(
            media_info_url, video_id, 'Downloading media JSON')

        formats = self._extract_formats(media_info, video_id)

        if not formats:
            # Give the user a specific reason where one can be detected;
            # otherwise fall through and let _sort_formats deal with the
            # empty list.
            if '"fsk"' in webpage:
                raise ExtractorError(
                    'This video is only available after 20:00', expected=True)
            elif media_info.get('_geoblocked'):
                raise ExtractorError('This video is not available due to geo restriction', expected=True)

        self._sort_formats(formats)

        duration = int_or_none(media_info.get('_duration'))
        thumbnail = media_info.get('_previewImage')

        subtitles = {}
        subtitle_url = media_info.get('_subtitleUrl')
        if subtitle_url:
            subtitles['de'] = [{
                'ext': 'srt',
                'url': subtitle_url,
            }]

        return {
            'id': video_id,
            'duration': duration,
            'thumbnail': thumbnail,
            'formats': formats,
            'subtitles': subtitles,
        }

    def _extract_formats(self, media_info, video_id):
        """Collect format dicts from the '_mediaArray' of the media JSON.

        f4m and m3u8 stream URLs are expanded through the manifest helpers;
        every other stream becomes either an RTMP format (when the stream
        has an rtmp '_server') or a plain HTTP format. Streams that are
        neither are skipped.
        """
        type_ = media_info.get('_type')
        media_array = media_info.get('_mediaArray', [])
        formats = []
        for num, media in enumerate(media_array):
            for stream in media.get('_mediaStreamArray', []):
                stream_urls = stream.get('_stream')
                if not stream_urls:
                    continue
                # '_stream' may be a single URL or a list of URLs.
                if not isinstance(stream_urls, list):
                    stream_urls = [stream_urls]
                quality = stream.get('_quality')
                server = stream.get('_server')
                for stream_url in stream_urls:
                    ext = determine_ext(stream_url)
                    if ext == 'f4m':
                        formats.extend(self._extract_f4m_formats(
                            stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
                            video_id, preference=-1, f4m_id='hds'))
                    elif ext == 'm3u8':
                        formats.extend(self._extract_m3u8_formats(
                            stream_url, video_id, 'mp4', preference=1, m3u8_id='hls'))
                    else:
                        if server and server.startswith('rtmp'):
                            f = {
                                'url': server,
                                'play_path': stream_url,
                                'format_id': 'a%s-rtmp-%s' % (num, quality),
                            }
                        elif stream_url.startswith('http'):
                            f = {
                                'url': stream_url,
                                'format_id': 'a%s-%s-%s' % (num, ext, quality)
                            }
                        else:
                            continue
                        # Some file names end in "_<width>x<height>.mp4".
                        m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', stream_url)
                        if m:
                            f.update({
                                'width': int(m.group('width')),
                                'height': int(m.group('height')),
                            })
                        if type_ == 'audio':
                            f['vcodec'] = 'none'
                        formats.append(f)
        return formats

    def _real_extract(self, url):
        """Extract a video (or an RSS playlist) from a Mediathek URL.

        Raises ExtractorError (expected) when the page states that the
        video is no longer available or is restricted to night hours.
        """
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # An explicit documentId query parameter wins over the path-based id.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        webpage = self._download_webpage(url, video_id)

        if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage:
            raise ExtractorError('Video %s is no longer available' % video_id, expected=True)

        if 'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.' in webpage:
            raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 8 pm and 6 am.' % video_id, expected=True)

        # URLs carrying an "rss" query parameter may return an RSS feed
        # instead of an HTML page; hand those to the generic RSS handling.
        if re.search(r'[\?&]rss($|[=&])', url):
            doc = parse_xml(webpage)
            if doc.tag == 'rss':
                return GenericIE()._extract_rss(url, video_id, doc)

        title = self._html_search_regex(
            [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
             r'<meta name="dcterms.title" content="(.*?)"/>',
             r'<h4 class="headline">(.*?)</h4>'],
            webpage, 'title')
        description = self._html_search_meta(
            'dcterms.abstract', webpage, 'description', default=None)
        if description is None:
            description = self._html_search_meta(
                'description', webpage, 'meta description')

        # Thumbnail is sometimes not present.
        # It is in the mobile version, but that seems to use a different URL
        # structure altogether.
        thumbnail = self._og_search_thumbnail(webpage, default=None)

        media_streams = re.findall(r'''(?x)
            mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
            "([^"]+)"''', webpage)

        if media_streams:
            QUALITIES = qualities(['lo', 'hi', 'hq'])
            formats = []
            for furl in set(media_streams):
                if furl.endswith('.f4m'):
                    fid = 'f4m'
                else:
                    # The quality tag is the second-to-last dotted
                    # component of the URL, when there is one.
                    fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
                    fid = fid_m.group(1) if fid_m else None
                formats.append({
                    'quality': QUALITIES(fid),
                    'format_id': fid,
                    'url': furl,
                })
            self._sort_formats(formats)
            info = {
                'formats': formats,
            }
        else:  # request JSON file
            info = self._extract_media_info(
                'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id)

        info.update({
            'id': video_id,
            'title': title,
            'description': description,
            # Prefer the page's og:image, but do not throw away the preview
            # image from the media JSON when the page provides none.
            'thumbnail': thumbnail or info.get('thumbnail'),
        })

        return info
192
193
class ARDIE(InfoExtractor):
    """Extractor for daserste.de video pages (".../videos/<name>-<id>.html")."""

    # Raw strings: the patterns contain "\." which is not a valid escape
    # sequence in an ordinary string literal.
    _VALID_URL = r'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
    _TEST = {
        'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
        'md5': 'd216c3a86493f9322545e045ddc3eb35',
        'info_dict': {
            'display_id': 'die-story-im-ersten-mission-unter-falscher-flagge',
            'id': '100',
            'ext': 'mp4',
            'duration': 2600,
            'title': 'Die Story im Ersten: Mission unter falscher Flagge',
            'upload_date': '20140804',
            'thumbnail': r're:^https?://.*\.jpg$',
        }
    }

    def _real_extract(self, url):
        """Build the info dict from the page's "~playerXml.xml" document.

        Each <asset> under the <video> node becomes one format. Assets
        with a serverPrefix are emitted as url + play_path pairs, all
        others use the file name as the URL directly.
        """
        mobj = re.match(self._VALID_URL, url)
        display_id = mobj.group('display_id')

        player_url = mobj.group('mainurl') + '~playerXml.xml'
        doc = self._download_xml(player_url, display_id)
        video_node = doc.find('./video')
        upload_date = unified_strdate(xpath_text(
            video_node, './broadcastDate'))
        thumbnail = xpath_text(video_node, './/teaserImage//variant/url')

        formats = []
        for a in video_node.findall('.//asset'):
            file_name = xpath_text(a, './fileName')
            if not file_name:
                # Nothing to download for this asset.
                continue
            # xpath_text yields None for absent/empty elements, so optional
            # metadata no longer aborts the extraction with AttributeError.
            f = {
                'format_id': a.attrib['type'],
                'width': int_or_none(xpath_text(a, './frameWidth')),
                'height': int_or_none(xpath_text(a, './frameHeight')),
                'vbr': int_or_none(xpath_text(a, './bitrateVideo')),
                'abr': int_or_none(xpath_text(a, './bitrateAudio')),
                'vcodec': xpath_text(a, './codecVideo'),
                'tbr': int_or_none(xpath_text(a, './totalBitrate')),
            }
            server_prefix = xpath_text(a, './serverPrefix')
            if server_prefix:
                f['url'] = server_prefix
                # Same key as the RTMP formats built by ARDMediathekIE.
                f['play_path'] = file_name
            else:
                f['url'] = file_name
            formats.append(f)
        self._sort_formats(formats)

        return {
            'id': mobj.group('id'),
            'formats': formats,
            'display_id': display_id,
            'title': video_node.find('./title').text,
            'duration': parse_duration(xpath_text(video_node, './duration')),
            'upload_date': upload_date,
            'thumbnail': thumbnail,
        }