]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/ard.py
[ard] Add rss support
[yt-dlp.git] / youtube_dl / extractor / ard.py
CommitLineData
f9b85496
PH
1# coding: utf-8
2from __future__ import unicode_literals
3
d5822b96
PH
4import re
5
6from .common import InfoExtractor
3741302a 7from .generic import GenericIE
d5822b96 8from ..utils import (
f9b85496 9 determine_ext,
d5822b96 10 ExtractorError,
29546b34 11 qualities,
6d3d3fc0
PH
12 int_or_none,
13 parse_duration,
14 unified_strdate,
bf0ff932 15 xpath_text,
3741302a 16 parse_xml,
d5822b96
PH
17)
18
f9b85496 19
6d3d3fc0
PH
class ARDMediathekIE(InfoExtractor):
    """Extractor for ardmediathek.de / mediathek.daserste.de pages.

    Format URLs are taken either from ``mediaCollection.addMediaStream(...)``
    calls embedded in the page or, when none are present, from the JSON
    document served under ``/play/media/<video_id>``.  URLs carrying
    ``rss=true`` whose body parses to an ``<rss>`` root are handed over to
    the generic RSS extraction.
    """

    IE_NAME = 'ARD:mediathek'
    _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'

    _TESTS = [{
        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
        'file': '22429276.mp4',
        'md5': '469751912f1de0816a9fc9df8336476c',
        'info_dict': {
            'title': 'Vertrauen ist gut, Spionieren ist besser - Geht so deutsch-amerikanische Freundschaft?',
            'description': 'Das Erste Mediathek [ARD]: Vertrauen ist gut, Spionieren ist besser - Geht so deutsch-amerikanische Freundschaft?, Anne Will, Über die Spionage-Affäre diskutieren Clemens Binninger, Katrin Göring-Eckardt, Georg Mascolo, Andrew B. Denison und Constanze Kurz.. Das Video zur Sendung Anne Will am Mittwoch, 16.07.2014',
        },
        'skip': 'Blocked outside of Germany',
    }, {
        'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916',
        'info_dict': {
            'id': '22490580',
            'ext': 'mp4',
            'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)',
            'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.',
        },
        'skip': 'Blocked outside of Germany',
    }]

    def _real_extract(self, url):
        """Return the info dict for ``url`` (or the generic RSS result).

        Raises ExtractorError (expected) when the page states that the
        video is no longer available, or when the JSON lists no streams
        and the page contains an ``"fsk"`` marker.
        """
        # Determine the video id from the URL: an explicit documentId query
        # parameter wins over the path component matched by _VALID_URL.
        m = re.match(self._VALID_URL, url)

        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        webpage = self._download_webpage(url, video_id)

        if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage:
            raise ExtractorError('Video %s is no longer available' % video_id, expected=True)

        if re.search(r'rss=true', url):
            doc = parse_xml(webpage)
            if doc.tag == 'rss':
                return GenericIE()._extract_rss(url, video_id, doc)

        title = self._html_search_regex(
            [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
             r'<meta name="dcterms.title" content="(.*?)"/>',
             r'<h4 class="headline">(.*?)</h4>'],
            webpage, 'title')
        description = self._html_search_meta(
            'dcterms.abstract', webpage, 'description', default=None)
        if description is None:
            description = self._html_search_meta(
                'description', webpage, 'meta description')

        # Thumbnail is sometimes not present.
        # It is in the mobile version, but that seems to use a different URL
        # structure altogether.
        thumbnail = self._og_search_thumbnail(webpage, default=None)

        media_streams = re.findall(r'''(?x)
            mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
            "([^"]+)"''', webpage)

        formats = []
        if media_streams:
            QUALITIES = qualities(['lo', 'hi', 'hq'])
            # set() drops stream URLs that the page lists more than once.
            for furl in set(media_streams):
                if furl.endswith('.f4m'):
                    fid = 'f4m'
                else:
                    # The format id is the second-to-last dot-separated
                    # component of the URL, if there is one.
                    fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
                    fid = fid_m.group(1) if fid_m else None
                formats.append({
                    'quality': QUALITIES(fid),
                    'format_id': fid,
                    'url': furl,
                })
        else:  # request JSON file
            media_info = self._download_json(
                'http://www.ardmediathek.de/play/media/%s' % video_id, video_id)
            # The second element of the _mediaArray contains the standard http urls
            streams = media_info['_mediaArray'][1]['_mediaStreamArray']
            if not streams and '"fsk"' in webpage:
                # A site-side restriction, not an extractor failure: flag it
                # as expected, like the "no longer available" error above.
                raise ExtractorError(
                    'This video is only available after 20:00', expected=True)

            for s in streams:
                stream = s['_stream']
                if isinstance(stream, list):
                    # Several URLs for one entry: walk them in reverse order
                    # and bump the quality by the position in that order.
                    for index, stream_url in enumerate(stream[::-1]):
                        quality = s['_quality'] + index
                        formats.append({
                            'quality': quality,
                            'url': stream_url,
                            'format_id': '%s-%s' % (determine_ext(stream_url), quality),
                        })
                    continue

                formats.append({
                    'quality': s['_quality'],
                    'url': stream,
                    'format_id': '%s-%s' % (determine_ext(stream), s['_quality']),
                })

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'thumbnail': thumbnail,
        }
6d3d3fc0
PH
138
139
class ARDIE(InfoExtractor):
    """Extractor for daserste.de ``.../videos/<display_id>-<id>.html`` pages.

    All metadata and formats come from the ``~playerXml.xml`` document that
    sits next to the page URL.
    """

    # Raw strings: these are regular expressions, and '\.' is not a valid
    # escape sequence in an ordinary string literal.
    _VALID_URL = r'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
    _TEST = {
        'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
        'md5': 'd216c3a86493f9322545e045ddc3eb35',
        'info_dict': {
            'display_id': 'die-story-im-ersten-mission-unter-falscher-flagge',
            'id': '100',
            'ext': 'mp4',
            'duration': 2600,
            'title': 'Die Story im Ersten: Mission unter falscher Flagge',
            'upload_date': '20140804',
            'thumbnail': r're:^https?://.*\.jpg$',
        }
    }

    def _real_extract(self, url):
        """Return the info dict built from the player XML for ``url``.

        Raises ExtractorError when the XML has no ``<video>`` element or the
        video has no title.  Other missing elements yield ``None`` for the
        corresponding field; assets without any URL are skipped.
        """
        mobj = re.match(self._VALID_URL, url)
        display_id = mobj.group('display_id')

        player_url = mobj.group('mainurl') + '~playerXml.xml'
        doc = self._download_xml(player_url, display_id)
        video_node = doc.find('./video')
        if video_node is None:
            raise ExtractorError('Unable to find video element in player XML')

        title = xpath_text(video_node, './title')
        if not title:
            raise ExtractorError('Unable to extract title')

        upload_date = unified_strdate(xpath_text(
            video_node, './broadcastDate'))
        thumbnail = xpath_text(video_node, './/teaserImage//variant/url')

        formats = []
        for asset in video_node.findall('.//asset'):
            # xpath_text() returns None for a missing element instead of
            # failing on ``None.text``; int_or_none() passes None through.
            fmt = {
                'format_id': asset.attrib['type'],
                'width': int_or_none(xpath_text(asset, './frameWidth')),
                'height': int_or_none(xpath_text(asset, './frameHeight')),
                'vbr': int_or_none(xpath_text(asset, './bitrateVideo')),
                'abr': int_or_none(xpath_text(asset, './bitrateAudio')),
                'vcodec': xpath_text(asset, './codecVideo'),
                'tbr': int_or_none(xpath_text(asset, './totalBitrate')),
            }
            server_prefix = xpath_text(asset, './serverPrefix')
            file_name = xpath_text(asset, './fileName')
            if server_prefix:
                # With a server prefix, the prefix is the URL and the file
                # name becomes the play path.
                fmt['url'] = server_prefix
                fmt['playpath'] = file_name
            elif file_name:
                fmt['url'] = file_name
            else:
                # Nothing to download for this asset.
                continue
            formats.append(fmt)
        self._sort_formats(formats)

        return {
            'id': mobj.group('id'),
            'formats': formats,
            'display_id': display_id,
            'title': title,
            'duration': parse_duration(xpath_text(video_node, './duration')),
            'upload_date': upload_date,
            'thumbnail': thumbnail,
        }
195