]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/ard.py
Merge pull request #3911 from Dockheas23/master
[yt-dlp.git] / youtube_dl / extractor / ard.py
CommitLineData
f9b85496
PH
1# coding: utf-8
2from __future__ import unicode_literals
3
d5822b96
PH
4import re
5
6from .common import InfoExtractor
7from ..utils import (
f9b85496 8 determine_ext,
d5822b96 9 ExtractorError,
29546b34 10 qualities,
6d3d3fc0
PH
11 int_or_none,
12 parse_duration,
13 unified_strdate,
bf0ff932 14 xpath_text,
d5822b96
PH
15)
16
f9b85496 17
6d3d3fc0
PH
class ARDMediathekIE(InfoExtractor):
    """Information extractor for ardmediathek.de / mediathek.daserste.de.

    Formats are taken from ``mediaCollection.addMediaStream(...)`` calls
    embedded in the web page when present; otherwise they are read from the
    JSON document served under ``/play/media/<video_id>``.
    """

    IE_NAME = 'ARD:mediathek'
    _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'

    _TESTS = [{
        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
        'file': '22429276.mp4',
        'md5': '469751912f1de0816a9fc9df8336476c',
        'info_dict': {
            'title': 'Vertrauen ist gut, Spionieren ist besser - Geht so deutsch-amerikanische Freundschaft?',
            'description': 'Das Erste Mediathek [ARD]: Vertrauen ist gut, Spionieren ist besser - Geht so deutsch-amerikanische Freundschaft?, Anne Will, Über die Spionage-Affäre diskutieren Clemens Binninger, Katrin Göring-Eckardt, Georg Mascolo, Andrew B. Denison und Constanze Kurz.. Das Video zur Sendung Anne Will am Mittwoch, 16.07.2014',
        },
        'skip': 'Blocked outside of Germany',
    }, {
        'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916',
        'info_dict': {
            'id': '22490580',
            'ext': 'mp4',
            'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)',
            'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.',
        },
        'skip': 'Blocked outside of Germany',
    }]

    def _real_extract(self, url):
        """Extract id, title, description, thumbnail and formats for *url*.

        Raises ExtractorError (flagged as expected) when the page says the
        video is no longer available, or when the stream list is empty and
        the page contains an "fsk" marker.
        """
        # Determine the video id from the URL: an explicit documentId query
        # parameter wins over the id captured by _VALID_URL.
        m = re.match(self._VALID_URL, url)

        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        webpage = self._download_webpage(url, video_id)

        if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage:
            raise ExtractorError('Video %s is no longer available' % video_id, expected=True)

        title = self._html_search_regex(
            [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
             r'<meta name="dcterms.title" content="(.*?)"/>',
             r'<h4 class="headline">(.*?)</h4>'],
            webpage, 'title')
        description = self._html_search_meta(
            'dcterms.abstract', webpage, 'description', default=None)
        if description is None:
            description = self._html_search_meta(
                'description', webpage, 'meta description')

        # Thumbnail is sometimes not present.
        # It is in the mobile version, but that seems to use a different URL
        # structure altogether.
        thumbnail = self._og_search_thumbnail(webpage, default=None)

        media_streams = re.findall(r'''(?x)
            mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
            "([^"]+)"''', webpage)

        if media_streams:
            QUALITIES = qualities(['lo', 'hi', 'hq'])
            formats = []
            # set() drops stream URLs that are listed more than once.
            for furl in set(media_streams):
                if furl.endswith('.f4m'):
                    fid = 'f4m'
                else:
                    # The format id is the second-to-last dot-separated
                    # component of the URL (e.g. "...<fid>.mp4").
                    fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
                    fid = fid_m.group(1) if fid_m else None
                formats.append({
                    'quality': QUALITIES(fid),
                    'format_id': fid,
                    'url': furl,
                })
        else:  # request JSON file
            media_info = self._download_json(
                'http://www.ardmediathek.de/play/media/%s' % video_id, video_id)
            # The second element of the _mediaArray contains the standard http urls
            streams = media_info['_mediaArray'][1]['_mediaStreamArray']
            if not streams:
                if '"fsk"' in webpage:
                    # A restriction imposed by the site, not an extractor
                    # failure, so report it as an expected error.
                    raise ExtractorError(
                        'This video is only available after 20:00',
                        expected=True)

            formats = []
            for stream in streams:
                stream_urls = stream['_stream']
                if isinstance(stream_urls, list):
                    # Several URLs share one quality level; walk them in
                    # reverse order and bump the quality for each one.
                    for index, stream_url in enumerate(stream_urls[::-1]):
                        quality = stream['_quality'] + index
                        formats.append({
                            'quality': quality,
                            'url': stream_url,
                            'format_id': '%s-%s' % (
                                determine_ext(stream_url), quality),
                        })
                    continue

                quality = stream['_quality']
                formats.append({
                    'quality': quality,
                    'url': stream_urls,
                    'format_id': '%s-%s' % (
                        determine_ext(stream_urls), quality),
                })

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'thumbnail': thumbnail,
        }
6d3d3fc0
PH
131
132
class ARDIE(InfoExtractor):
    """Information extractor for video pages on daserste.de.

    Metadata and formats are read from the ``~playerXml.xml`` document that
    sits next to the video page URL.
    """

    _VALID_URL = r'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
    _TEST = {
        'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
        'md5': 'd216c3a86493f9322545e045ddc3eb35',
        'info_dict': {
            'display_id': 'die-story-im-ersten-mission-unter-falscher-flagge',
            'id': '100',
            'ext': 'mp4',
            'duration': 2600,
            'title': 'Die Story im Ersten: Mission unter falscher Flagge',
            'upload_date': '20140804',
            'thumbnail': r're:^https?://.*\.jpg$',
        }
    }

    def _real_extract(self, url):
        """Extract id, display id, title, duration, date and formats for *url*.

        Raises ExtractorError when the player XML has no ``video`` element.
        """
        mobj = re.match(self._VALID_URL, url)
        display_id = mobj.group('display_id')

        player_url = mobj.group('mainurl') + '~playerXml.xml'
        doc = self._download_xml(player_url, display_id)
        video_node = doc.find('./video')
        if video_node is None:
            raise ExtractorError('Unable to find video node in player XML')

        upload_date = unified_strdate(xpath_text(
            video_node, './broadcastDate'))
        thumbnail = xpath_text(video_node, './/teaserImage//variant/url')

        formats = []
        for asset in video_node.findall('.//asset'):
            file_name = xpath_text(asset, './fileName')
            if not file_name:
                # Without a file name there is nothing to download.
                continue

            # xpath_text yields None for a missing child element instead of
            # raising, so incomplete assets still produce a usable format.
            f = {
                'format_id': asset.attrib['type'],
                'width': int_or_none(xpath_text(asset, './frameWidth')),
                'height': int_or_none(xpath_text(asset, './frameHeight')),
                'vbr': int_or_none(xpath_text(asset, './bitrateVideo')),
                'abr': int_or_none(xpath_text(asset, './bitrateAudio')),
                'vcodec': xpath_text(asset, './codecVideo'),
                'tbr': int_or_none(xpath_text(asset, './totalBitrate')),
            }

            server_prefix = xpath_text(asset, './serverPrefix')
            if server_prefix:
                # The server prefix is the URL and the file name becomes the
                # play path (presumably an RTMP stream -- TODO confirm).
                f['url'] = server_prefix
                f['playpath'] = file_name
            else:
                f['url'] = file_name
            formats.append(f)
        self._sort_formats(formats)

        return {
            'id': mobj.group('id'),
            'formats': formats,
            'display_id': display_id,
            'title': video_node.find('./title').text,
            'duration': parse_duration(xpath_text(video_node, './duration')),
            'upload_date': upload_date,
            'thumbnail': thumbnail,
        }
188