]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/ard.py
[youtube] Modernize
[yt-dlp.git] / youtube_dl / extractor / ard.py
CommitLineData
f9b85496
PH
1# coding: utf-8
2from __future__ import unicode_literals
3
d5822b96
PH
4import re
5
6from .common import InfoExtractor
7from ..utils import (
f9b85496 8 determine_ext,
d5822b96 9 ExtractorError,
29546b34 10 qualities,
5622f29a
S
11 compat_urllib_parse_urlparse,
12 compat_urllib_parse,
6d3d3fc0
PH
13 int_or_none,
14 parse_duration,
15 unified_strdate,
bf0ff932 16 xpath_text,
d5822b96
PH
17)
18
f9b85496 19
6d3d3fc0
PH
class ARDMediathekIE(InfoExtractor):
    """Information extractor for ARD Mediathek video pages.

    Handles URLs on ``ardmediathek.de`` and ``mediathek.daserste.de`` (see
    ``_VALID_URL``). Formats are taken either from
    ``mediaCollection.addMediaStream(...)`` calls embedded in the page or,
    when the page has none, from the ``/play/media/<id>`` JSON document.
    """

    IE_NAME = 'ARD:mediathek'
    _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'

    _TESTS = [{
        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
        'file': '22429276.mp4',
        'md5': '469751912f1de0816a9fc9df8336476c',
        'info_dict': {
            'title': 'Vertrauen ist gut, Spionieren ist besser - Geht so deutsch-amerikanische Freundschaft?',
            'description': 'Das Erste Mediathek [ARD]: Vertrauen ist gut, Spionieren ist besser - Geht so deutsch-amerikanische Freundschaft?, Anne Will, Über die Spionage-Affäre diskutieren Clemens Binninger, Katrin Göring-Eckardt, Georg Mascolo, Andrew B. Denison und Constanze Kurz.. Das Video zur Sendung Anne Will am Mittwoch, 16.07.2014',
        },
        'skip': 'Blocked outside of Germany',
    }, {
        'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916',
        'info_dict': {
            'id': '22490580',
            'ext': 'mp4',
            'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)',
            'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.',
        },
        'skip': 'Blocked outside of Germany',
    }]

    def _real_extract(self, url):
        """Extract metadata and formats for a single Mediathek page.

        Returns a dict with the keys ``id``, ``title``, ``description``,
        ``formats`` and ``thumbnail``.

        Raises ExtractorError (with ``expected=True``) when the page states
        that the video is no longer available, or when no streams are
        offered and the page mentions "fsk".
        """
        # Determine the video id from the URL: an explicit ``documentId``
        # query parameter wins over the path component captured by
        # _VALID_URL.
        m = re.match(self._VALID_URL, url)

        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        webpage = self._download_webpage(url, video_id)

        if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage:
            raise ExtractorError('Video %s is no longer available' % video_id, expected=True)

        title = self._html_search_regex(
            [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
             r'<meta name="dcterms.title" content="(.*?)"/>',
             r'<h4 class="headline">(.*?)</h4>'],
            webpage, 'title')
        description = self._html_search_meta(
            'dcterms.abstract', webpage, 'description', default=None)
        if description is None:
            description = self._html_search_meta(
                'description', webpage, 'meta description')

        # Thumbnail is sometimes not present.
        # It is in the mobile version, but that seems to use a different URL
        # structure altogether.
        thumbnail = self._og_search_thumbnail(webpage, default=None)

        media_streams = re.findall(r'''(?x)
            mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
            "([^"]+)"''', webpage)

        if media_streams:
            QUALITIES = qualities(['lo', 'hi', 'hq'])
            formats = []
            # set(): the same stream URL may be registered more than once.
            for furl in set(media_streams):
                if furl.endswith('.f4m'):
                    fid = 'f4m'
                else:
                    # Format id is the second-to-last dot-separated part of
                    # the URL (e.g. ``...<fid>.<ext>``), if there is one.
                    fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
                    fid = fid_m.group(1) if fid_m else None
                formats.append({
                    'quality': QUALITIES(fid),
                    'format_id': fid,
                    'url': furl,
                })
        else:  # request JSON file
            media_info = self._download_json(
                'http://www.ardmediathek.de/play/media/%s' % video_id, video_id)
            # The second element of the _mediaArray contains the standard http urls
            streams = media_info['_mediaArray'][1]['_mediaStreamArray']
            if not streams:
                if '"fsk"' in webpage:
                    # A known site-side restriction rather than an extractor
                    # bug, so flag it as expected like the check above.
                    raise ExtractorError(
                        'This video is only available after 20:00',
                        expected=True)

            formats = []
            for stream in streams:
                if isinstance(stream['_stream'], list):
                    # Several URLs share one quality level: walk them in
                    # reverse order and bump the quality by the position.
                    for index, stream_url in enumerate(stream['_stream'][::-1]):
                        quality = stream['_quality'] + index
                        formats.append({
                            'quality': quality,
                            'url': stream_url,
                            'format_id': '%s-%s' % (determine_ext(stream_url), quality)
                        })
                    continue

                fmt = {
                    'quality': stream['_quality'],
                    'url': stream['_stream'],
                }

                fmt['format_id'] = '%s-%s' % (
                    determine_ext(fmt['url']), fmt['quality'])

                formats.append(fmt)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'thumbnail': thumbnail,
        }
6d3d3fc0
PH
133
134
class ARDIE(InfoExtractor):
    """Information extractor for video pages on ``daserste.de``.

    The page URL (minus ``.html``) with ``~playerXml.xml`` appended yields an
    XML document whose ``<video>`` node carries the metadata and the list of
    ``<asset>`` elements, one per available format.
    """

    # Raw strings: the patterns contain ``\.``, which is not a valid escape
    # sequence in an ordinary string literal.
    _VALID_URL = r'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
    _TEST = {
        'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
        'md5': 'd216c3a86493f9322545e045ddc3eb35',
        'info_dict': {
            'display_id': 'die-story-im-ersten-mission-unter-falscher-flagge',
            'id': '100',
            'ext': 'mp4',
            'duration': 2600,
            'title': 'Die Story im Ersten: Mission unter falscher Flagge',
            'upload_date': '20140804',
            'thumbnail': r're:^https?://.*\.jpg$',
        }
    }

    def _real_extract(self, url):
        """Extract metadata and formats from the page's player XML.

        Returns a dict with the keys ``id``, ``formats``, ``display_id``,
        ``title``, ``duration``, ``upload_date`` and ``thumbnail``.
        """
        mobj = re.match(self._VALID_URL, url)
        display_id = mobj.group('display_id')

        player_url = mobj.group('mainurl') + '~playerXml.xml'
        doc = self._download_xml(player_url, display_id)
        video_node = doc.find('./video')
        upload_date = unified_strdate(xpath_text(
            video_node, './broadcastDate'))
        thumbnail = xpath_text(video_node, './/teaserImage//variant/url')

        formats = []
        for asset in video_node.findall('.//asset'):
            # Optional metadata is read through xpath_text, as for
            # broadcastDate above, instead of ``find(...).text`` so an absent
            # element does not raise AttributeError.
            # NOTE(review): assumes xpath_text returns None for a missing
            # element and that int_or_none passes None through - confirm
            # against ..utils.
            fmt = {
                'format_id': asset.attrib['type'],
                'width': int_or_none(xpath_text(asset, './frameWidth')),
                'height': int_or_none(xpath_text(asset, './frameHeight')),
                'vbr': int_or_none(xpath_text(asset, './bitrateVideo')),
                'abr': int_or_none(xpath_text(asset, './bitrateAudio')),
                'vcodec': xpath_text(asset, './codecVideo'),
                'tbr': int_or_none(xpath_text(asset, './totalBitrate')),
            }
            file_name = asset.find('./fileName').text
            server_prefix = xpath_text(asset, './serverPrefix')
            if server_prefix:
                # With a server prefix, the prefix is the URL and the file
                # name is passed separately as the play path.
                fmt['url'] = server_prefix
                fmt['playpath'] = file_name
            else:
                fmt['url'] = file_name
            formats.append(fmt)
        self._sort_formats(formats)

        return {
            'id': mobj.group('id'),
            'formats': formats,
            'display_id': display_id,
            'title': video_node.find('./title').text,
            'duration': parse_duration(video_node.find('./duration').text),
            'upload_date': upload_date,
            'thumbnail': thumbnail,
        }
190