]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/wdr.py
[WDR] use single quotes for strings
[yt-dlp.git] / youtube_dl / extractor / wdr.py
CommitLineData
7807ee66 1# -*- coding: utf-8 -*-
b461641f
S
2from __future__ import unicode_literals
3
4import re
5
6from .common import InfoExtractor
1cc79574 7from ..compat import (
cd7481a3 8 compat_parse_qs,
becafcbf 9 compat_urlparse,
1cc79574
PH
10)
11from ..utils import (
cd7481a3 12 unified_strdate,
c0837a12 13 ExtractorError,
b461641f
S
14)
15
16
class WDRIE(InfoExtractor):
    """Extractor for WDR Mediathek pages and the current "Sendung mit der Maus" page.

    A URL is accepted when it matches either a Mediathek page
    (``<page_url>/mediathek/<media_type>/<type>/<display_id>.html``) or
    the fixed "aktuelle Sendung" page on wdrmaus.de.
    """

    # Dots in host names and file extensions are escaped so that they only
    # match a literal '.' rather than any character.
    _CURRENT_MAUS_URL = r'https?://www\.wdrmaus\.de/aktuelle-sendung/(wdr|index)\.php5'
    _PAGE_REGEX = r'/mediathek/(?P<media_type>[^/]+)/(?P<type>[^/]+)/(?P<display_id>.+)\.html'
    _VALID_URL = r'(?P<page_url>https?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL

    # URL of the JavaScript file embedded in the page that wraps the media metadata.
    _JS_URL_REGEX = r'(https?://deviceids-medp\.wdr\.de/ondemand/\d+/\d+\.js)'

    _TESTS = [
        {
            'url': 'http://www1.wdr.de/mediathek/video/sendungen/doku-am-freitag/video-geheimnis-aachener-dom-100.html',
            'md5': 'e58c39c3e30077141d258bf588700a7b',
            'info_dict': {
                'id': 'mdb-1058683',
                'ext': 'flv',
                'display_id': 'doku-am-freitag/video-geheimnis-aachener-dom-100',
                'title': 'Geheimnis Aachener Dom',
                'alt_title': 'Doku am Freitag',
                'upload_date': '20160304',
                'description': 'md5:87be8ff14d8dfd7a7ee46f0299b52318',
                'is_live': False,
                'subtitles': {'de': [{
                    'url': 'http://ondemand-ww.wdr.de/medp/fsk0/105/1058683/1058683_12220974.xml'
                }]},
            },
            'skip': 'Page Not Found',
        },
        {
            'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html',
            'md5': 'f4c1f96d01cf285240f53ea4309663d8',
            'info_dict': {
                'id': 'mdb-1072000',
                'ext': 'mp3',
                'display_id': 'wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100',
                'title': 'Schriftstellerin Juli Zeh',
                'alt_title': 'WDR 3 Gespräch am Samstag',
                'upload_date': '20160312',
                'description': 'md5:e127d320bc2b1f149be697ce044a3dd7',
                'is_live': False,
                'subtitles': {}
            },
            'skip': 'Page Not Found',
        },
        {
            'url': 'http://www1.wdr.de/mediathek/video/live/index.html',
            'info_dict': {
                'id': 'mdb-103364',
                'ext': 'flv',
                'display_id': 'index',
                'title': r're:^WDR Fernsehen im Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
                'alt_title': 'WDR Fernsehen Live',
                'upload_date': None,
                'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9',
                'is_live': True,
                'subtitles': {}
            }
        },
        {
            'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html',
            'playlist_mincount': 10,
            'info_dict': {
                'id': 'aktuelle-stunde/aktuelle-stunde-120',
            },
        },
        {
            'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',
            'info_dict': {
                'id': 'mdb-1096487',
                'ext': 'flv',
                'upload_date': 're:^[0-9]{8}$',
                'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$',
                'description': '- Die Sendung mit der Maus -',
            },
            'skip': 'The id changes from week to week because of the new episode'
        },
    ]

    def _real_extract(self, url):
        """Extract a single media item or a playlist from ``url``.

        Returns an info dict when the page references a metadata script
        matching ``_JS_URL_REGEX``; otherwise returns a playlist of the
        Mediathek pages linked from it.

        Raises ExtractorError (expected) when the page has neither a
        metadata script nor any linked Mediathek pages.
        """
        mobj = re.match(self._VALID_URL, url)
        # These groups belong to the Mediathek alternative of _VALID_URL only;
        # for the "current Maus" URL they did not participate and are None.
        url_type = mobj.group('type')
        page_url = mobj.group('page_url')
        display_id = mobj.group('display_id')
        webpage = self._download_webpage(url, display_id)

        js_url = self._search_regex(self._JS_URL_REGEX, webpage, 'js_url', default=None)

        if not js_url:
            entries = []
            # Relative links can only be resolved when the site root is known;
            # without this guard a None page_url would raise a TypeError below.
            if page_url:
                entries = [
                    # _PAGE_REGEX has groups of its own, so findall yields
                    # tuples; element 0 is the complete href.
                    self.url_result(page_url + href[0], 'WDR')
                    for href in re.findall(
                        r'<a href="(%s)"' % self._PAGE_REGEX,
                        webpage)
                ]

            if entries:  # Playlist page
                return self.playlist_result(entries, playlist_id=display_id)

            raise ExtractorError('No downloadable streams found', expected=True)

        # The script wraps a JSON object in a function call; take the
        # parenthesised object literal and parse it.
        js_data = self._download_webpage(js_url, 'metadata')
        json_data = self._search_regex(r'\(({.*})\)', js_data, 'json')
        metadata = self._parse_json(json_data, display_id)

        metadata_tracker_data = metadata['trackerData']
        metadata_media_resource = metadata['mediaResource']

        formats = []

        # check if the metadata contains a direct URL to a file
        metadata_media_alt = metadata_media_resource.get('alt')
        if metadata_media_alt:
            for tag_name in ['videoURL', 'audioURL']:
                if tag_name in metadata_media_alt:
                    formats.append({
                        'url': metadata_media_alt[tag_name]
                    })

        # check if there are flash-streams for this video
        if 'dflt' in metadata_media_resource and 'videoURL' in metadata_media_resource['dflt']:
            video_url = metadata_media_resource['dflt']['videoURL']
            if video_url.endswith('.f4m'):
                full_video_url = video_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18'
                formats.extend(self._extract_f4m_formats(full_video_url, display_id, f4m_id='hds', fatal=False))
            elif video_url.endswith('.smil'):
                formats.extend(self._extract_smil_formats(video_url, 'stream', fatal=False))

        subtitles = {}
        caption_url = metadata_media_resource.get('captionURL')
        if caption_url:
            subtitles['de'] = [{
                'url': caption_url
            }]

        title = metadata_tracker_data.get('trackerClipTitle')
        is_live = url_type == 'live'

        if is_live:
            title = self._live_title(title)
            upload_date = None
        elif 'trackerClipAirTime' in metadata_tracker_data:
            upload_date = metadata_tracker_data['trackerClipAirTime']
        else:
            # Fall back to the page's DC.Date meta tag.
            upload_date = self._html_search_meta('DC.Date', webpage, 'upload date')

        if upload_date:
            upload_date = unified_strdate(upload_date)

        self._sort_formats(formats)

        return {
            'id': metadata_tracker_data.get('trackerClipId', display_id),
            'display_id': display_id,
            'title': title,
            'alt_title': metadata_tracker_data.get('trackerClipSubcategory'),
            'formats': formats,
            'upload_date': upload_date,
            'description': self._html_search_meta('Description', webpage),
            'is_live': is_live,
            'subtitles': subtitles,
        }
176
177
e4cbb5f3
PH
class WDRMobileIE(InfoExtractor):
    """Extractor for direct media URLs on mobile-ondemand.wdr.de.

    No page is downloaded: id, title and age limit are all taken from
    the URL itself.
    """

    _VALID_URL = r'''(?x)
        https?://mobile-ondemand\.wdr\.de/
        .*?/fsk(?P<age_limit>[0-9]+)
        /[0-9]+/[0-9]+/
        (?P<id>[0-9]+)_(?P<title>[0-9]+)'''
    IE_NAME = 'wdr:mobile'
    _TEST = {
        'url': 'http://mobile-ondemand.wdr.de/CMS2010/mdb/ondemand/weltweit/fsk0/42/421735/421735_4283021.mp4',
        'info_dict': {
            'title': '4283021',
            'id': '421735',
            'ext': 'mp4',
            'age_limit': 0,
        },
        'skip': 'Problems with loading data.'
    }

    def _real_extract(self, url):
        """Build the info dict for ``url`` from the named groups of _VALID_URL."""
        url_parts = re.match(self._VALID_URL, url).groupdict()
        info = {
            'id': url_parts['id'],
            'title': url_parts['title'],
            'age_limit': int(url_parts['age_limit']),
            'url': url,
            # The request is sent with a fixed 'mobile' User-Agent header.
            'http_headers': {'User-Agent': 'mobile'},
        }
        return info
207
208
class WDRMausIE(InfoExtractor):
    """Extractor for individual video pages on wdrmaus.de ("Sendung mit der Maus")."""

    # Raw string: the pattern contains '\.', which is an invalid escape
    # sequence in a plain string literal. The pattern value is unchanged.
    _VALID_URL = r'https?://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P<id>[^/?#]+)((?<!index)\.php5|/(?:$|[?#]))'
    IE_DESC = 'Sendung mit der Maus'
    _TESTS = [{
        'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/achterbahn.php5',
        'md5': '178b432d002162a14ccb3e0876741095',
        'info_dict': {
            'id': 'achterbahn',
            'ext': 'mp4',
            'thumbnail': r're:^http://.+\.jpg',
            'upload_date': '20131001',
            'title': '19.09.2013 - Achterbahn',
        }
    }]

    def _real_extract(self, url):
        """Extract title, upload date, thumbnail and formats from a Maus page.

        The page's "startVideo" link carries a query string whose
        ``firstVideo`` and ``startPicture`` fields give the stream URL and
        the thumbnail. An additional HTTP format is added when the site's
        URL translation table maps the stream prefix to a download prefix.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)
        param_code = self._html_search_regex(
            r'<a href="\?startVideo=1&amp;([^"]+)"', webpage, 'parameters')

        title_date = self._search_regex(
            r'<div class="sendedatum"><p>Sendedatum:\s*([0-9\.]+)</p>',
            webpage, 'air date')
        title_str = self._html_search_regex(
            r'<h1>(.*?)</h1>', webpage, 'title')
        title = '%s - %s' % (title_date, title_str)
        upload_date = unified_strdate(
            self._html_search_meta('dc.date', webpage))

        fields = compat_parse_qs(param_code)
        video_url = fields['firstVideo'][0]
        # The picture path is resolved against the page URL.
        thumbnail = compat_urlparse.urljoin(url, fields['startPicture'][0])

        formats = [{
            'format_id': 'rtmp',
            'url': video_url,
        }]

        # Best effort: the translation table is optional (fatal=False).
        jscode = self._download_webpage(
            'http://www.wdrmaus.de/codebase/js/extended-medien.min.js',
            video_id, fatal=False,
            note='Downloading URL translation table',
            errnote='Could not download URL translation table')
        if jscode:
            for m in re.finditer(
                    r"stream:\s*'dslSrc=(?P<stream>[^']+)',\s*download:\s*'(?P<dl>[^']+)'\s*\}",
                    jscode):
                stream_prefix = m.group('stream')
                if video_url.startswith(stream_prefix):
                    # Swap only the leading prefix that was just checked;
                    # later occurrences of the same text stay untouched.
                    http_url = video_url.replace(
                        stream_prefix, m.group('dl'), 1)
                    formats.append({
                        'format_id': 'http',
                        'url': http_url,
                    })
                    break

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'thumbnail': thumbnail,
            'upload_date': upload_date,
        }