]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/ard.py
[generic] Prevent <video> search from skipping over empty sources (#3546)
[yt-dlp.git] / youtube_dl / extractor / ard.py
CommitLineData
f9b85496
PH
1# coding: utf-8
2from __future__ import unicode_literals
3
d5822b96
PH
4import re
5
6from .common import InfoExtractor
7from ..utils import (
f9b85496 8 determine_ext,
d5822b96 9 ExtractorError,
29546b34 10 qualities,
5622f29a
S
11 compat_urllib_parse_urlparse,
12 compat_urllib_parse,
d5822b96
PH
13)
14
f9b85496 15
class ARDIE(InfoExtractor):
    """Extractor for video pages on ardmediathek.de and mediathek.daserste.de."""

    _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'

    _TESTS = [{
        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
        'md5': '469751912f1de0816a9fc9df8336476c',
        'info_dict': {
            'id': '22429276',
            'ext': 'mp4',
            'title': 'Vertrauen ist gut, Spionieren ist besser - Geht so deutsch-amerikanische Freundschaft?',
            'description': 'Das Erste Mediathek [ARD]: Vertrauen ist gut, Spionieren ist besser - Geht so deutsch-amerikanische Freundschaft?, Anne Will, Über die Spionage-Affäre diskutieren Clemens Binninger, Katrin Göring-Eckardt, Georg Mascolo, Andrew B. Denison und Constanze Kurz.. Das Video zur Sendung Anne Will am Mittwoch, 16.07.2014',
        },
        'skip': 'Blocked outside of Germany',
    }, {
        'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916',
        'info_dict': {
            'id': '22490580',
            'ext': 'mp4',
            'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)',
            'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.',
        },
        'skip': 'Blocked outside of Germany',
    }]

    def _real_extract(self, url):
        """Extract metadata and the list of formats for one video page.

        The video id is the ``documentId`` query parameter when the URL has
        one, otherwise the ``video_id`` group of ``_VALID_URL``.

        Formats are read from the ``mediaCollection.addMediaStream(...)``
        calls embedded in the page; when the page contains none, they are
        read from the JSON document at ``/play/media/<video_id>``.

        Returns a dict with the keys ``id``, ``title``, ``description``,
        ``formats`` and ``thumbnail`` (``thumbnail`` may be None).

        Raises ExtractorError (flagged as expected) when the page states
        that the video is no longer available, or when the JSON document
        lists no streams and the page contains an ``"fsk"`` marker.
        """
        # Prefer the explicit numeric documentId; fall back to the id
        # captured by _VALID_URL.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        # Percent-encode the path component before requesting the page.
        urlp = compat_urllib_parse_urlparse(url)
        url = urlp._replace(
            path=compat_urllib_parse.quote(urlp.path.encode('utf-8'))).geturl()

        webpage = self._download_webpage(url, video_id)

        if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage:
            raise ExtractorError(
                'Video %s is no longer available' % video_id, expected=True)

        title = self._html_search_regex(
            [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
             r'<meta name="dcterms.title" content="(.*?)"/>',
             r'<h4 class="headline">(.*?)</h4>'],
            webpage, 'title')
        description = self._html_search_meta(
            'dcterms.abstract', webpage, 'description', default=None)
        if description is None:
            description = self._html_search_meta(
                'description', webpage, 'meta description')

        # Thumbnail is sometimes not present.
        # It is in the mobile version, but that seems to use a different URL
        # structure altogether.
        thumbnail = self._og_search_thumbnail(webpage, default=None)

        media_streams = re.findall(r'''(?x)
            mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
            "([^"]+)"''', webpage)

        formats = []
        if media_streams:
            QUALITIES = qualities(['lo', 'hi', 'hq'])
            # set() drops stream URLs that are listed more than once.
            for furl in set(media_streams):
                if furl.endswith('.f4m'):
                    fid = 'f4m'
                else:
                    # The format id is the second-to-last dot-separated
                    # component of the URL, if there is one.
                    fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
                    fid = fid_m.group(1) if fid_m else None
                formats.append({
                    'quality': QUALITIES(fid),
                    'format_id': fid,
                    'url': furl,
                })
        else:  # request JSON file
            media_info = self._download_json(
                'http://www.ardmediathek.de/play/media/%s' % video_id, video_id)
            # The second element of the _mediaArray contains the standard http urls
            streams = media_info['_mediaArray'][1]['_mediaStreamArray']
            if not streams and '"fsk"' in webpage:
                # Presumably a time-of-day restriction tied to the age
                # rating. This is a site-side restriction, not an extractor
                # bug, so flag it as expected like the error above.
                raise ExtractorError(
                    'This video is only available after 20:00', expected=True)

            for s in streams:
                stream = s['_stream']
                if isinstance(stream, list):
                    # Several URLs for one entry: walk them in reverse and
                    # add the position to the entry's base quality.
                    for index, stream_url in enumerate(stream[::-1]):
                        quality = s['_quality'] + index
                        formats.append({
                            'quality': quality,
                            'url': stream_url,
                            'format_id': '%s-%s' % (
                                determine_ext(stream_url), quality),
                        })
                    continue

                formats.append({
                    'quality': s['_quality'],
                    'url': stream,
                    'format_id': '%s-%s' % (
                        determine_ext(stream), s['_quality']),
                })

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'thumbnail': thumbnail,
        }