]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/ard.py
[ard] Encode url (Closes #3412)
[yt-dlp.git] / youtube_dl / extractor / ard.py
CommitLineData
f9b85496
PH
1# coding: utf-8
2from __future__ import unicode_literals
3
d5822b96
PH
4import re
5
6from .common import InfoExtractor
7from ..utils import (
f9b85496 8 determine_ext,
d5822b96 9 ExtractorError,
29546b34 10 qualities,
d5822b96
PH
11)
12
f9b85496 13
class ARDIE(InfoExtractor):
    """Extractor for video pages on ardmediathek.de and mediathek.daserste.de."""

    _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'

    _TESTS = [{
        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
        'file': '22429276.mp4',
        'md5': '469751912f1de0816a9fc9df8336476c',
        'info_dict': {
            'title': 'Vertrauen ist gut, Spionieren ist besser - Geht so deutsch-amerikanische Freundschaft?',
            'description': 'Das Erste Mediathek [ARD]: Vertrauen ist gut, Spionieren ist besser - Geht so deutsch-amerikanische Freundschaft?, Anne Will, Über die Spionage-Affäre diskutieren Clemens Binninger, Katrin Göring-Eckardt, Georg Mascolo, Andrew B. Denison und Constanze Kurz.. Das Video zur Sendung Anne Will am Mittwoch, 16.07.2014',
        },
        'skip': 'Blocked outside of Germany',
    }, {
        'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916',
        'info_dict': {
            'id': '22490580',
            'ext': 'mp4',
            'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)',
            'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.',
        },
        'skip': 'Blocked outside of Germany',
    }]

    def _real_extract(self, url):
        """Extract metadata and the list of formats for one video page.

        The video id is taken from a ``documentId=<digits>`` query parameter
        when the URL has one, otherwise from the ``video_id`` group of
        ``_VALID_URL``.  Formats come from ``mediaCollection.addMediaStream``
        calls embedded in the page; when the page has none, they are read
        from the ``/play/media/<id>`` JSON document instead.

        Returns a dict with the keys ``id``, ``title``, ``description``,
        ``formats`` and ``thumbnail``.

        Raises ExtractorError when the JSON document lists no streams and the
        page contains the ``"fsk"`` marker.
        """
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # The URL is handed over UTF-8 encoded.
        # NOTE(review): on Python 3 this produces bytes - confirm that the
        # download helper accepts a bytes URL.
        webpage = self._download_webpage(url.encode('utf-8'), video_id)

        title = self._html_search_regex(
            [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
             r'<meta name="dcterms.title" content="(.*?)"/>',
             r'<h4 class="headline">(.*?)</h4>'],
            webpage, 'title')
        description = self._html_search_meta(
            'dcterms.abstract', webpage, 'description', default=None)
        if description is None:
            description = self._html_search_meta(
                'description', webpage, 'meta description')

        # Thumbnail is sometimes not present.
        # It is in the mobile version, but that seems to use a different URL
        # structure altogether.
        thumbnail = self._og_search_thumbnail(webpage, default=None)

        media_streams = re.findall(r'''(?x)
            mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
            "([^"]+)"''', webpage)

        if media_streams:
            QUALITIES = qualities(['lo', 'hi', 'hq'])
            formats = []
            # set() drops stream URLs that the page lists more than once
            for furl in set(media_streams):
                if furl.endswith('.f4m'):
                    fid = 'f4m'
                else:
                    # format id is the second-to-last dot-separated component
                    fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
                    fid = fid_m.group(1) if fid_m else None
                formats.append({
                    'quality': QUALITIES(fid),
                    'format_id': fid,
                    'url': furl,
                })
        else:  # request JSON file
            media_info = self._download_json(
                'http://www.ardmediathek.de/play/media/%s' % video_id, video_id)
            # The second element of the _mediaArray contains the standard http urls
            streams = media_info['_mediaArray'][1]['_mediaStreamArray']
            if not streams:
                if '"fsk"' in webpage:
                    raise ExtractorError('This video is only available after 20:00')

            formats = []
            for s in streams:
                if isinstance(s['_stream'], list):
                    # Several URLs share one quality level: walk them in
                    # reverse and add the position to the base quality so
                    # every entry gets its own quality value.
                    for index, stream_url in enumerate(s['_stream'][::-1]):
                        quality = s['_quality'] + index
                        formats.append({
                            'quality': quality,
                            'url': stream_url,
                            'format_id': '%s-%s' % (determine_ext(stream_url), quality)
                        })
                    continue

                fmt = {
                    'quality': s['_quality'],
                    'url': s['_stream'],
                }

                fmt['format_id'] = '%s-%s' % (
                    determine_ext(fmt['url']), fmt['quality'])

                formats.append(fmt)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'thumbnail': thumbnail,
        }