jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/heise.py
Merge remote-tracking branch 'd912e3/heise'
[yt-dlp.git] / youtube_dl / extractor / heise.py
CommitLineData
0155549d
M
1# coding: utf-8
2from __future__ import unicode_literals
3
4import re
5
6from .common import InfoExtractor
7from ..utils import (
8 ExtractorError,
9 compat_urllib_parse,
10 get_meta_content,
11 parse_iso8601,
12)
13
14
class HeiseIE(InfoExtractor):
    """Extractor for video articles under heise.de/video/artikel/.

    The article page is downloaded first; a player config URL embedded in
    that page is then rebuilt against ``_PREFIX`` and fetched as JSON.  The
    JSON is expected to carry a ``formats`` mapping of the shape
    ``{type: {resolution: {'url': ...}}}`` and may carry ``title`` and
    ``poster`` entries.
    """

    # Article URLs end in a numeric id followed by ".html".
    _VALID_URL = (
        r'^https?://(?:www\.)?heise\.de/video/artikel/' +
        r'.+?(?P<id>[0-9]+)\.html$'
    )
    _TEST = {
        'url': (
            'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-' +
            'Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html'
        ),
        'md5': 'ffed432483e922e88545ad9f2f15d30e',
        'info_dict': {
            'id': '2404147',
            'ext': 'mp4',
            'title': (
                "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / " +
                "Peilsender Smartphone"
            ),
            'format_id': 'mp4_720',
            'timestamp': 1411812600,
            'upload_date': '20140927',
        }
    }

    # Matches the quoted config URL inside the article HTML and captures its
    # query parameters; ``hd`` and ``signature`` are optional.
    _CONFIG = (
        r'".+?\?sequenz=(?P<sequenz>.+?)&container=(?P<container>.+?)' +
        r'(?:&hd=(?P<hd>.+?))?(?:&signature=(?P<signature>.+?))?&callback=\?"'
    )
    # Base URL the captured query parameters are appended to.
    _PREFIX = 'http://www.heise.de/videout/info?'

    def _warn(self, fmt, *args):
        """Format ``fmt`` with ``args`` and report it as a warning.

        The message is passed to ``self.report_warning`` together with the
        id stored in ``self._id`` (set by ``_real_extract``).
        """
        self.report_warning(fmt.format(*args), self._id)

    def _parse_config_url(self, html):
        """Build the JSON config URL from the article page.

        Searches ``html`` with ``_CONFIG`` and re-encodes every captured
        parameter that actually matched onto ``_PREFIX``.

        Raises:
            ExtractorError: if ``html`` contains no match for ``_CONFIG``.
        """
        m = re.search(self._CONFIG, html)
        if not m:
            raise ExtractorError('No config found')

        # Optional groups that did not participate in the match are None
        # and must not end up in the query string.
        params = dict(
            (key, value) for key, value in m.groupdict().items()
            if value is not None)
        return self._PREFIX + compat_urllib_parse.urlencode(params)

    def _extract_title(self, html, config):
        """Pick a title: page meta 'fulltitle', then config, then 'heise'."""
        title = get_meta_content('fulltitle', html)
        if title:
            return title
        if config.get('title'):
            return config['title']
        self._warn('title: not found')
        return 'heise'

    def _extract_formats(self, config):
        """Turn ``config['formats']`` into a sorted list of format dicts.

        Malformed entries are skipped with a warning rather than aborting
        the whole extraction.

        Raises:
            ExtractorError: if ``config`` has no usable ``formats`` mapping.
        """
        if (not config.get('formats') or
                not hasattr(config['formats'], 'items')):
            raise ExtractorError('No formats found')

        formats = []
        for t, rs in config['formats'].items():
            if not rs or not hasattr(rs, 'items'):
                self._warn('formats: {0}: no resolutions', t)
                continue

            for res, obj in rs.items():
                format_id = '{0}_{1}'.format(t, res)

                # Guard against non-mapping entries (e.g. a list or string)
                # the same way the resolution level is guarded above;
                # otherwise obj.get would raise AttributeError.
                if (not obj or not hasattr(obj, 'get') or
                        not obj.get('url')):
                    self._warn('formats: {0}: no url', format_id)
                    continue

                fmt = {
                    'url': obj['url'],
                    'format_id': format_id,
                }
                try:
                    fmt['height'] = int(res)
                except ValueError as e:
                    # Report the full format id so the warning names the
                    # exact type/resolution entry, like the one above.
                    self._warn('formats: {0}: height: {1}', format_id, e)

                formats.append(fmt)

        self._sort_formats(formats)
        return formats

    def _real_extract(self, url):
        """Extract the info dict for the article at ``url``.

        Returns a dict with ``id``, ``title`` and ``formats``, plus
        ``thumbnail`` when the config has a ``poster`` and ``timestamp``
        when the page has a parseable 'date' meta entry.

        Raises:
            ExtractorError: if the page has no config URL or the config has
                no formats.
        """
        mobj = re.match(self._VALID_URL, url)
        # Kept on the instance because _warn reads it.
        self._id = mobj.group('id')

        html = self._download_webpage(url, self._id)
        config = self._download_json(self._parse_config_url(html), self._id)

        info = {
            'id': self._id
        }
        # Title is resolved before formats so a missing-title warning is
        # still emitted when format extraction raises.
        info['title'] = self._extract_title(html, config)
        info['formats'] = self._extract_formats(config)

        if config.get('poster'):
            info['thumbnail'] = config['poster']

        date = get_meta_content('date', html)
        if date:
            try:
                info['timestamp'] = parse_iso8601(date)
            except ValueError as e:
                self._warn('timestamp: {0}', e)

        return info