]>
Commit | Line | Data |
---|---|---|
0155549d M |
1 | # coding: utf-8 |
2 | from __future__ import unicode_literals | |
3 | ||
4 | import re | |
5 | ||
6 | from .common import InfoExtractor | |
7 | from ..utils import ( | |
8 | ExtractorError, | |
9 | compat_urllib_parse, | |
10 | get_meta_content, | |
11 | parse_iso8601, | |
12 | ) | |
13 | ||
14 | ||
class HeiseIE(InfoExtractor):
    """Information extractor for video articles on heise.de.

    The article page embeds a quoted config URL. Its query parameters are
    re-sent to ``_PREFIX`` to download a JSON document that lists the
    available formats and may also carry a title and a poster image.
    """

    _VALID_URL = (
        r'^https?://(?:www\.)?heise\.de/video/artikel/' +
        r'.+?(?P<id>[0-9]+)\.html$'
    )
    _TEST = {
        'url': (
            'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-' +
            'Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html'
        ),
        'md5': 'ffed432483e922e88545ad9f2f15d30e',
        'info_dict': {
            'id': '2404147',
            'ext': 'mp4',
            'title': (
                "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / " +
                "Peilsender Smartphone"
            ),
            'format_id': 'mp4_720',
            'timestamp': 1411812600,
            'upload_date': '20140927',
        }
    }

    # Matches the quoted config URL inside the article HTML and captures its
    # query parameters; ``hd`` and ``signature`` are optional.
    _CONFIG = (
        r'".+?\?sequenz=(?P<sequenz>.+?)&container=(?P<container>.+?)' +
        r'(?:&hd=(?P<hd>.+?))?(?:&signature=(?P<signature>.+?))?&callback=\?"'
    )
    # Endpoint the captured parameters are appended to.
    _PREFIX = 'http://www.heise.de/videout/info?'

    def _warn(self, fmt, *args):
        """Report a warning tagged with the id of the current video.

        ``fmt`` is a ``str.format`` template that is filled with ``args``.
        Reads ``self._id``, which ``_real_extract`` sets before any warning
        can be issued.
        """
        self.report_warning(fmt.format(*args), self._id)

    def _parse_config_url(self, html):
        """Return the info URL built from the config parameters in ``html``.

        Raises ExtractorError when ``_CONFIG`` does not match.
        """
        m = re.search(self._CONFIG, html)
        if not m:
            raise ExtractorError('No config found')

        # Optional groups that did not take part in the match are None and
        # must not be sent as query parameters.
        qs = compat_urllib_parse.urlencode(dict(
            (k, v) for k, v in m.groupdict().items() if v is not None))
        return self._PREFIX + qs

    def _real_extract(self, url):
        """Extract id, title, formats, thumbnail and timestamp for ``url``.

        Raises ExtractorError when the page carries no config URL or the
        downloaded config has no usable ``formats`` mapping.
        """
        mobj = re.match(self._VALID_URL, url)
        # Kept on the instance so that _warn() can tag its messages.
        self._id = mobj.group('id')

        html = self._download_webpage(url, self._id)
        config = self._download_json(self._parse_config_url(html), self._id)

        info = {
            'id': self._id
        }

        # Prefer the page's "fulltitle" meta tag, then the config title, and
        # finally a fixed placeholder.
        title = get_meta_content('fulltitle', html)
        if title:
            info['title'] = title
        elif config.get('title'):
            info['title'] = config['title']
        else:
            self._warn('title: not found')
            info['title'] = 'heise'

        if (not config.get('formats') or
                not hasattr(config['formats'], 'items')):
            raise ExtractorError('No formats found')

        # config['formats'] is walked as {type: {resolution: entry}}.
        formats = []
        for t, rs in config['formats'].items():
            if not rs or not hasattr(rs, 'items'):
                self._warn('formats: {0}: no resolutions', t)
                continue

            for res, obj in rs.items():
                format_id = '{0}_{1}'.format(t, res)

                # Guard against non-mapping entries the same way the outer
                # levels do, instead of failing with AttributeError.
                if not obj or not hasattr(obj, 'get') or not obj.get('url'):
                    self._warn('formats: {0}: no url', format_id)
                    continue

                fmt = {
                    'url': obj['url'],
                    'format_id': format_id
                }
                # The resolution key is used as the height when it is numeric;
                # otherwise the format is kept without a height.
                try:
                    fmt['height'] = int(res)
                except ValueError as e:
                    self._warn('formats: {0}: height: {1}', format_id, e)

                formats.append(fmt)

        self._sort_formats(formats)
        info['formats'] = formats

        if config.get('poster'):
            info['thumbnail'] = config['poster']

        date = get_meta_content('date', html)
        if date:
            # A malformed date only costs the timestamp, not the extraction.
            try:
                info['timestamp'] = parse_iso8601(date)
            except ValueError as e:
                self._warn('timestamp: {0}', e)

        return info