]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/golem.py
[golem] Add new extractor
[yt-dlp.git] / youtube_dl / extractor / golem.py
CommitLineData
6a5af6ac
M
1# coding: utf-8
2from __future__ import unicode_literals
3
4import re
5
6from .common import InfoExtractor
7from ..utils import compat_urlparse
8
9
class GolemIE(InfoExtractor):
    """Information extractor for videos hosted on video.golem.de.

    The video id is taken from the page URL, the matching XML config
    document is downloaded, and formats, thumbnails, title and duration are
    read from it.  Malformed optional fields only produce a warning; they
    never abort the extraction.
    """

    _VALID_URL = r'^https?://video\.golem\.de/.+?/(?P<id>.+?)/'
    _TEST = {
        'url': 'http://video.golem.de/handy/14095/iphone-6-und-6-plus-test.html',
        'md5': 'c1a2c0a3c863319651c7c992c5ee29bf',
        'info_dict': {
            'id': '14095',
            'format_id': 'high',
            'ext': 'mp4',
            'title': 'iPhone 6 und 6 Plus - Test',
            'duration': 300,
            'filesize': 65309548,
        }
    }

    # Template for the per-video XML config; formatted with the video id.
    _CONFIG = 'https://video.golem.de/xml/{}.xml'
    # Base against which the URLs found in the config are resolved.
    _PREFIX = 'http://video.golem.de'

    # Id of the video currently being extracted.  Set by _real_extract();
    # the class-level default keeps _warn() usable before that happens.
    _id = None

    def _warn(self, fmt, *args):
        """Report ``fmt.format(*args)`` as a warning for the current video."""
        self.report_warning(fmt.format(*args), self._id)

    def _store_int(self, target, key, raw, warn_fmt, *warn_args):
        """Store ``int(raw)`` under ``target[key]``.

        Does nothing when ``raw`` is None.  When ``raw`` is not a valid
        integer, ``target`` is left untouched and a warning is emitted using
        ``warn_fmt`` formatted with ``warn_args`` followed by the error.
        """
        if raw is None:
            return
        try:
            target[key] = int(raw)
        except ValueError as e:
            self._warn(warn_fmt, *(warn_args + (e,)))

    def _extract_format(self, elem):
        """Build a format dict from one config element with a <url> child.

        The element's tag name is used as the format id.  Returns None (after
        a warning) when the URL is empty; otherwise a dict with 'format_id'
        and 'url' plus, when present and valid, 'ext', 'filesize', 'width'
        and 'height'.
        """
        format_id = elem.tag

        # Treat missing, empty and whitespace-only URLs alike, and drop the
        # padding a pretty-printed document may put around the value.
        url = (elem.findtext('./url') or '').strip()
        if not url:
            self._warn("{}: url: empty, skipping", format_id)
            return None

        fmt = {
            'format_id': format_id,
            'url': compat_urlparse.urljoin(self._PREFIX, url)
        }

        # The extension is whatever follows the last dot of <filename>;
        # unpacking fails with ValueError when there is no dot at all.
        try:
            _, ext = elem.findtext('./filename', '').rsplit('.', 1)
        except ValueError:
            self._warn('{}: ext: missing extension', format_id)
        else:
            fmt['ext'] = ext

        self._store_int(
            fmt, 'filesize', elem.findtext('./filesize'),
            '{}: filesize: {}', format_id)
        self._store_int(
            fmt, 'width', elem.get('width'),
            '{}: width: {}', format_id)
        self._store_int(
            fmt, 'height', elem.get('height'),
            '{}: height: {}', format_id)

        return fmt

    def _extract_thumbnail(self, elem):
        """Build a thumbnail dict from a config element with a <url> child.

        Returns None when the URL is empty; otherwise a dict with 'url' plus,
        when present and valid, 'width' and 'height'.
        """
        url = (elem.findtext('./url') or '').strip()
        if not url:
            return None

        thumb = {
            'url': compat_urlparse.urljoin(self._PREFIX, url)
        }

        self._store_int(
            thumb, 'width', elem.get('width'), 'thumbnail: width: {}')
        self._store_int(
            thumb, 'height', elem.get('height'), 'thumbnail: height: {}')

        return thumb

    def _real_extract(self, url):
        """Extract the info dict for the video page at ``url``.

        Returns a dict with 'id', 'title', 'formats' and 'thumbnails', plus
        'duration' (integer) when the config carries a valid <playtime>.
        """
        mobj = re.match(self._VALID_URL, url)
        self._id = mobj.group('id')

        config = self._download_xml(self._CONFIG.format(self._id), self._id)

        info = {
            'id': self._id,
            'title': config.findtext('./title', 'golem')
        }

        # Every direct child that has a <url> child describes one format.
        formats = []
        for e in config.findall('./*[url]'):
            fmt = self._extract_format(e)
            if fmt is not None:
                formats.append(fmt)
        self._sort_formats(formats)
        info['formats'] = formats

        thumbnails = []
        for e in config.findall('.//teaser[url]'):
            thumb = self._extract_thumbnail(e)
            if thumb is not None:
                thumbnails.append(thumb)
        info['thumbnails'] = thumbnails

        playtime = config.findtext('./playtime')
        if playtime is not None:
            try:
                # round() returns a float on Python 2, so convert explicitly
                # to get an integer duration on every interpreter.
                info['duration'] = int(round(float(playtime)))
            except (ValueError, OverflowError) as e:
                # ValueError: not a number (or NaN); OverflowError: infinity.
                self._warn('duration: {}', e)

        return info