]>
Commit | Line | Data |
---|---|---|
d5822b96 | 1 | import re |
75c94812 | 2 | import json |
8de64cac | 3 | import xml.etree.ElementTree |
d5822b96 PH |
4 | |
5 | from .common import InfoExtractor | |
6 | from ..utils import ( | |
d5822b96 | 7 | ExtractorError, |
df50a412 | 8 | find_xpath_attr, |
d5822b96 | 9 | unified_strdate, |
c40f5cf4 | 10 | determine_ext, |
d5822b96 PH |
11 | ) |
12 | ||
c40f5cf4 JMF |
13 | # There are different sources of video in arte.tv, the extraction process |
14 | # is different for each one. The videos usually expire in 7 days, so we can't | |
15 | # add tests. | |
16 | ||
class ArteTvIE(InfoExtractor):
    """Information extractor for videos.arte.tv and liveweb.arte.tv pages.

    The URL is matched against each site's pattern in turn and handed to the
    corresponding ``_extract_*`` method.
    """

    # Dots in host names and extensions are escaped so that they only match
    # a literal '.'.
    _VIDEOS_URL = r'(?:http://)?videos\.arte\.tv/(?P<lang>fr|de)/.*-(?P<id>.*?)\.html'
    _LIVEWEB_URL = r'(?:http://)?liveweb\.arte\.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    @classmethod
    def suitable(cls, url):
        """Return True if *url* matches one of the supported arte.tv patterns."""
        return any(re.match(regex, url) for regex in (cls._VIDEOS_URL, cls._LIVEWEB_URL))

    # TODO implement Live Stream
    # from ..utils import compat_urllib_parse
    # def extractLiveStream(self, url):
    #     video_lang = url.split('/')[-4]
    #     info = self.grep_webpage(
    #         url,
    #         r'src="(.*?/videothek_js.*?\.js)',
    #         0,
    #         [
    #             (1, 'url', u'Invalid URL: %s' % url)
    #         ]
    #     )
    #     http_host = url.split('/')[2]
    #     next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
    #     info = self.grep_webpage(
    #         next_url,
    #         r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
    #             '(http://.*?\.swf).*?' +
    #             '(rtmp://.*?)\'',
    #         re.DOTALL,
    #         [
    #             (1, 'path',   u'could not extract video path: %s' % url),
    #             (2, 'player', u'could not extract video player: %s' % url),
    #             (3, 'url',    u'could not extract video url: %s' % url)
    #         ]
    #     )
    #     video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def _real_extract(self, url):
        """Dispatch *url* to the extraction routine of the matching site.

        Returns the info dictionary built by ``_extract_video`` or
        ``_extract_liveweb``; returns None when *url* matches no pattern.

        Raises ExtractorError for live stream index pages, which are not
        supported yet.
        """
        mobj = re.match(self._VIDEOS_URL, url)
        if mobj is not None:
            video_id = mobj.group('id')
            lang = mobj.group('lang')
            return self._extract_video(url, video_id, lang)

        mobj = re.match(self._LIVEWEB_URL, url)
        if mobj is not None:
            name = mobj.group('name')
            lang = mobj.group('lang')
            return self._extract_liveweb(url, name, lang)

        # No video id has been parsed at this point, so the live stream
        # pattern has to be tested against the URL itself.
        if re.search(self._LIVE_URL, url) is not None:
            raise ExtractorError(u'Arte live streams are not yet supported, sorry')
            # self.extractLiveStream(url)
            # return

    def _extract_video(self, url, video_id, lang):
        """Extract from videos.arte.tv

        Downloads the player XML derived from *url*, follows the reference
        to the configuration XML for *lang* and returns an info dictionary
        pointing at the best available quality.

        Raises ExtractorError if no configuration or no video URL is found.
        """
        ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
        ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
        ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata')
        ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml)
        config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
        # NOTE(review): find_xpath_attr presumably returns None when nothing
        # matches -- confirm against ..utils.
        if config_node is None:
            raise ExtractorError(
                u'Unable to find the "%s" configuration of video %s' % (lang, video_id))
        config_xml_url = config_node.attrib['ref']
        config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')

        video_urls = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml))
        if not video_urls:
            raise ExtractorError(u'Unable to find any video url for %s' % video_id)

        def _key(m):
            # 'hd' entries rank above every other quality
            return 2 if m.group('quality') == 'hd' else 1

        # We pick the best quality: the sort is stable, so among equally
        # ranked entries the one listed last in the XML wins.
        video_url = sorted(video_urls, key=_key)[-1].group('url')

        title = self._html_search_regex(r'<name>(.*?)</name>', config_xml, 'title')
        thumbnail = self._html_search_regex(r'<firstThumbnailUrl>(.*?)</firstThumbnailUrl>',
                                            config_xml, 'thumbnail')
        return {'id': video_id,
                'title': title,
                'thumbnail': thumbnail,
                'url': video_url,
                'ext': 'flv',
                }

    def _extract_liveweb(self, url, name, lang):
        """Extract from http://liveweb.arte.tv/

        Reads the event id from the page, downloads the event XML and
        returns an info dictionary using the HD url when present and the SD
        url otherwise.

        Raises ExtractorError if the event XML contains neither url.
        """
        webpage = self._download_webpage(url, name)
        video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, u'event id')
        config_xml = self._download_webpage('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id,
                                            video_id, u'Downloading information')
        config_doc = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
        event_doc = config_doc.find('event')
        video_doc = event_doc.find('video')
        url_node = video_doc.find('urlHd')
        if url_node is None:
            # Fall back to the standard definition stream
            url_node = video_doc.find('urlSd')
        if url_node is None:
            raise ExtractorError(u'Unable to find a video url for event %s' % video_id)

        return {'id': video_id,
                'title': event_doc.find('name%s' % lang.capitalize()).text,
                'url': url_node.text.replace('MP4', 'mp4'),
                'ext': 'flv',
                'thumbnail': self._og_search_thumbnail(webpage),
                }
c40f5cf4 JMF |
123 | |
124 | ||
class ArteTVPlus7IE(InfoExtractor):
    """Information extractor for the www.arte.tv/guide (+7) pages.

    The page embeds an ``arte_vp_url`` attribute pointing to a JSON document
    that describes the video and its available formats.
    """

    IE_NAME = u'arte.tv:+7'
    _VALID_URL = r'https?://www\.arte\.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'

    def _real_extract(self, url):
        """Return the info dictionary for the video behind *url*.

        Formats in another language than the url and M3U8 formats are
        dropped; the remaining ones are ordered from worst to best and the
        best one is also merged into the top level of the result.

        Raises ExtractorError if no format is left after filtering.
        """
        mobj = re.match(self._VALID_URL, url)
        lang = mobj.group('lang')
        # This is not a real id, it can be for example AJT for the news
        # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')

        json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
        self.report_extraction(video_id)
        info = json.loads(json_info)
        player_info = info['videoJsonPlayer']

        info_dict = {
            'id': player_info['VID'],
            'title': player_info['VTI'],
            'description': player_info.get('VDE'),
            'upload_date': unified_strdate(player_info.get('VDA', '').split(' ')[0]),
            'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
        }

        # Letter used inside "versionCode" for each url language
        lang_letters = {'fr': 'F', 'de': 'A'}

        def _match_lang(f):
            """Return True if the format is in the language of the url."""
            version_code = f.get('versionCode')
            if version_code is None:
                return True
            letter = lang_letters[lang]
            regexes = [r'VO?%s' % letter, r'VO?.-ST%s' % letter]
            return any(re.match(r, version_code) for r in regexes)

        def _has_same_lang_subtitles(f):
            # A missing or null versionCode is treated as an empty string
            return re.match(r'VO(F|A)-STM\1', f.get('versionCode') or '') is not None

        # Some formats may not be in the same language as the url and some
        # use the m3u8 protocol: both kinds are discarded
        formats = [
            f for f in player_info['VSR'].values()
            if _match_lang(f) and f.get('videoFormat') != 'M3U8'
        ]
        # We order the formats by quality
        formats = sorted(formats, key=lambda f: int(f.get('height', -1)))
        # Prefer videos without subtitles in the same language (stable sort:
        # the height ordering is kept inside each group)
        formats = sorted(formats, key=lambda f: not _has_same_lang_subtitles(f))
        if not formats:
            raise ExtractorError(u'No suitable format found for %s' % video_id)

        def _format(format_info):
            """Convert one entry of the JSON format list to a format dict."""
            fmt = {
                'width': format_info.get('width'),
                'height': format_info.get('height'),
            }
            if format_info['mediaType'] == u'rtmp':
                fmt['url'] = format_info['streamer']
                fmt['play_path'] = 'mp4:' + format_info['url']
                fmt['ext'] = 'flv'
            else:
                fmt['url'] = format_info['url']
                fmt['ext'] = determine_ext(fmt['url'])
            return fmt

        info_dict['formats'] = [_format(f) for f in formats]
        # Pick the best quality
        # TODO: Remove when #980 has been merged
        info_dict.update(info_dict['formats'][-1])

        return info_dict
190 | ||
191 | ||
192 | # Like ArteTVPlus7IE, it reads the arte_vp_url attribute of the webpage to locate the video information |
class ArteTVCreativeIE(ArteTVPlus7IE):
    """Information extractor for the creative.arte.tv magazine pages.

    Only the extractor name, the url pattern and the test case are defined
    here; everything else is inherited from ArteTVPlus7IE.
    """

    IE_NAME = u'arte.tv:creative'

    # "magazine?" accepts both the "magazin" and the "magazine" spelling
    _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de)/magazine?/(?P<id>.+)'

    _TEST = {
        u'url': u'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design',
        u'file': u'050489-002.mp4',
        u'info_dict': {u'title': u'Agentur Amateur #2 - Corporate Design'},
    }
204 |