]>
Commit | Line | Data |
---|---|---|
d5822b96 | 1 | import re |
75c94812 | 2 | import json |
8de64cac | 3 | import xml.etree.ElementTree |
d5822b96 PH |
4 | |
5 | from .common import InfoExtractor | |
6 | from ..utils import ( | |
d5822b96 | 7 | ExtractorError, |
df50a412 | 8 | find_xpath_attr, |
d5822b96 PH |
9 | unified_strdate, |
10 | ) | |
11 | ||
class ArteTvIE(InfoExtractor):
    """
    Information extractor for arte.tv.

    There are two main sources of video in arte.tv: videos.arte.tv and
    www.arte.tv/guide, the extraction process is different for each one.
    liveweb.arte.tv event pages are handled by a third code path.
    The videos expire in 7 days, so we can't add tests.
    """
    _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
    _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html'
    _LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    @classmethod
    def suitable(cls, url):
        """Return True if *url* matches any of the supported arte.tv URL patterns."""
        return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL, cls._LIVEWEB_URL))

    # TODO implement Live Stream
    # from ..utils import compat_urllib_parse
    # def extractLiveStream(self, url):
    #     video_lang = url.split('/')[-4]
    #     info = self.grep_webpage(
    #         url,
    #         r'src="(.*?/videothek_js.*?\.js)',
    #         0,
    #         [
    #             (1, 'url', u'Invalid URL: %s' % url)
    #         ]
    #     )
    #     http_host = url.split('/')[2]
    #     next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
    #     info = self.grep_webpage(
    #         next_url,
    #         r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
    #             '(http://.*?\.swf).*?' +
    #             '(rtmp://.*?)\'',
    #         re.DOTALL,
    #         [
    #             (1, 'path',   u'could not extract video path: %s' % url),
    #             (2, 'player', u'could not extract video player: %s' % url),
    #             (3, 'url',    u'could not extract video url: %s' % url)
    #         ]
    #     )
    #     video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def _real_extract(self, url):
        """
        Dispatch *url* to the extraction routine matching its pattern.

        The patterns are tried in order: guide/emission pages, videos.arte.tv
        pages, then liveweb pages.  Raises ExtractorError for live stream
        URLs (not supported yet) and for URLs matching no known pattern.
        """
        mobj = re.match(self._EMISSION_URL, url)
        if mobj is not None:
            lang = mobj.group('lang')
            # This is not a real id, it can be for example AJT for the news
            # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
            video_id = mobj.group('id')
            return self._extract_emission(url, video_id, lang)

        mobj = re.match(self._VIDEOS_URL, url)
        if mobj is not None:
            video_id = mobj.group('id')
            lang = mobj.group('lang')
            return self._extract_video(url, video_id, lang)

        mobj = re.match(self._LIVEWEB_URL, url)
        if mobj is not None:
            name = mobj.group('name')
            lang = mobj.group('lang')
            return self._extract_liveweb(url, name, lang)

        # No video id is available on this path, so the live pattern is
        # tested against the url itself.
        if re.search(self._LIVE_URL, url) is not None:
            raise ExtractorError(u'Arte live streams are not yet supported, sorry')
            # self.extractLiveStream(url)
            # return

        raise ExtractorError(u'Invalid URL: %s' % url)

    def _extract_emission(self, url, video_id, lang):
        """
        Extract from www.arte.tv/guide

        Downloads the page, follows its ``arte_vp_url`` attribute to the
        player JSON and returns the info dict for the best format whose
        version code matches *lang* ('fr' or 'de').  Raises ExtractorError
        if *lang* is unsupported or no format matches it.
        """
        webpage = self._download_webpage(url, video_id)
        json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')

        json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
        self.report_extraction(video_id)
        info = json.loads(json_info)
        player_info = info['videoJsonPlayer']

        info_dict = {
            'id': player_info['VID'],
            'title': player_info['VTI'],
            'description': player_info.get('VDE'),
            'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]),
            'thumbnail': player_info['programImage'],
            'ext': 'flv',
        }

        # Letter used inside the version codes for each url language
        lang_code = {'fr': 'F', 'de': 'A'}.get(lang)
        if lang_code is None:
            raise ExtractorError(u'Unsupported language: %s' % lang)
        lang_regexes = [r'VO?%s' % lang_code, r'VO?.-ST%s' % lang_code]

        def _match_lang(f):
            # Return true if that format is in the language of the url
            return any(re.match(r, f['versionCode']) for r in lang_regexes)

        def _preference(f):
            # Prefer videos without subtitles in the same language, then
            # order by quality (height); the best format sorts last.
            no_same_lang_subs = re.match(r'VO(F|A)-STM\1', f['versionCode']) is None
            return (no_same_lang_subs, int(f['height']))

        # Some formats may not be in the same language as the url
        formats = [f for f in player_info['VSR'].values() if _match_lang(f)]
        if not formats:
            raise ExtractorError(u'No formats found for language: %s' % lang)
        formats = sorted(formats, key=_preference)

        # Pick the best quality
        format_info = formats[-1]
        if format_info['mediaType'] == u'rtmp':
            info_dict['url'] = format_info['streamer']
            info_dict['play_path'] = 'mp4:' + format_info['url']
        else:
            info_dict['url'] = format_info['url']

        return info_dict

    def _extract_video(self, url, video_id, lang):
        """
        Extract from videos.arte.tv

        Derives the player XML url from the page url, looks up the <video>
        node for *lang*, downloads the configuration it references and
        returns the info dict, preferring the 'hd' quality url.
        """
        ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
        ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
        ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata')
        ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml)
        config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
        config_xml_url = config_node.attrib['ref']
        config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')

        video_urls = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml))

        def _key(m):
            # 'hd' ranks above every other quality
            if m.group('quality') == 'hd':
                return 2
            return 1

        # We pick the best quality
        video_urls = sorted(video_urls, key=_key)
        video_url = video_urls[-1].group('url')

        title = self._html_search_regex(r'<name>(.*?)</name>', config_xml, 'title')
        thumbnail = self._html_search_regex(r'<firstThumbnailUrl>(.*?)</firstThumbnailUrl>',
                                            config_xml, 'thumbnail')
        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'url': video_url,
            'ext': 'flv',
        }

    def _extract_liveweb(self, url, name, lang):
        """
        Extract from http://liveweb.arte.tv/

        Reads the event id from the page, downloads the event XML and
        returns the info dict, using the HD url when the event provides
        one and the SD url otherwise.
        """
        webpage = self._download_webpage(url, name)
        video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, u'event id')
        config_xml = self._download_webpage('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id,
                                            video_id, u'Downloading information')
        config_doc = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
        event_doc = config_doc.find('event')
        video_doc = event_doc.find('video')
        url_node = video_doc.find('urlHd')
        if url_node is None:
            # Fall back to the standard definition url
            url_node = video_doc.find('urlSd')

        return {
            'id': video_id,
            'title': event_doc.find('name%s' % lang.capitalize()).text,
            'url': url_node.text.replace('MP4', 'mp4'),
            'ext': 'flv',
            'thumbnail': self._og_search_thumbnail(webpage),
        }