]>
Commit | Line | Data |
---|---|---|
69a0c470 | 1 | # encoding: utf-8 |
d5822b96 | 2 | import re |
75c94812 | 3 | import json |
d5822b96 PH |
4 | |
5 | from .common import InfoExtractor | |
6 | from ..utils import ( | |
d5822b96 | 7 | ExtractorError, |
df50a412 | 8 | find_xpath_attr, |
d5822b96 | 9 | unified_strdate, |
c40f5cf4 | 10 | determine_ext, |
69a0c470 | 11 | get_element_by_id, |
566d4e04 | 12 | compat_str, |
56a8ab7d | 13 | get_element_by_attribute, |
d5822b96 PH |
14 | ) |
15 | ||
c40f5cf4 JMF |
16 | # There are different sources of video in arte.tv, the extraction process |
17 | # is different for each one. The videos usually expire in 7 days, so we can't | |
18 | # add tests. | |
19 | ||
class ArteTvIE(InfoExtractor):
    """Extractor for videos.arte.tv and liveweb.arte.tv pages.

    Two URL families are handled, each with its own extraction routine:
    URLs matching ``_VIDEOS_URL`` go through ``_extract_video`` and URLs
    matching ``_LIVEWEB_URL`` go through ``_extract_liveweb``.
    """
    _VIDEOS_URL = r'(?:http://)?videos\.arte\.tv/(?P<lang>fr|de)/.*-(?P<id>.*?)\.html'
    _LIVEWEB_URL = r'(?:http://)?liveweb\.arte\.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    @classmethod
    def suitable(cls, url):
        """Return True if url matches one of the two supported URL patterns."""
        return any(re.match(regex, url) for regex in (cls._VIDEOS_URL, cls._LIVEWEB_URL))

    # TODO implement Live Stream
    # from ..utils import compat_urllib_parse
    # def extractLiveStream(self, url):
    #     video_lang = url.split('/')[-4]
    #     info = self.grep_webpage(
    #         url,
    #         r'src="(.*?/videothek_js.*?\.js)',
    #         0,
    #         [
    #             (1, 'url', u'Invalid URL: %s' % url)
    #         ]
    #     )
    #     http_host = url.split('/')[2]
    #     next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
    #     info = self.grep_webpage(
    #         next_url,
    #         r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
    #             '(http://.*?\.swf).*?' +
    #             '(rtmp://.*?)\'',
    #         re.DOTALL,
    #         [
    #             (1, 'path',   u'could not extract video path: %s' % url),
    #             (2, 'player', u'could not extract video player: %s' % url),
    #             (3, 'url',    u'could not extract video url: %s' % url)
    #         ]
    #     )
    #     video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def _real_extract(self, url):
        """Dispatch url to the extraction routine matching its pattern.

        Raises ExtractorError for live-stream index pages, which are not
        supported. Returns None implicitly when no pattern applies.
        """
        mobj = re.match(self._VIDEOS_URL, url)
        if mobj is not None:
            # Named video_id rather than id to avoid shadowing the builtin.
            video_id = mobj.group('id')
            lang = mobj.group('lang')
            return self._extract_video(url, video_id, lang)

        mobj = re.match(self._LIVEWEB_URL, url)
        if mobj is not None:
            name = mobj.group('name')
            lang = mobj.group('lang')
            return self._extract_liveweb(url, name, lang)

        # Only reached when neither pattern above matched.
        if re.search(self._LIVE_URL, url) is not None:
            raise ExtractorError(u'Arte live streams are not yet supported, sorry')
            # self.extractLiveStream(url)
            # return

    def _extract_video(self, url, video_id, lang):
        """Extract from videos.arte.tv

        Downloads the player XML derived from the page url, follows the
        ``ref`` attribute of the ``video`` node for ``lang`` to the
        configuration document and picks the best listed url ('hd' is
        preferred over every other quality).

        Raises ExtractorError when the language node or the url list is
        missing.
        """
        ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
        ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
        ref_xml_doc = self._download_xml(ref_xml_url, video_id, note=u'Downloading metadata')
        config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
        # NOTE(review): find_xpath_attr presumably returns None when nothing
        # matches; fail with a readable message instead of an AttributeError.
        if config_node is None:
            raise ExtractorError(u'Unable to find the video configuration for language "%s"' % lang)
        config_xml_url = config_node.attrib['ref']
        config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')

        video_urls = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml))
        if not video_urls:
            raise ExtractorError(u'Unable to find any video url in the configuration')

        def _key(m):
            # 'hd' ranks above every other quality label
            return 2 if m.group('quality') == 'hd' else 1
        # We pick the best quality: the stable sort keeps document order
        # among equals, so the last entry is the last-listed best one.
        video_url = sorted(video_urls, key=_key)[-1].group('url')

        title = self._html_search_regex(r'<name>(.*?)</name>', config_xml, 'title')
        thumbnail = self._html_search_regex(r'<firstThumbnailUrl>(.*?)</firstThumbnailUrl>',
                                            config_xml, 'thumbnail')
        return {'id': video_id,
                'title': title,
                'thumbnail': thumbnail,
                'url': video_url,
                'ext': 'flv',
                }

    def _extract_liveweb(self, url, name, lang):
        """Extract from http://liveweb.arte.tv/

        Reads the event id from the page, downloads the matching event XML
        and returns the HD url when present, otherwise the SD url.

        Raises ExtractorError when neither url node exists.
        """
        webpage = self._download_webpage(url, name)
        video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, u'event id')
        config_doc = self._download_xml('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id,
                                        video_id, u'Downloading information')
        event_doc = config_doc.find('event')
        video_node = event_doc.find('video')
        url_node = video_node.find('urlHd')
        if url_node is None:
            # Look for the SD url next to where the HD one is expected ...
            url_node = video_node.find('urlSd')
        if url_node is None:
            # ... and keep the historical lookup directly under <event>.
            url_node = event_doc.find('urlSd')
        if url_node is None:
            raise ExtractorError(u'Unable to find the video url')

        return {'id': video_id,
                'title': event_doc.find('name%s' % lang.capitalize()).text,
                'url': url_node.text.replace('MP4', 'mp4'),
                'ext': 'flv',
                'thumbnail': self._og_search_thumbnail(webpage),
                }
c40f5cf4 JMF |
124 | |
125 | ||
class ArteTVPlus7IE(InfoExtractor):
    """Extractor for www.arte.tv/guide pages.

    The page embeds an ``arte_vp_url`` attribute pointing to a JSON
    document that describes the video and its formats. Subclasses reuse
    ``_extract_from_webpage`` / ``_extract_from_json_url``.
    """
    IE_NAME = u'arte.tv:+7'
    # The dot in "arte.tv" is escaped so that it only matches a literal dot.
    _VALID_URL = r'https?://www\.arte\.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'

    @classmethod
    def _extract_url_info(cls, url):
        """Return the (video_id, lang) groups of cls._VALID_URL for url."""
        mobj = re.match(cls._VALID_URL, url)
        lang = mobj.group('lang')
        # This is not a real id, it can be for example AJT for the news
        # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
        video_id = mobj.group('id')
        return video_id, lang

    def _real_extract(self, url):
        """Download the page at url and extract the video it embeds."""
        video_id, lang = self._extract_url_info(url)
        webpage = self._download_webpage(url, video_id)
        return self._extract_from_webpage(webpage, video_id, lang)

    def _extract_from_webpage(self, webpage, video_id, lang):
        """Find the arte_vp_url attribute in webpage and extract from it."""
        json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
        return self._extract_from_json_url(json_url, video_id, lang)

    def _extract_from_json_url(self, json_url, video_id, lang):
        """Build the info dict from the player JSON found at json_url.

        Formats using M3U8 are dropped, the remaining ones are filtered by
        the language of the url and sorted by quality.

        Raises ExtractorError when no usable format is left.
        """
        json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
        self.report_extraction(video_id)
        info = json.loads(json_info)
        player_info = info['videoJsonPlayer']

        info_dict = {
            'id': player_info['VID'],
            'title': player_info['VTI'],
            'description': player_info.get('VDE'),
            'upload_date': unified_strdate(player_info.get('VDA', '').split(' ')[0]),
            'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
        }

        # Some formats use the m3u8 protocol
        all_formats = [f for f in player_info['VSR'].values()
                       if f.get('videoFormat') != 'M3U8']

        # Letter used in the version codes for the language of the url;
        # None for an unexpected language, which then matches no tagged format.
        lang_letter = {'fr': 'F', 'de': 'A'}.get(lang)

        def _match_lang(f):
            if f.get('versionCode') is None:
                return True
            if lang_letter is None:
                return False
            # Return true if that format is in the language of the url
            regexes = [r'VO?%s' % lang_letter, r'VO?.-ST%s' % lang_letter]
            return any(re.match(r, f['versionCode']) for r in regexes)
        # Some formats may not be in the same language as the url
        formats = [f for f in all_formats if _match_lang(f)]
        if not formats:
            # Some videos are only available in the 'Originalversion'
            # they aren't tagged as being in French or German
            # (all() is true for an empty list, hence the explicit emptiness
            # check: without it formats[0] below would raise IndexError)
            if all_formats and all(f['versionCode'] == 'VO' for f in all_formats):
                formats = all_formats
            else:
                raise ExtractorError(u'The formats list is empty')

        # A missing quality label selects the height/bitrate ordering
        # instead of failing with a KeyError.
        if re.match(r'[A-Z]Q', formats[0].get('quality') or '') is not None:
            def sort_key(f):
                return ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality'])
        else:
            def sort_key(f):
                return (
                    # Sort first by quality
                    int(f.get('height', -1)),
                    int(f.get('bitrate', -1)),
                    # The original version with subtitles has lower relevance
                    re.match(r'VO-ST(F|A)', f.get('versionCode', '')) is None,
                    # The version with sourds/mal subtitles has also lower relevance
                    re.match(r'VO?(F|A)-STM\1', f.get('versionCode', '')) is None,
                )
        formats = sorted(formats, key=sort_key)

        def _format(format_info):
            # Build a format id of the form "<height>-<bitrate>[-<versionCode>]"
            quality = ''
            height = format_info.get('height')
            if height is not None:
                quality = compat_str(height)
            bitrate = format_info.get('bitrate')
            if bitrate is not None:
                # int() mirrors sort_key, which also tolerates numeric strings
                quality += '-%d' % int(bitrate)
            if format_info.get('versionCode') is not None:
                format_id = u'%s-%s' % (quality, format_info['versionCode'])
            else:
                format_id = quality
            info = {
                'format_id': format_id,
                'format_note': format_info.get('versionLibelle'),
                'width': format_info.get('width'),
                'height': height,
            }
            if format_info['mediaType'] == u'rtmp':
                info['url'] = format_info['streamer']
                info['play_path'] = 'mp4:' + format_info['url']
                info['ext'] = 'flv'
            else:
                info['url'] = format_info['url']
                info['ext'] = determine_ext(info['url'])
            return info
        info_dict['formats'] = [_format(f) for f in formats]

        return info_dict
230 | ||
231 | ||
232 | # It also uses the arte_vp_url url from the webpage to extract the information | |
class ArteTVCreativeIE(ArteTVPlus7IE):
    """Extractor for creative.arte.tv magazine pages.

    Only the URL pattern, the extractor name and the test differ from the
    parent class; extraction itself is inherited unchanged.
    """
    IE_NAME = u'arte.tv:creative'
    _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de)/magazine?/(?P<id>.+)'

    _TEST = {
        u'url': u'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design',
        u'file': u'050489-002.mp4',
        u'info_dict': {u'title': u'Agentur Amateur / Agence Amateur #2 : Corporate Design'},
    }
244 | ||
69a0c470 JMF |
245 | |
class ArteTVFutureIE(ArteTVPlus7IE):
    """Extractor for future.arte.tv articles addressed by an anchor id.

    The numeric part of the ``#article-anchor-<id>`` fragment selects the
    page element that is searched for the player url.
    """
    IE_NAME = u'arte.tv:future'
    _VALID_URL = r'https?://future\.arte\.tv/(?P<lang>fr|de)/(thema|sujet)/.*?#article-anchor-(?P<id>\d+)'

    _TEST = {
        u'url': u'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081',
        u'file': u'050940-003.mp4',
        u'info_dict': {
            u'title': u'Les champignons au secours de la planète',
        },
    }

    def _real_extract(self, url):
        """Extract the video embedded in the element named by the anchor id.

        Raises ExtractorError when the element is not found in the page.
        """
        anchor_id, lang = self._extract_url_info(url)
        webpage = self._download_webpage(url, anchor_id)
        row = get_element_by_id(anchor_id, webpage)
        # NOTE(review): get_element_by_id presumably returns None when the
        # id is absent; report that clearly instead of letting the regex
        # search fail on a non-string.
        if row is None:
            raise ExtractorError(u'Unable to find the article with id "%s"' % anchor_id)
        return self._extract_from_webpage(row, anchor_id, lang)
56a8ab7d | 263 | |
ac5118bc | 264 | |
56a8ab7d CD |
class ArteTVDDCIE(ArteTVPlus7IE):
    """Extractor for ddc.arte.tv pages.

    The page references a javascript player generator whose source
    contains the url of the player JSON.
    """
    IE_NAME = u'arte.tv:ddc'
    # "https?" accepts both schemes; the former "http?" made the "p"
    # optional and therefore never matched https urls.
    _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)'

    def _real_extract(self, url):
        """Locate the player JSON url via the page's script and extract it.

        Raises ExtractorError when the video block is not found in the page.
        """
        video_id, lang = self._extract_url_info(url)
        # The url carries the section name rather than a language code.
        if lang == 'folge':
            lang = 'de'
        elif lang == 'emission':
            lang = 'fr'
        webpage = self._download_webpage(url, video_id)
        script_element = get_element_by_attribute('class', 'visu_video_block', webpage)
        # NOTE(review): get_element_by_attribute presumably returns None when
        # no element matches; fail with a readable message in that case.
        if script_element is None:
            raise ExtractorError(u'Unable to find the video block')
        script_url = self._html_search_regex(r'src="(.*?)"', script_element, 'script url')
        player_generator = self._download_webpage(script_url, video_id, 'Download javascript player generator')
        json_url = self._search_regex(r"json_url=(.*)&rendering_place.*", player_generator, 'json url')
        return self._extract_from_json_url(json_url, video_id, lang)