]>
Commit | Line | Data |
---|---|---|
69a0c470 | 1 | # encoding: utf-8 |
3798eadc PH |
2 | from __future__ import unicode_literals |
3 | ||
d5822b96 | 4 | import re |
75c94812 | 5 | import json |
d5822b96 PH |
6 | |
7 | from .common import InfoExtractor | |
8 | from ..utils import ( | |
d5822b96 | 9 | ExtractorError, |
df50a412 | 10 | find_xpath_attr, |
d5822b96 | 11 | unified_strdate, |
c40f5cf4 | 12 | determine_ext, |
69a0c470 | 13 | get_element_by_id, |
566d4e04 | 14 | compat_str, |
56a8ab7d | 15 | get_element_by_attribute, |
d5822b96 PH |
16 | ) |
17 | ||
c40f5cf4 JMF |
18 | # There are different sources of video in arte.tv, the extraction process |
19 | # is different for each one. The videos usually expire in 7 days, so we can't | |
20 | # add tests. | |
21 | ||
class ArteTvIE(InfoExtractor):
    """Information extractor for videos.arte.tv and liveweb.arte.tv pages.

    ``suitable`` accepts every url matching ``_VIDEOS_URL`` or
    ``_LIVEWEB_URL``; ``_real_extract`` then dispatches to the helper that
    handles the matching site.
    """

    _VIDEOS_URL = r'(?:http://)?videos\.arte\.tv/(?P<lang>fr|de)/.*-(?P<id>.*?)\.html'
    _LIVEWEB_URL = r'(?:http://)?liveweb\.arte\.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'
    # Url suffix that _real_extract reports as an unsupported live stream
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = 'arte.tv'

    @classmethod
    def suitable(cls, url):
        """Return True if url matches the videos or the liveweb pattern."""
        return any(re.match(regex, url) for regex in (cls._VIDEOS_URL, cls._LIVEWEB_URL))

    # TODO implement Live Stream
    # from ..utils import compat_urllib_parse
    # def extractLiveStream(self, url):
    #     video_lang = url.split('/')[-4]
    #     info = self.grep_webpage(
    #         url,
    #         r'src="(.*?/videothek_js.*?\.js)',
    #         0,
    #         [
    #             (1, 'url', 'Invalid URL: %s' % url)
    #         ]
    #     )
    #     http_host = url.split('/')[2]
    #     next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
    #     info = self.grep_webpage(
    #         next_url,
    #         r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
    #             '(http://.*?\.swf).*?' +
    #             '(rtmp://.*?)\'',
    #         re.DOTALL,
    #         [
    #             (1, 'path', 'could not extract video path: %s' % url),
    #             (2, 'player', 'could not extract video player: %s' % url),
    #             (3, 'url', 'could not extract video url: %s' % url)
    #         ]
    #     )
    #     video_url = '%s/%s' % (info.get('url'), info.get('path'))

    def _real_extract(self, url):
        """Dispatch url to the extraction helper of the site it belongs to.

        Raises ExtractorError for live stream pages and for urls that match
        none of the known patterns (the original silently returned None in
        the latter case).
        """
        mobj = re.match(self._VIDEOS_URL, url)
        if mobj is not None:
            return self._extract_video(url, mobj.group('id'), mobj.group('lang'))

        mobj = re.match(self._LIVEWEB_URL, url)
        if mobj is not None:
            return self._extract_liveweb(url, mobj.group('name'), mobj.group('lang'))

        # NOTE(review): this check only runs for urls that matched neither
        # pattern above, i.e. urls that suitable() rejects as well.
        if re.search(self._LIVE_URL, url) is not None:
            raise ExtractorError('Arte live streams are not yet supported, sorry')
            # self.extractLiveStream(url)
            # return

        raise ExtractorError('Unsupported arte.tv url: %s' % url)

    def _extract_video(self, url, video_id, lang):
        """Extract from videos.arte.tv

        Downloads the player XML derived from url, follows the ``ref`` of the
        <video> node whose ``lang`` attribute equals lang, and returns an info
        dict with 'id', 'title', 'thumbnail', 'url' and 'ext'.

        Raises ExtractorError if no <video> node for lang or no
        <url quality="..."> entry is found.
        """
        ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
        ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
        ref_xml_doc = self._download_xml(ref_xml_url, video_id, note='Downloading metadata')
        config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
        if config_node is None:
            raise ExtractorError('Unable to find the configuration for language %s' % lang)
        config_xml_url = config_node.attrib['ref']
        config_xml = self._download_webpage(config_xml_url, video_id, note='Downloading configuration')

        video_urls = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml))
        if not video_urls:
            raise ExtractorError('Unable to find any video url')

        def _key(m):
            # 'hd' entries rank above every other quality
            return 2 if m.group('quality') == 'hd' else 1

        # We pick the best quality: the sort is stable, so among entries of
        # equal rank the one listed last in the document wins.
        video_url = sorted(video_urls, key=_key)[-1].group('url')

        title = self._html_search_regex(r'<name>(.*?)</name>', config_xml, 'title')
        thumbnail = self._html_search_regex(r'<firstThumbnailUrl>(.*?)</firstThumbnailUrl>',
                                            config_xml, 'thumbnail')
        return {'id': video_id,
                'title': title,
                'thumbnail': thumbnail,
                'url': video_url,
                'ext': 'flv',
                }

    def _extract_liveweb(self, url, name, lang):
        """Extract from http://liveweb.arte.tv/

        Reads the event id from the page at url, downloads the matching event
        XML and returns an info dict with 'id', 'title', 'url', 'ext' and
        'thumbnail'.

        Raises ExtractorError if neither an HD nor an SD url node is found.
        """
        webpage = self._download_webpage(url, name)
        video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, 'event id')
        config_doc = self._download_xml('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id,
                                        video_id, 'Downloading information')
        event_doc = config_doc.find('event')
        video_node = event_doc.find('video')

        url_node = None
        if video_node is not None:
            url_node = video_node.find('urlHd')
            if url_node is None:
                # NOTE(review): assumes <urlSd> sits next to <urlHd> inside
                # <video> -- confirm against a real event document.
                url_node = video_node.find('urlSd')
        if url_node is None:
            # Location the original code looked at; kept as a fallback
            url_node = event_doc.find('urlSd')
        if url_node is None:
            raise ExtractorError('Unable to find the video url')

        return {'id': video_id,
                'title': event_doc.find('name%s' % lang.capitalize()).text,
                'url': url_node.text.replace('MP4', 'mp4'),
                'ext': 'flv',
                'thumbnail': self._og_search_thumbnail(webpage),
                }
c40f5cf4 JMF |
126 | |
127 | ||
class ArteTVPlus7IE(InfoExtractor):
    """Information extractor for www.arte.tv/guide pages.

    The page embeds an ``arte_vp_url`` attribute pointing at a json document
    that describes the video and its formats; subclasses reuse
    ``_extract_from_webpage`` / ``_extract_from_json_url`` for other sites.
    """

    IE_NAME = 'arte.tv:+7'
    _VALID_URL = r'https?://www\.arte\.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'

    @classmethod
    def _extract_url_info(cls, url):
        """Return the (video_id, lang) pair captured by ``cls._VALID_URL``.

        Raises ExtractorError if url does not match the pattern.
        """
        mobj = re.match(cls._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        lang = mobj.group('lang')
        # This is not a real id, it can be for example AJT for the news
        # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
        video_id = mobj.group('id')
        return video_id, lang

    def _real_extract(self, url):
        """Download the page at url and extract the video it embeds."""
        video_id, lang = self._extract_url_info(url)
        webpage = self._download_webpage(url, video_id)
        return self._extract_from_webpage(webpage, video_id, lang)

    def _extract_from_webpage(self, webpage, video_id, lang):
        """Locate the ``arte_vp_url`` attribute in webpage and extract from it."""
        json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
        return self._extract_from_json_url(json_url, video_id, lang)

    def _extract_from_json_url(self, json_url, video_id, lang):
        """Build the info dict from the player json found at json_url.

        Formats whose 'videoFormat' is 'M3U8' are dropped, the rest are
        filtered by lang ('fr' or 'de') through their 'versionCode' and then
        sorted in ascending order of the sort key.

        Raises ExtractorError if no usable format is left.
        """
        json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
        self.report_extraction(video_id)
        info = json.loads(json_info)
        player_info = info['videoJsonPlayer']

        info_dict = {
            'id': player_info['VID'],
            'title': player_info['VTI'],
            'description': player_info.get('VDE'),
            'upload_date': unified_strdate(player_info.get('VDA', '').split(' ')[0]),
            'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
        }

        # Some formats use the m3u8 protocol
        all_formats = [
            f for f in player_info['VSR'].values()
            if f.get('videoFormat') != 'M3U8'
        ]

        # Letter used inside 'versionCode' for the language of the url; an
        # unknown language yields no patterns instead of an unbound local.
        lang_letter = {'fr': 'F', 'de': 'A'}.get(lang)
        if lang_letter is None:
            lang_regexes = []
        else:
            lang_regexes = [r'VO?%s' % lang_letter, r'VO?.-ST%s' % lang_letter]

        def _match_lang(f):
            version_code = f.get('versionCode')
            if version_code is None:
                return True
            # Return true if that format is in the language of the url
            return any(re.match(r, version_code) for r in lang_regexes)

        # Some formats may not be in the same language as the url
        formats = [f for f in all_formats if _match_lang(f)]
        if not formats:
            # Some videos are only available in the 'Originalversion'
            # they aren't tagged as being in French or German.
            # Every rejected format has a 'versionCode' (formats without one
            # always match), so the lookup below cannot fail; the explicit
            # emptiness test keeps all() from passing vacuously.
            if all_formats and all(f['versionCode'] == 'VO' for f in all_formats):
                formats = all_formats
            else:
                raise ExtractorError('The formats list is empty')

        if re.match(r'[A-Z]Q', formats[0].get('quality') or '') is not None:
            quality_order = ['HQ', 'MQ', 'EQ', 'SQ']

            def sort_key(f):
                return quality_order.index(f['quality'])
        else:
            def sort_key(f):
                version_code = f.get('versionCode', '')
                return (
                    # Sort first by quality
                    int(f.get('height', -1)),
                    int(f.get('bitrate', -1)),
                    # The original version with subtitles has lower relevance
                    re.match(r'VO-ST(F|A)', version_code) is None,
                    # The version with sourds/mal subtitles has also lower relevance
                    re.match(r'VO?(F|A)-STM\1', version_code) is None,
                )
        formats = sorted(formats, key=sort_key)

        def _format(format_info):
            # The format id is '<height>-<bitrate>[-<versionCode>]', with the
            # parts that are missing from format_info left out.
            quality = ''
            height = format_info.get('height')
            if height is not None:
                quality = compat_str(height)
            bitrate = format_info.get('bitrate')
            if bitrate is not None:
                quality += '-%d' % bitrate
            if format_info.get('versionCode') is not None:
                format_id = '%s-%s' % (quality, format_info['versionCode'])
            else:
                format_id = quality
            info = {
                'format_id': format_id,
                'format_note': format_info.get('versionLibelle'),
                'width': format_info.get('width'),
                'height': height,
            }
            if format_info['mediaType'] == 'rtmp':
                info['url'] = format_info['streamer']
                info['play_path'] = 'mp4:' + format_info['url']
                info['ext'] = 'flv'
            else:
                info['url'] = format_info['url']
                info['ext'] = determine_ext(info['url'])
            return info
        info_dict['formats'] = [_format(f) for f in formats]

        return info_dict
232 | ||
233 | ||
234 | # It also uses the arte_vp_url url from the webpage to extract the information | |
class ArteTVCreativeIE(ArteTVPlus7IE):
    """Information extractor for creative.arte.tv magazine pages.

    Only the url pattern and the test case are defined here; the extraction
    itself is inherited from ArteTVPlus7IE.
    """

    IE_NAME = 'arte.tv:creative'
    _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de)/magazine?/(?P<id>.+)'

    _TEST = {
        'info_dict': {
            'title': 'Agentur Amateur / Agence Amateur #2 : Corporate Design',
        },
        'file': '050489-002.mp4',
        'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design',
    }
246 | ||
69a0c470 JMF |
247 | |
class ArteTVFutureIE(ArteTVPlus7IE):
    """Information extractor for future.arte.tv article anchors.

    The url fragment ``#article-anchor-<id>`` selects one element of the
    page; the video is extracted from that element only.
    """

    IE_NAME = 'arte.tv:future'
    _VALID_URL = r'https?://future\.arte\.tv/(?P<lang>fr|de)/(thema|sujet)/.*?#article-anchor-(?P<id>\d+)'

    _TEST = {
        'url': 'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081',
        'file': '050940-003.mp4',
        'info_dict': {
            'title': 'Les champignons au secours de la planète',
        },
    }

    def _real_extract(self, url):
        """Extract the video embedded in the element the url anchor points at.

        Raises ExtractorError if the page has no element for the anchor id.
        """
        anchor_id, lang = self._extract_url_info(url)
        webpage = self._download_webpage(url, anchor_id)
        row = get_element_by_id(anchor_id, webpage)
        # presumably get_element_by_id returns None when nothing matches --
        # fail with a clear message instead of inside the regex search
        if row is None:
            raise ExtractorError('Unable to find the article with id %s' % anchor_id)
        return self._extract_from_webpage(row, anchor_id, lang)
56a8ab7d | 265 | |
ac5118bc | 266 | |
class ArteTVDDCIE(ArteTVPlus7IE):
    """Information extractor for ddc.arte.tv pages.

    The first path component ('emission' or 'folge') is captured as the
    ``lang`` group and translated to 'fr' / 'de' before the shared json
    extraction is run.
    """

    IE_NAME = 'arte.tv:ddc'
    # 'https?' accepts both schemes; the previous 'http?' made the final 'p'
    # optional and therefore never matched https urls.
    _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)'

    def _real_extract(self, url):
        """Follow the player script of the page to the json url and extract it.

        Raises ExtractorError if the page has no 'visu_video_block' element.
        """
        video_id, lang = self._extract_url_info(url)
        # Map the url path component to the language code used by the parent
        lang = {'folge': 'de', 'emission': 'fr'}.get(lang, lang)
        webpage = self._download_webpage(url, video_id)
        script_element = get_element_by_attribute('class', 'visu_video_block', webpage)
        # presumably None is returned when no element matches -- fail with a
        # clear message instead of inside the regex search
        if script_element is None:
            raise ExtractorError('Unable to find the video block')
        script_url = self._html_search_regex(r'src="(.*?)"', script_element, 'script url')
        player_generator = self._download_webpage(script_url, video_id, 'Download javascript player generator')
        json_url = self._search_regex(r"json_url=(.*)&rendering_place.*", player_generator, 'json url')
        return self._extract_from_json_url(json_url, video_id, lang)