]>
Commit | Line | Data |
---|---|---|
93b22c78 | 1 | # encoding: utf-8 |
b3d14cbf PH |
2 | import json |
3 | import re | |
caeefc29 | 4 | import itertools |
b3d14cbf PH |
5 | |
6 | from .common import InfoExtractor | |
7 | from ..utils import ( | |
8 | compat_urllib_parse, | |
9 | compat_urllib_request, | |
10 | ||
11 | clean_html, | |
12 | get_element_by_attribute, | |
13 | ExtractorError, | |
55b3e45b | 14 | RegexNotFoundError, |
b3d14cbf | 15 | std_headers, |
9d4660ca | 16 | unsmuggle_url, |
b3d14cbf PH |
17 | ) |
18 | ||
bbafbe20 | 19 | |
b3d14cbf PH |
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)?
        (?:(?:www|(?P<player>player))\.)?
        vimeo(?P<pro>pro)?\.com/
        (?:.*?/)?
        (?P<direct_link>play_redirect_hls\?clip_id=)?
        (?:videos?/)?
        (?P<id>[0-9]+)
        /?(?:[?].*)?(?:[#].*)?$'''
    _NETRC_MACHINE = 'vimeo'
    IE_NAME = u'vimeo'
    # Pattern locating the xsrft token in a downloaded page; the same token
    # is sent back both as a form field and as a cookie.
    _XSRFT_RE = r'xsrft: \'(.*?)\''
    _TESTS = [
        {
            u'url': u'http://vimeo.com/56015672#at=0',
            u'file': u'56015672.mp4',
            u'md5': u'8879b6cc097e987f02484baf890129e5',
            u'info_dict': {
                u"upload_date": u"20121220",
                u"description": u"This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
                u"uploader_id": u"user7108434",
                u"uploader": u"Filippo Valsorda",
                u"title": u"youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
            },
        },
        {
            u'url': u'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876',
            u'file': u'68093876.mp4',
            u'md5': u'3b5ca6aa22b60dfeeadf50b72e44ed82',
            u'note': u'Vimeo Pro video (#1197)',
            u'info_dict': {
                u'uploader_id': u'openstreetmapus',
                u'uploader': u'OpenStreetMap US',
                u'title': u'Andy Allan - Putting the Carto into OpenStreetMap Cartography',
            },
        },
        {
            u'url': u'http://player.vimeo.com/video/54469442',
            u'file': u'54469442.mp4',
            u'md5': u'619b811a4417aa4abe78dc653becf511',
            u'note': u'Videos that embed the url in the player page',
            u'info_dict': {
                u'title': u'Kathy Sierra: Building the minimum Badass User, Business of Software',
                u'uploader': u'The BLN & Business of Software',
            },
        },
        {
            u'url': u'http://vimeo.com/68375962',
            u'file': u'68375962.mp4',
            u'md5': u'aaf896bdb7ddd6476df50007a0ac0ae7',
            u'note': u'Video protected with password',
            u'info_dict': {
                u'title': u'youtube-dl password protected test video',
                u'upload_date': u'20130614',
                u'uploader_id': u'user18948128',
                u'uploader': u'Jaime Marquínez Ferrándiz',
            },
            u'params': {
                u'videopassword': u'youtube-dl',
            },
        },
    ]

    def _extract_xsrft_token(self, webpage):
        """Return the xsrft token found in ``webpage``.

        The lookup goes through ``_search_regex`` so that a page without the
        token produces an extractor error instead of an ``AttributeError``
        on a failed ``re.search``.
        """
        return self._search_regex(self._XSRFT_RE, webpage, u'xsrft token')

    def _login(self):
        """Log in to Vimeo when login credentials are available.

        Does nothing when ``_get_login_info`` yields no username.
        """
        (username, password) = self._get_login_info()
        if username is None:
            return
        self.report_login()
        login_url = 'https://vimeo.com/log_in'
        webpage = self._download_webpage(login_url, None, False)
        token = self._extract_xsrft_token(webpage)
        data = compat_urllib_parse.urlencode({
            'email': username,
            'password': password,
            'action': 'login',
            'service': 'vimeo',
            'token': token,
        })
        login_request = compat_urllib_request.Request(login_url, data)
        login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        login_request.add_header('Cookie', 'xsrft=%s' % token)
        self._download_webpage(login_request, None, False, u'Wrong login info')

    def _verify_video_password(self, url, video_id, webpage):
        """Submit the ``videopassword`` option for a protected video.

        Raises ExtractorError when no video password was supplied.
        """
        password = self._downloader.params.get('videopassword', None)
        if password is None:
            raise ExtractorError(u'This video is protected by a password, use the --video-password option')
        token = self._extract_xsrft_token(webpage)
        data = compat_urllib_parse.urlencode({
            'password': password,
            'token': token,
        })
        # I didn't manage to use the password with https
        if url.startswith('https'):
            # Only the leading scheme is downgraded; any later "https" inside
            # the URL is left untouched.
            pass_url = url.replace('https', 'http', 1)
        else:
            pass_url = url
        password_request = compat_urllib_request.Request(pass_url + '/password', data)
        password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        password_request.add_header('Cookie', 'xsrft=%s' % token)
        self._download_webpage(password_request, video_id,
                               u'Verifying the password',
                               u'Wrong password')

    def _real_initialize(self):
        """Perform the optional login before any extraction."""
        self._login()

    def _real_extract(self, url):
        """Extract metadata and the available formats for one Vimeo video.

        Returns a dict with the video id, uploader data, upload date, title,
        thumbnail, description, formats, page URL and view/like/comment
        counts. Raises ExtractorError when the URL is invalid, when the
        config data cannot be located, or when no known codec is offered.
        """
        url, data = unsmuggle_url(url)
        headers = std_headers
        if data is not None:
            # Copy first so the shared default headers are never mutated.
            headers = headers.copy()
            headers.update(data)

        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if mobj.group('pro') or mobj.group('player'):
            url = 'http://player.vimeo.com/video/' + video_id
        else:
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all
        # extractors, and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            try:
                config_url = self._html_search_regex(
                    r' data-config-url="(.+?)"', webpage, u'config URL')
                config_json = self._download_webpage(config_url, video_id)
                config = json.loads(config_json)
            except RegexNotFoundError:
                # For pro videos or player.vimeo.com urls
                # We try to find out to which variable the config dict is
                # assigned
                m_variable_name = re.search(r'(\w)\.video\.id', webpage)
                if m_variable_name is not None:
                    config_re = r'%s=({.+?});' % re.escape(m_variable_name.group(1))
                else:
                    config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});']
                config = self._search_regex(config_re, webpage, u'info section',
                                            flags=re.DOTALL)
                config = json.loads(config)
        except Exception as e:
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')

            if re.search('<form[^>]+?id="pw_form"', webpage) is not None:
                self._verify_video_password(url, video_id, webpage)
                # NOTE(review): the retry receives the plain URL, so any
                # headers smuggled into the original URL are not re-applied.
                return self._real_extract(url)
            else:
                raise ExtractorError(u'Unable to extract info section',
                                     cause=e)

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"].get("thumbnail")
        if video_thumbnail is None:
            # Fall back to the widest entry of the "thumbs" mapping.
            _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in config["video"]["thumbs"].items())[-1]

        # Extract video description
        video_description = None
        try:
            video_description = get_element_by_attribute("itemprop", "description", webpage)
            if video_description:
                video_description = clean_html(video_description)
        except AssertionError as err:
            # On some pages like (http://player.vimeo.com/video/54469442) the
            # html tags are not closed, python 2.6 cannot handle it
            if not err.args or err.args[0] != 'we should not get here!':
                raise

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        try:
            view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, u'view count'))
            like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, u'like count'))
            comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, u'comment count'))
        except RegexNotFoundError:
            # This info is only available in vimeo.com/{id} urls
            view_count = None
            like_count = None
            comment_count = None

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        codecs = [('vp6', 'flv'), ('vp8', 'flv'), ('h264', 'mp4')]
        files = {'hd': [], 'sd': [], 'other': []}
        # Fall back to an empty mapping so that a config without any "files"
        # entry ends in the "No known codec found" error below.
        config_files = config["video"].get("files") or config["request"].get("files") or {}
        for codec_name, codec_extension in codecs:
            codec_files = config_files.get(codec_name, [])
            for quality in codec_files:
                format_id = '-'.join((codec_name, quality)).lower()
                key = quality if quality in files else 'other'
                video_url = None
                if isinstance(codec_files, dict):
                    file_info = codec_files[quality]
                    video_url = file_info.get('url')
                else:
                    file_info = {}
                if video_url is None:
                    video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                        % (video_id, sig, timestamp, quality, codec_name.upper())

                files[key].append({
                    'ext': codec_extension,
                    'url': video_url,
                    'format_id': format_id,
                    'width': file_info.get('width'),
                    'height': file_info.get('height'),
                })
        formats = []
        for key in ('other', 'sd', 'hd'):
            formats += files[key]
        if not formats:
            raise ExtractorError(u'No known codec found')

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'formats': formats,
            'webpage_url': url,
            'view_count': view_count,
            'like_count': like_count,
            'comment_count': comment_count,
        }
caeefc29 JMF |
274 | |
275 | ||
class VimeoChannelIE(InfoExtractor):
    """Playlist extractor for vimeo.com/channels/<id> pages."""

    IE_NAME = u'vimeo:channel'
    # The dot before "com" is escaped so that only the literal host matches
    # and the pattern contains no unknown escape sequence.
    _VALID_URL = r'(?:https?://)?vimeo\.com/channels/(?P<id>[^/]+)'
    _MORE_PAGES_INDICATOR = r'<a.+?rel="next"'
    _TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"'

    def _page_url(self, base_url, pagenum):
        """Return the URL of page number ``pagenum`` of the list."""
        return '%s/videos/page:%d/' % (base_url, pagenum)

    def _extract_list_title(self, webpage):
        """Return the list title matched by ``_TITLE_RE`` in ``webpage``."""
        return self._html_search_regex(self._TITLE_RE, webpage, u'list title')

    def _extract_videos(self, list_id, base_url):
        """Collect the clip ids of every page and return a playlist dict.

        Pages are downloaded starting at 1 until a page no longer matches
        ``_MORE_PAGES_INDICATOR``. The title is read from the last page
        downloaded.
        """
        video_ids = []
        for pagenum in itertools.count(1):
            webpage = self._download_webpage(
                self._page_url(base_url, pagenum), list_id,
                u'Downloading page %s' % pagenum)
            video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage))
            if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
                break

        entries = [self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo')
                   for video_id in video_ids]
        return {
            '_type': 'playlist',
            'id': list_id,
            'title': self._extract_list_title(webpage),
            'entries': entries,
        }

    def _real_extract(self, url):
        """Return the playlist for the channel named in ``url``."""
        mobj = re.match(self._VALID_URL, url)
        channel_id = mobj.group('id')
        return self._extract_videos(channel_id, 'http://vimeo.com/channels/%s' % channel_id)
310 | ||
311 | ||
class VimeoUserIE(VimeoChannelIE):
    """Playlist extractor for vimeo.com/<name> user pages."""

    IE_NAME = u'vimeo:user'
    # The dot before "com" is escaped so that only the literal host matches
    # and the pattern contains no unknown escape sequence.
    _VALID_URL = r'(?:https?://)?vimeo\.com/(?P<name>[^/]+)'
    _TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>'

    @classmethod
    def suitable(cls, url):
        """Accept ``url`` only when no more specific Vimeo extractor does.

        The user pattern is a catch-all for a single path component, so the
        channel, video, album and group extractors are consulted first.
        """
        if VimeoChannelIE.suitable(url) or VimeoIE.suitable(url) or VimeoAlbumIE.suitable(url) or VimeoGroupsIE.suitable(url):
            return False
        return super(VimeoUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return the playlist for the user named in ``url``."""
        mobj = re.match(self._VALID_URL, url)
        name = mobj.group('name')
        return self._extract_videos(name, 'http://vimeo.com/%s' % name)
5cc14c2f JMF |
327 | |
328 | ||
class VimeoAlbumIE(VimeoChannelIE):
    """Playlist extractor for vimeo.com/album/<id> pages."""

    IE_NAME = u'vimeo:album'
    # The dot before "com" is escaped so that only the literal host matches
    # and the pattern contains no unknown escape sequence.
    _VALID_URL = r'(?:https?://)?vimeo\.com/album/(?P<id>\d+)'
    _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'

    def _page_url(self, base_url, pagenum):
        """Return the URL of page number ``pagenum`` of the album."""
        return '%s/page:%d/' % (base_url, pagenum)

    def _real_extract(self, url):
        """Return the playlist for the album identified in ``url``."""
        mobj = re.match(self._VALID_URL, url)
        album_id = mobj.group('id')
        return self._extract_videos(album_id, 'http://vimeo.com/album/%s' % album_id)
fb30ec22 JMF |
341 | |
342 | ||
class VimeoGroupsIE(VimeoAlbumIE):
    """Playlist extractor for vimeo.com/groups/<name> pages."""

    IE_NAME = u'vimeo:group'
    # The dot before "com" is escaped so that only the literal host matches
    # and the pattern contains no unknown escape sequence.
    _VALID_URL = r'(?:https?://)?vimeo\.com/groups/(?P<name>[^/]+)'

    def _extract_list_title(self, webpage):
        """Return the list title via ``_og_search_title`` on ``webpage``."""
        return self._og_search_title(webpage)

    def _real_extract(self, url):
        """Return the playlist for the group named in ``url``."""
        mobj = re.match(self._VALID_URL, url)
        name = mobj.group('name')
        return self._extract_videos(name, 'http://vimeo.com/groups/%s' % name)