]>
Commit | Line | Data |
---|---|---|
dcdb292f | 1 | # coding: utf-8 |
c060b774 PH |
2 | from __future__ import unicode_literals |
3 | ||
ae287755 PH |
4 | |
5 | from .common import InfoExtractor | |
c678192a ZF |
6 | from ..utils import ( |
7 | ExtractorError, | |
8 | int_or_none, | |
c678192a ZF |
9 | urlencode_postdata |
10 | ) | |
ae287755 PH |
11 | |
12 | ||
class TumblrIE(InfoExtractor):
    """Extractor for videos attached to Tumblr posts.

    Handles ``<blog>.tumblr.com/post/<id>`` and ``.../video/<id>`` URLs.
    Videos served through Tumblr's own player iframe are extracted directly;
    posts without such an iframe are handed over to the generic extractor.
    """

    _VALID_URL = r'https?://(?P<blog_name>[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])'
    _NETRC_MACHINE = 'tumblr'
    _LOGIN_URL = 'https://www.tumblr.com/login'
    _TESTS = [{
        'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes',
        'md5': '479bb068e5b16462f5176a6828829767',
        'info_dict': {
            'id': '54196191430',
            'ext': 'mp4',
            'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...',
            'description': 'md5:390ab77358960235b6937ab3b8528956',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 127,
        }
    }, {
        'url': 'https://maskofthedragon.tumblr.com/post/626907179849564160/mona-talking-in-english',
        'md5': 'f43ff8a8861712b6cf0e0c2bd84cfc68',
        'info_dict': {
            'id': '626907179849564160',
            'ext': 'mp4',
            'title': 'Me roast is buggered!, Mona\xa0“talking” in\xa0“english”',
            'description': 'md5:082a3a621530cb786ad2b7592a6d9e2c',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 7,
        },
        'params': {
            'format': 'hd',
        },
    }, {
        'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching',
        'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab',
        'info_dict': {
            'id': 'Wmur',
            'ext': 'mp4',
            'title': 'naked smoking & stretching',
            'upload_date': '20150506',
            'timestamp': 1430931613,
            'age_limit': 18,
            'uploader_id': '1638622',
            'uploader': 'naked-yogi',
        },
        # 'add_ie': ['Vidme'],
        'skip': 'dead embedded video host'
    }, {
        'url': 'https://dominustempori.tumblr.com/post/673572712813297664/youtubes-all-right-for-some-pretty-cool',
        'md5': '5e45724c70b748f64f5a1731ac72c84a',
        'info_dict': {
            'id': '87816359',
            'ext': 'mp4',
            'title': 'Harold Ramis',
            'uploader': 'Resolution Productions Group',
            'uploader_id': 'resolutionproductions',
            'uploader_url': 'https://vimeo.com/resolutionproductions',
            'thumbnail': r're:^https?://i.vimeocdn.com/video/.*',
            'duration': 291,
        },
        'add_ie': ['Vimeo'],
    }, {
        'url': 'http://sutiblr.tumblr.com/post/139638707273',
        'md5': '2dd184b3669e049ba40563a7d423f95c',
        'info_dict': {
            'id': 'ir7qBEIKqvq',
            'ext': 'mp4',
            'title': 'Vine by sutiblr',
            'alt_title': 'Vine by sutiblr',
            'uploader': 'sutiblr',
            'uploader_id': '1198993975374495744',
            'upload_date': '20160220',
            'like_count': int,
            'comment_count': int,
            'repost_count': int,
            'thumbnail': r're:^https?://.*\.jpg',
            'timestamp': 1455940159,
            'view_count': int,
        },
        'add_ie': ['Vine'],
    }, {
        'url': 'https://silami.tumblr.com/post/84250043974/my-bad-river-flows-in-you-impression-on-maschine',
        'md5': '3c92d7c3d867f14ccbeefa2119022277',
        'info_dict': {
            'id': 'nYtvtTPuTl',
            'ext': 'mp4',
            'title': 'Video by silbulterman',
            'description': '#maschine',
            'uploader_id': '242859024',
            'thumbnail': r're:^https?://.*\.jpg',
            'timestamp': 1398801174,
            'like_count': int,
            'uploader': 'Sil',
            'channel': 'silbulterman',
            'comment_count': int,
            'upload_date': '20140429',
        },
        'add_ie': ['Instagram'],
    }]

    def _real_initialize(self):
        """Attempt a login; `_login` does nothing when no username is set."""
        self._login()

    def _login(self):
        """Submit the Tumblr login form with the configured credentials.

        Returns silently when no username is available, or when the URL
        reached after posting the form contains ``/dashboard``.

        Raises:
            ExtractorError: if the response page lists login errors; the
                first listed error is included in the message.
        """
        username, password = self._get_login_info()
        if username is None:
            return

        login_page = self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login page')

        # Start from the hidden inputs found on the login page and add the
        # credential fields on top of them.
        login_form = self._hidden_inputs(login_page)
        login_form.update({
            'user[email]': username,
            'user[password]': password
        })

        response, urlh = self._download_webpage_handle(
            self._LOGIN_URL, None, 'Logging in',
            data=urlencode_postdata(login_form), headers={
                'Content-Type': 'application/x-www-form-urlencoded',
                'Referer': self._LOGIN_URL,
            })

        # Successful login
        if '/dashboard' in urlh.geturl():
            return

        # On failure the page may embed a JSON array of messages assigned to
        # `RegistrationForm.errors`; fall back to an empty list when absent.
        login_errors = self._parse_json(
            self._search_regex(
                r'RegistrationForm\.errors\s*=\s*(\[.+?\])\s*;', response,
                'login errors', default='[]'),
            None, fatal=False)
        if login_errors:
            raise ExtractorError(
                'Unable to login: %s' % login_errors[0], expected=True)

        # Neither a dashboard redirect nor an explicit error was seen.
        self.report_warning('Login has probably failed')

    def _real_extract(self, url):
        """Extract the video attached to a Tumblr post.

        Downloads the canonical post page, then either delegates to the
        generic extractor (no Tumblr player iframe found) or reads the
        player iframe for the SD source and optional HD source.

        Returns:
            A ``url_result`` for delegated posts, otherwise an info dict
            with ``id``, ``title``, ``description``, ``thumbnail``,
            ``duration`` and ``formats``.

        Raises:
            ExtractorError: if the request is redirected to a safe-mode page.
        """
        m_url = self._match_valid_url(url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalise to the plain post URL regardless of the slug or the
        # /video/ form that was passed in.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage, urlh = self._download_webpage_handle(url, video_id)

        redirect_url = urlh.geturl()
        if 'tumblr.com/safe-mode' in redirect_url or redirect_url.startswith('/safe-mode'):
            raise ExtractorError(
                'This Tumblr may contain sensitive media. '
                'Disable safe mode in your account settings '
                'at https://www.tumblr.com/settings/account#safe_mode',
                expected=True)

        iframe_url = self._search_regex(
            r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',
            webpage, 'iframe url', default=None)
        if iframe_url is None:
            # No native player: pass the inline embed iframe if there is
            # one, else the post page itself, to the generic extractor.
            iframe_url = self._search_regex(
                r'src=["\'](https?://safe\.txmblr\.com/svc/embed/inline/[^"\']+)["\']',
                webpage, 'embed iframe url', default=None)
            return self.url_result(iframe_url or redirect_url, 'Generic')

        iframe = self._download_webpage(
            iframe_url, video_id, 'Downloading iframe page',
            headers={'Referer': redirect_url})

        duration = None
        sources = []

        sd_url = self._search_regex(
            r'<source[^>]+src=(["\'])(?P<url>.+?)\1', iframe,
            'sd video url', default=None, group='url')
        if sd_url:
            sources.append((sd_url, 'sd'))

        # Only attempt JSON parsing when the attribute is present: an empty
        # string is not valid JSON and would be reported as a parse failure
        # even though the attribute is simply missing.
        options_json = self._search_regex(
            r'data-crt-options=(["\'])(?P<options>.+?)\1', iframe,
            'hd video url', default=None, group='options')
        options = None
        if options_json:
            options = self._parse_json(options_json, video_id, fatal=False)
        # Guard against a non-object payload before calling .get on it.
        if options and isinstance(options, dict):
            duration = int_or_none(options.get('duration'))
            hd_url = options.get('hdUrl')
            if hd_url:
                sources.append((hd_url, 'hd'))

        # `quality` is the position in `sources`; 'sd' is appended before
        # 'hd', so the HD entry receives the larger value. The height is
        # taken from a trailing "/<3-4 digits>" URL segment when present.
        formats = [{
            'url': video_url,
            'ext': 'mp4',
            'format_id': format_id,
            'height': int_or_none(self._search_regex(
                r'/(\d{3,4})$', video_url, 'height', default=None)),
            'quality': quality,
        } for quality, (video_url, format_id) in enumerate(sources)]

        self._sort_formats(formats)

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(
            r'(?s)<title>(?P<title>.*?)(?: \| Tumblr)?</title>',
            webpage, 'title')

        return {
            'id': video_id,
            'title': video_title,
            'description': self._og_search_description(webpage, default=None),
            'thumbnail': self._og_search_thumbnail(webpage, default=None),
            'duration': duration,
            'formats': formats,
        }