]>
Commit | Line | Data |
---|---|---|
ae287755 | 1 | from .common import InfoExtractor |
c678192a ZF |
2 | from ..utils import ( |
3 | ExtractorError, | |
4 | int_or_none, | |
bed30106 | 5 | traverse_obj, |
c678192a ZF |
6 | urlencode_postdata |
7 | ) | |
ae287755 PH |
8 | |
9 | ||
class TumblrIE(InfoExtractor):
    # Extractor for video posts on <blog>.tumblr.com.
    #
    # Metadata is read from the Tumblr v2 API when an API token could be scraped
    # from the login page; otherwise only the public post page (and its video
    # iframe, if any) is used. Posts whose page redirects to a login/safe-mode/
    # dashboard URL can only be handled through the API.
    _VALID_URL = r'https?://(?P<blog_name>[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])'
    _NETRC_MACHINE = 'tumblr'
    _LOGIN_URL = 'https://www.tumblr.com/login'
    _OAUTH_URL = 'https://www.tumblr.com/api/v2/oauth2/token'
    _TESTS = [{
        'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes',
        'md5': '479bb068e5b16462f5176a6828829767',
        'info_dict': {
            'id': '54196191430',
            'ext': 'mp4',
            'title': 'md5:dfac39636969fe6bf1caa2d50405f069',
            'description': 'md5:390ab77358960235b6937ab3b8528956',
            'uploader_id': 'tatianamaslanydaily',
            'uploader_url': 'https://tatianamaslanydaily.tumblr.com/',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 127,
            'like_count': int,
            'repost_count': int,
            'age_limit': 0,
            'tags': ['Orphan Black', 'Tatiana Maslany', 'Interview', 'Video', 'OB S1 DVD Extras'],
        }
    }, {
        'note': 'multiple formats',
        'url': 'https://maskofthedragon.tumblr.com/post/626907179849564160/mona-talking-in-english',
        'md5': 'f43ff8a8861712b6cf0e0c2bd84cfc68',
        'info_dict': {
            'id': '626907179849564160',
            'ext': 'mp4',
            'title': 'Mona\xa0“talking” in\xa0“english”',
            'description': 'md5:082a3a621530cb786ad2b7592a6d9e2c',
            'uploader_id': 'maskofthedragon',
            'uploader_url': 'https://maskofthedragon.tumblr.com/',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 7,
            'like_count': int,
            'repost_count': int,
            'age_limit': 0,
            'tags': 'count:19',
        },
        'params': {
            'format': 'hd',
        },
    }, {
        'note': 'non-iframe video (with related posts)',
        'url': 'https://shieldfoss.tumblr.com/post/675519763813908480',
        'md5': '12bdb75661ef443bffe5a4dac1dbf118',
        'info_dict': {
            'id': '675519763813908480',
            'ext': 'mp4',
            'title': 'Shieldfoss',
            'uploader_id': 'nerviovago',
            'uploader_url': 'https://nerviovago.tumblr.com/',
            'thumbnail': r're:^https?://.*\.jpg',
            'like_count': int,
            'repost_count': int,
            'age_limit': 0,
            'tags': [],
        }
    }, {
        'note': 'dashboard only (original post)',
        'url': 'https://jujanon.tumblr.com/post/159704441298/my-baby-eating',
        'md5': '029f7c91ab386701b211e3d494d2d95e',
        'info_dict': {
            'id': '159704441298',
            'ext': 'mp4',
            'title': 'md5:ba79365861101f4911452728d2950561',
            'description': 'md5:773738196cea76b6996ec71e285bdabc',
            'uploader_id': 'jujanon',
            'uploader_url': 'https://jujanon.tumblr.com/',
            'thumbnail': r're:^https?://.*\.jpg',
            'like_count': int,
            'repost_count': int,
            'age_limit': 0,
            'tags': ['crabs', 'my video', 'my pets'],
        }
    }, {
        'note': 'dashboard only (reblog)',
        'url': 'https://bartlebyshop.tumblr.com/post/180294460076/duality-of-bird',
        'md5': '04334e7cadb1af680d162912559f51a5',
        'info_dict': {
            'id': '180294460076',
            'ext': 'mp4',
            'title': 'duality of bird',
            'description': 'duality of bird',
            'uploader_id': 'todaysbird',
            'uploader_url': 'https://todaysbird.tumblr.com/',
            'thumbnail': r're:^https?://.*\.jpg',
            'like_count': int,
            'repost_count': int,
            'age_limit': 0,
            'tags': [],
        }
    }, {
        'note': 'dashboard only (external)',
        'url': 'https://afloweroutofstone.tumblr.com/post/675661759168823296/the-blues-remembers-everything-the-country-forgot',
        'info_dict': {
            'id': 'q67_fd7b8SU',
            'ext': 'mp4',
            'title': 'The Blues Remembers Everything the Country Forgot',
            'alt_title': 'The Blues Remembers Everything the Country Forgot',
            'description': 'md5:1a6b4097e451216835a24c1023707c79',
            'release_date': '20201224',
            'creator': 'md5:c2239ba15430e87c3b971ba450773272',
            'uploader': 'Moor Mother - Topic',
            'upload_date': '20201223',
            'uploader_id': 'UCxrMtFBRkFvQJ_vVM4il08w',
            'uploader_url': 'http://www.youtube.com/channel/UCxrMtFBRkFvQJ_vVM4il08w',
            'thumbnail': r're:^https?://i.ytimg.com/.*',
            'channel': 'Moor Mother - Topic',
            'channel_id': 'UCxrMtFBRkFvQJ_vVM4il08w',
            'channel_url': 'https://www.youtube.com/channel/UCxrMtFBRkFvQJ_vVM4il08w',
            'channel_follower_count': int,
            'duration': 181,
            'view_count': int,
            'like_count': int,
            'age_limit': 0,
            'categories': ['Music'],
            'tags': 'count:7',
            'live_status': 'not_live',
            'playable_in_embed': True,
            'availability': 'public',
            'track': 'The Blues Remembers Everything the Country Forgot',
            'artist': 'md5:c2239ba15430e87c3b971ba450773272',
            'album': 'Brass',
            'release_year': 2020,
        },
        'add_ie': ['Youtube'],
    }, {
        'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching',
        'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab',
        'info_dict': {
            'id': 'Wmur',
            'ext': 'mp4',
            'title': 'naked smoking & stretching',
            'upload_date': '20150506',
            'timestamp': 1430931613,
            'age_limit': 18,
            'uploader_id': '1638622',
            'uploader': 'naked-yogi',
        },
        # 'add_ie': ['Vidme'],
        'skip': 'dead embedded video host'
    }, {
        'url': 'https://prozdvoices.tumblr.com/post/673201091169681408/what-recording-voice-acting-sounds-like',
        'md5': 'a0063fc8110e6c9afe44065b4ea68177',
        'info_dict': {
            'id': 'eomhW5MLGWA',
            'ext': 'mp4',
            'title': 'what recording voice acting sounds like',
            'description': 'md5:1da3faa22d0e0b1d8b50216c284ee798',
            'uploader': 'ProZD',
            'upload_date': '20220112',
            'uploader_id': 'ProZD',
            'uploader_url': 'http://www.youtube.com/user/ProZD',
            'thumbnail': r're:^https?://i.ytimg.com/.*',
            'channel': 'ProZD',
            'channel_id': 'UC6MFZAOHXlKK1FI7V0XQVeA',
            'channel_url': 'https://www.youtube.com/channel/UC6MFZAOHXlKK1FI7V0XQVeA',
            'channel_follower_count': int,
            'duration': 20,
            'view_count': int,
            'like_count': int,
            'age_limit': 0,
            'categories': ['Film & Animation'],
            'tags': [],
            'live_status': 'not_live',
            'playable_in_embed': True,
            'availability': 'public',
        },
        'add_ie': ['Youtube'],
    }, {
        'url': 'https://dominustempori.tumblr.com/post/673572712813297664/youtubes-all-right-for-some-pretty-cool',
        'md5': '203e9eb8077e3f45bfaeb4c86c1467b8',
        'info_dict': {
            'id': '87816359',
            'ext': 'mov',
            'title': 'Harold Ramis',
            'description': 'md5:be8e68cbf56ce0785c77f0c6c6dfaf2c',
            'uploader': 'Resolution Productions Group',
            'uploader_id': 'resolutionproductions',
            'uploader_url': 'https://vimeo.com/resolutionproductions',
            'upload_date': '20140227',
            'thumbnail': r're:^https?://i.vimeocdn.com/video/.*',
            'timestamp': 1393523719,
            'duration': 291,
        },
        'add_ie': ['Vimeo'],
    }, {
        'url': 'http://sutiblr.tumblr.com/post/139638707273',
        'md5': '2dd184b3669e049ba40563a7d423f95c',
        'info_dict': {
            'id': 'ir7qBEIKqvq',
            'ext': 'mp4',
            'title': 'Vine by sutiblr',
            'alt_title': 'Vine by sutiblr',
            'uploader': 'sutiblr',
            'uploader_id': '1198993975374495744',
            'upload_date': '20160220',
            'like_count': int,
            'comment_count': int,
            'repost_count': int,
            'thumbnail': r're:^https?://.*\.jpg',
            'timestamp': 1455940159,
            'view_count': int,
        },
        'add_ie': ['Vine'],
    }, {
        'url': 'https://silami.tumblr.com/post/84250043974/my-bad-river-flows-in-you-impression-on-maschine',
        'md5': '3c92d7c3d867f14ccbeefa2119022277',
        'info_dict': {
            'id': 'nYtvtTPuTl',
            'ext': 'mp4',
            'title': 'Video by silbulterman',
            'description': '#maschine',
            'uploader_id': '242859024',
            'thumbnail': r're:^https?://.*\.jpg',
            'timestamp': 1398801174,
            'like_count': int,
            'uploader': 'Sil',
            'channel': 'silbulterman',
            'comment_count': int,
            'upload_date': '20140429',
        },
        'add_ie': ['Instagram'],
    }]

    # Maps the API's video "provider" value to the extractor key handed to
    # url_result(); providers not listed here fall back to 'Generic'.
    _providers = {
        'instagram': 'Instagram',
        'vimeo': 'Vimeo',
        'vine': 'Vine',
        'youtube': 'Youtube',
    }

    # API bearer token scraped from the login page; stays None if scraping fails.
    _ACCESS_TOKEN = None

    def _initialize_pre_login(self):
        """Scrape the API token from the login page and store it in _ACCESS_TOKEN.

        Both the page download and the token search are non-fatal; when no token
        is obtained a warning is reported and extraction continues without it.
        """
        login_page = self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login page', fatal=False)
        if login_page:
            self._ACCESS_TOKEN = self._search_regex(
                r'"API_TOKEN":\s*"(\w+)"', login_page, 'API access token', fatal=False)
        if not self._ACCESS_TOKEN:
            self.report_warning('Failed to get access token; metadata will be missing and some videos may not work')

    def _perform_login(self, username, password):
        """Post the credentials to the OAuth token endpoint.

        Does nothing when no API token is available. The request is non-fatal
        and its JSON response is discarded.
        """
        if not self._ACCESS_TOKEN:
            return

        self._download_json(
            self._OAUTH_URL, None, 'Logging in',
            data=urlencode_postdata({
                'password': password,
                'grant_type': 'password',
                'username': username,
            }), headers={
                'Content-Type': 'application/x-www-form-urlencoded',
                'Authorization': f'Bearer {self._ACCESS_TOKEN}',
            },
            errnote='Login failed', fatal=False)

    def _real_extract(self, url):
        """Extract a Tumblr video post.

        Returns either an info dict for a Tumblr-hosted video, or a url_result()
        pointing at the external host when the post embeds a third-party video.

        Raises ExtractorError when the post page redirects to a login/safe-mode/
        dashboard URL and either no API token is available or the API response
        contains no video URL.
        """
        blog, video_id = self._match_valid_url(url).groups()

        # Normalise to the canonical post URL before fetching
        url = f'http://{blog}.tumblr.com/post/{video_id}/'
        webpage, urlh = self._download_webpage_handle(url, video_id)

        redirect_url = urlh.geturl()

        # Being redirected to one of these paths means the public page has no
        # usable content, so only the API can provide the video data
        api_only = bool(self._search_regex(
            r'(tumblr\.com|^)/(safe-mode|login_required|blog/view)',
            redirect_url, 'redirect', default=None))

        if api_only and not self._ACCESS_TOKEN:
            raise ExtractorError('Cannot get data for dashboard-only post without access token')

        post_json = {}
        if self._ACCESS_TOKEN:
            post_json = traverse_obj(
                self._download_json(
                    f'https://www.tumblr.com/api/v2/blog/{blog}/posts/{video_id}/permalink',
                    video_id, headers={'Authorization': f'Bearer {self._ACCESS_TOKEN}'}, fatal=False),
                ('response', 'timeline', 'elements', 0)) or {}
        # Prefer the first trail entry's content, falling back to the post's own content
        content_json = traverse_obj(post_json, ('trail', 0, 'content'), 'content') or []
        video_json = next(
            (item for item in content_json if item.get('type') == 'video'), {})
        media_json = video_json.get('media') or {}
        if api_only and not media_json.get('url') and not video_json.get('url'):
            raise ExtractorError('Failed to find video data for dashboard-only post')

        if not media_json.get('url') and video_json.get('url'):
            # external video host
            return self.url_result(
                video_json['url'],
                self._providers.get(video_json.get('provider'), 'Generic'))

        video_url = self._og_search_video_url(webpage, default=None)
        duration = None
        formats = []

        # iframes can supply duration and sometimes additional formats, so check for one
        iframe_url = self._search_regex(
            fr'src=\'(https?://www\.tumblr\.com/video/{blog}/{video_id}/[^\']+)\'',
            webpage, 'iframe url', default=None)
        if iframe_url:
            iframe = self._download_webpage(
                iframe_url, video_id, 'Downloading iframe page',
                headers={'Referer': redirect_url})

            # Only parse the options attribute when it is actually present;
            # parsing an empty string would report a pointless JSON warning
            options_json = self._search_regex(
                r'data-crt-options=(["\'])(?P<options>.+?)\1', iframe,
                'hd video url', default=None, group='options')
            options = self._parse_json(options_json, video_id, fatal=False) if options_json else None
            if options:
                duration = int_or_none(options.get('duration'))

                hd_url = options.get('hdUrl')
                if hd_url:
                    # there are multiple formats; extract them
                    # ignore other sources of width/height data as they may be wrong
                    sources = []
                    sd_url = self._search_regex(
                        r'<source[^>]+src=(["\'])(?P<url>.+?)\1', iframe,
                        'sd video url', default=None, group='url')
                    if sd_url:
                        sources.append((sd_url, 'sd'))
                    sources.append((hd_url, 'hd'))

                    # The position in `sources` doubles as the quality rank (sd < hd)
                    formats = [{
                        'url': src_url,
                        'format_id': format_id,
                        'height': int_or_none(self._search_regex(
                            r'_(\d+)\.\w+$', src_url, 'height', default=None)),
                        'quality': quality,
                    } for quality, (src_url, format_id) in enumerate(sources)]

        if not media_json.get('url') and not video_url and not iframe_url:
            # external video host (but we weren't able to figure it out from the api)
            iframe_url = self._search_regex(
                r'src=["\'](https?://safe\.txmblr\.com/svc/embed/inline/[^"\']+)["\']',
                webpage, 'embed iframe url', default=None)
            return self.url_result(iframe_url or redirect_url, 'Generic')

        formats = formats or [{
            'url': media_json.get('url') or video_url,
            'width': int_or_none(
                media_json.get('width') or self._og_search_property('video:width', webpage, default=None)),
            'height': int_or_none(
                media_json.get('height') or self._og_search_property('video:height', webpage, default=None)),
        }]
        self._sort_formats(formats)

        # the url we're extracting from might be an original post or it might be a reblog.
        # if it's a reblog, og:description will be the reblogger's comment, not the uploader's.
        # content_json is always the op, so if it exists but has no text, there's no description
        if content_json:
            text_blocks = (
                item.get('text') for item in content_json if item.get('type') == 'text')
            # Skip blocks without a string 'text' value so join() cannot raise TypeError
            description = '\n\n'.join(
                text for text in text_blocks if isinstance(text, str)) or None
        else:
            description = self._og_search_description(webpage, default=None)
        uploader_id = traverse_obj(post_json, 'reblogged_root_name', 'blog_name')

        return {
            'id': video_id,
            'title': post_json.get('summary') or (blog if api_only else self._html_search_regex(
                r'(?s)<title>(?P<title>.*?)(?: \| Tumblr)?</title>', webpage, 'title')),
            'description': description,
            'thumbnail': (traverse_obj(video_json, ('poster', 0, 'url'))
                          or self._og_search_thumbnail(webpage, default=None)),
            'uploader_id': uploader_id,
            'uploader_url': f'https://{uploader_id}.tumblr.com/' if uploader_id else None,
            'duration': duration,
            'like_count': post_json.get('like_count'),
            'repost_count': post_json.get('reblog_count'),
            # is_nsfw missing (or not a bool) leaves age_limit unset
            'age_limit': {True: 18, False: 0}.get(post_json.get('is_nsfw')),
            'tags': post_json.get('tags'),
            'formats': formats,
        }