]>
jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/tumblr.py
2 from __future__
import unicode_literals
5 from .common
import InfoExtractor
13 class TumblrIE(InfoExtractor
):
14 _VALID_URL
= r
'https?://(?P<blog_name>[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])'
15 _NETRC_MACHINE
= 'tumblr'
16 _LOGIN_URL
= 'https://www.tumblr.com/login'
18 'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes',
19 'md5': '479bb068e5b16462f5176a6828829767',
23 'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...',
24 'description': 'md5:390ab77358960235b6937ab3b8528956',
25 'thumbnail': r
're:^https?://.*\.jpg',
29 'url': 'https://maskofthedragon.tumblr.com/post/626907179849564160/mona-talking-in-english',
30 'md5': 'f43ff8a8861712b6cf0e0c2bd84cfc68',
32 'id': '626907179849564160',
34 'title': 'Me roast is buggered!, Mona\xa0“talking” in\xa0“english”',
35 'description': 'md5:082a3a621530cb786ad2b7592a6d9e2c',
36 'thumbnail': r
're:^https?://.*\.jpg',
43 'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching',
44 'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab',
48 'title': 'naked smoking & stretching',
49 'upload_date': '20150506',
50 'timestamp': 1430931613,
52 'uploader_id': '1638622',
53 'uploader': 'naked-yogi',
55 # 'add_ie': ['Vidme'],
56 'skip': 'dead embedded video host'
58 'url': 'https://dominustempori.tumblr.com/post/673572712813297664/youtubes-all-right-for-some-pretty-cool',
59 'md5': '5e45724c70b748f64f5a1731ac72c84a',
63 'title': 'Harold Ramis',
64 'uploader': 'Resolution Productions Group',
65 'uploader_id': 'resolutionproductions',
66 'uploader_url': 'https://vimeo.com/resolutionproductions',
67 'thumbnail': r
're:^https?://i.vimeocdn.com/video/.*',
72 'url': 'http://sutiblr.tumblr.com/post/139638707273',
73 'md5': '2dd184b3669e049ba40563a7d423f95c',
77 'title': 'Vine by sutiblr',
78 'alt_title': 'Vine by sutiblr',
79 'uploader': 'sutiblr',
80 'uploader_id': '1198993975374495744',
81 'upload_date': '20160220',
85 'thumbnail': r
're:^https?://.*\.jpg',
86 'timestamp': 1455940159,
91 'url': 'https://silami.tumblr.com/post/84250043974/my-bad-river-flows-in-you-impression-on-maschine',
92 'md5': '3c92d7c3d867f14ccbeefa2119022277',
96 'title': 'Video by silbulterman',
97 'description': '#maschine',
98 'uploader_id': '242859024',
99 'thumbnail': r
're:^https?://.*\.jpg',
100 'timestamp': 1398801174,
103 'channel': 'silbulterman',
104 'comment_count': int,
105 'upload_date': '20140429',
107 'add_ie': ['Instagram'],
110 def _real_initialize(self
):
114 username
, password
= self
._get
_login
_info
()
118 login_page
= self
._download
_webpage
(
119 self
._LOGIN
_URL
, None, 'Downloading login page')
121 login_form
= self
._hidden
_inputs
(login_page
)
123 'user[email]': username
,
124 'user[password]': password
127 response
, urlh
= self
._download
_webpage
_handle
(
128 self
._LOGIN
_URL
, None, 'Logging in',
129 data
=urlencode_postdata(login_form
), headers
={
130 'Content-Type': 'application/x-www-form-urlencoded',
131 'Referer': self
._LOGIN
_URL
,
135 if '/dashboard' in urlh
.geturl():
138 login_errors
= self
._parse
_json
(
140 r
'RegistrationForm\.errors\s*=\s*(\[.+?\])\s*;', response
,
141 'login errors', default
='[]'),
144 raise ExtractorError(
145 'Unable to login: %s' % login_errors
[0], expected
=True)
147 self
.report_warning('Login has probably failed')
149 def _real_extract(self
, url
):
150 m_url
= self
._match
_valid
_url
(url
)
151 video_id
= m_url
.group('id')
152 blog
= m_url
.group('blog_name')
154 url
= 'http://%s.tumblr.com/post/%s/' % (blog
, video_id
)
155 webpage
, urlh
= self
._download
_webpage
_handle
(url
, video_id
)
157 redirect_url
= urlh
.geturl()
158 if 'tumblr.com/safe-mode' in redirect_url
or redirect_url
.startswith('/safe-mode'):
159 raise ExtractorError(
160 'This Tumblr may contain sensitive media. '
161 'Disable safe mode in your account settings '
162 'at https://www.tumblr.com/settings/account#safe_mode',
165 iframe_url
= self
._search
_regex
(
166 r
'src=\'(https?
://www\
.tumblr\
.com
/video
/[^
\']+)\'',
167 webpage, 'iframe url
', default=None)
168 if iframe_url is None:
169 iframe_url = self._search_regex(
170 r'src
=["\'](https?://safe\.txmblr\.com/svc/embed/inline/[^"\']+)["\']',
171 webpage, 'embed iframe url', default=None)
172 return self.url_result(iframe_url or redirect_url, 'Generic')
174 iframe = self._download_webpage(
175 iframe_url, video_id, 'Downloading iframe page',
176 headers={'Referer': redirect_url})
181 sd_url = self._search_regex(
182 r'<source[^>]+src=(["\'])(?P
<url
>.+?
)\
1', iframe,
183 'sd video url
', default=None, group='url
')
185 sources.append((sd_url, 'sd
'))
187 options = self._parse_json(
189 r'data
-crt
-options
=(["\'])(?P<options>.+?)\1', iframe,
190 'hd video url', default='', group='options'),
191 video_id, fatal=False)
193 duration = int_or_none(options.get('duration'))
194 hd_url = options.get('hdUrl')
196 sources.append((hd_url, 'hd'))
201 'format_id': format_id,
202 'height': int_or_none(self._search_regex(
203 r'/(\d{3,4})$', video_url, 'height', default=None)),
205 } for quality, (video_url, format_id) in enumerate(sources)]
207 self._sort_formats(formats)
209 # The only place where you can get a title, it's not complete,
210 # but searching in other places doesn't work for all videos
211 video_title = self._html_search_regex(
212 r'(?s)<title>(?P<title>.*?)(?: \| Tumblr)?</title>',
217 'title': video_title,
218 'description': self._og_search_description(webpage, default=None),
219 'thumbnail': self._og_search_thumbnail(webpage, default=None),
220 'duration': duration,