]>
Commit | Line | Data |
---|---|---|
35409e11 PH |
1 | from __future__ import unicode_literals |
2 | ||
cb10cded PH |
3 | import re |
4 | ||
5 | from .common import InfoExtractor | |
d852c6bc | 6 | from ..compat import compat_str |
cb10cded | 7 | from ..utils import ( |
85552042 | 8 | clean_html, |
fea92aa6 | 9 | determine_ext, |
065c4b27 | 10 | dict_get, |
103f8c8d | 11 | ExtractorError, |
ccb079ee | 12 | int_or_none, |
51378d35 | 13 | parse_duration, |
fea92aa6 | 14 | try_get, |
44731e30 | 15 | unified_strdate, |
cb10cded PH |
16 | ) |
17 | ||
18 | ||
class XHamsterIE(InfoExtractor):
    """Extractor for xhamster.com video pages.

    Handles both URL schemas matched by ``_VALID_URL``: the legacy
    ``/movies/<id>/<display_id>.html`` form and the newer
    ``/videos/<display_id>-<id>`` form, on any subdomain.
    """

    _VALID_URL = r'''(?x)
                    https?://
                        (?:.+?\.)?xhamster\.com/
                        (?:
                            movies/(?P<id>\d+)/(?P<display_id>[^/]*)\.html|
                            videos/(?P<display_id_2>[^/]*)-(?P<id_2>\d+)
                        )
                    '''

    _TESTS = [{
        'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
        'md5': '8281348b8d3c53d39fffb377d24eac4e',
        'info_dict': {
            'id': '1509445',
            'display_id': 'femaleagent_shy_beauty_takes_the_bait',
            'ext': 'mp4',
            'title': 'FemaleAgent Shy beauty takes the bait',
            'timestamp': 1350194821,
            'upload_date': '20121014',
            'uploader': 'Ruseful2011',
            'duration': 893,
            'age_limit': 18,
            'categories': ['Fake Hub', 'Amateur', 'MILFs', 'POV', 'Beauti', 'Beauties', 'Beautiful', 'Boss', 'Office', 'Oral', 'Reality', 'Sexy', 'Taking'],
        },
    }, {
        'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
        'info_dict': {
            'id': '2221348',
            'display_id': 'britney_spears_sexy_booty',
            'ext': 'mp4',
            'title': 'Britney Spears Sexy Booty',
            'timestamp': 1379123460,
            'upload_date': '20130914',
            'uploader': 'jojo747400',
            'duration': 200,
            'age_limit': 18,
            'categories': ['Britney Spears', 'Celebrities', 'HD Videos', 'Sexy', 'Sexy Booty'],
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # empty seo
        'url': 'http://xhamster.com/movies/5667973/.html',
        'info_dict': {
            'id': '5667973',
            'ext': 'mp4',
            'title': '....',
            'timestamp': 1454948101,
            'upload_date': '20160208',
            'uploader': 'parejafree',
            'duration': 72,
            'age_limit': 18,
            'categories': ['Amateur', 'Blowjobs'],
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # mobile site
        'url': 'https://m.xhamster.com/videos/cute-teen-jacqueline-solo-masturbation-8559111',
        'only_matching': True,
    }, {
        'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html',
        'only_matching': True,
    }, {
        # This video is visible for marcoalfa123456's friends only
        'url': 'https://it.xhamster.com/movies/7263980/la_mia_vicina.html',
        'only_matching': True,
    }, {
        # new URL schema
        'url': 'https://pt.xhamster.com/videos/euro-pedal-pumping-7937821',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for a single video page.

        The page is first searched for a ``window.initials`` JSON blob; when
        one is found, all metadata and formats are taken from its
        ``videoModel`` entry. Otherwise the method falls back to scraping the
        old HTML layout with regular expressions.

        Raises ExtractorError (expected) when the page contains a
        ``videoClosed`` element, using that element's text as the message.
        """
        mobj = re.match(self._VALID_URL, url)
        # Exactly one of the two URL alternatives matched, so take whichever
        # pair of groups is populated.
        video_id = mobj.group('id') or mobj.group('id_2')
        display_id = mobj.group('display_id') or mobj.group('display_id_2')

        # Drop an "m." host label so the non-mobile page is requested.
        desktop_url = re.sub(r'^(https?://(?:.+?\.)?)m\.', r'\1', url)
        webpage = self._download_webpage(desktop_url, video_id)

        error = self._html_search_regex(
            r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>',
            webpage, 'error', default=None)
        if error:
            raise ExtractorError(error, expected=True)

        age_limit = self._rta_search(webpage)

        def get_height(s):
            # Quality labels look like "720p"; anything else yields None.
            return int_or_none(self._search_regex(
                r'^(\d+)[pP]', s, 'height', default=None))

        initials = self._parse_json(
            self._search_regex(
                r'window\.initials\s*=\s*({.+?})\s*;\s*\n', webpage, 'initials',
                default='{}'),
            video_id, fatal=False)
        if initials:
            video = initials['videoModel']
            title = video['title']
            formats = []
            for format_id, formats_dict in video['sources'].items():
                if not isinstance(formats_dict, dict):
                    continue
                if format_id == 'download':
                    # Download link takes some time to be generated,
                    # skipping for now
                    continue
                for quality, format_url in formats_dict.items():
                    if not isinstance(format_url, compat_str):
                        continue
                    formats.append({
                        'format_id': '%s-%s' % (format_id, quality),
                        'url': format_url,
                        'ext': determine_ext(format_url, 'mp4'),
                        'height': get_height(quality),
                    })
            self._sort_formats(formats)

            categories_list = video.get('categories')
            if isinstance(categories_list, list):
                # Keep only well-formed entries that carry a string name.
                categories = [
                    c['name'] for c in categories_list
                    if isinstance(c, dict)
                    and isinstance(c.get('name'), compat_str)]
            else:
                categories = None

            return {
                'id': video_id,
                'display_id': display_id,
                'title': title,
                'description': video.get('description'),
                'timestamp': int_or_none(video.get('created')),
                'uploader': try_get(
                    video, lambda x: x['author']['name'], compat_str),
                'thumbnail': video.get('thumbURL'),
                'duration': int_or_none(video.get('duration')),
                'view_count': int_or_none(video.get('views')),
                'like_count': int_or_none(try_get(
                    video, lambda x: x['rating']['likes'], int)),
                'dislike_count': int_or_none(try_get(
                    video, lambda x: x['rating']['dislikes'], int)),
                # Previously this reused the 'views' value, which reported
                # the view count as the comment count.
                # NOTE(review): the 'comments' key name is presumed - confirm
                # against the page JSON; a missing key simply yields None.
                'comment_count': int_or_none(video.get('comments')),
                'age_limit': age_limit,
                'categories': categories,
                'formats': formats,
            }

        # Old layout fallback

        title = self._html_search_regex(
            [r'<h1[^>]*>([^<]+)</h1>',
             r'<meta[^>]+itemprop=".*?caption.*?"[^>]+content="(.+?)"',
             r'<title[^>]*>(.+?)(?:,\s*[^,]*?\s*Porn\s*[^,]*?:\s*xHamster[^<]*| - xHamster\.com)</title>'],
            webpage, 'title')

        formats = []
        format_urls = set()

        # The parse is non-fatal, so guard against it not producing a dict
        # before iterating.
        sources = self._parse_json(
            self._search_regex(
                r'sources\s*:\s*({.+?})\s*,?\s*\n', webpage, 'sources',
                default='{}'),
            video_id, fatal=False) or {}
        for format_id, format_url in sources.items():
            if not isinstance(format_url, compat_str):
                continue
            if format_url in format_urls:
                continue
            format_urls.add(format_url)
            formats.append({
                'format_id': format_id,
                'url': format_url,
                'height': get_height(format_id),
            })

        video_url = self._search_regex(
            [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''',
             r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''',
             r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''],
            webpage, 'video url', group='mp4', default=None)
        # Only add the standalone URL when it was not already listed above.
        if video_url and video_url not in format_urls:
            formats.append({
                'url': video_url,
            })

        self._sort_formats(formats)

        # Only a few videos have a description
        mobj = re.search(r'<span>Description: </span>([^<]+)', webpage)
        description = mobj.group(1) if mobj else None

        upload_date = unified_strdate(self._search_regex(
            r'hint=["\'](\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}',
            webpage, 'upload date', fatal=False))

        uploader = self._html_search_regex(
            r'<span[^>]+itemprop=["\']author[^>]+><a[^>]+><span[^>]+>([^<]+)',
            webpage, 'uploader', default='anonymous')

        thumbnail = self._search_regex(
            [r'''["']thumbUrl["']\s*:\s*(?P<q>["'])(?P<thumbnail>.+?)(?P=q)''',
             r'''<video[^>]+"poster"=(?P<q>["'])(?P<thumbnail>.+?)(?P=q)[^>]*>'''],
            webpage, 'thumbnail', fatal=False, group='thumbnail')

        duration = parse_duration(self._search_regex(
            [r'<[^<]+\bitemprop=["\']duration["\'][^<]+\bcontent=["\'](.+?)["\']',
             r'Runtime:\s*</span>\s*([\d:]+)'], webpage,
            'duration', fatal=False))

        view_count = int_or_none(self._search_regex(
            r'content=["\']User(?:View|Play)s:(\d+)',
            webpage, 'view count', fatal=False))

        mobj = re.search(r'hint=[\'"](?P<likecount>\d+) Likes / (?P<dislikecount>\d+) Dislikes', webpage)
        (like_count, dislike_count) = (mobj.group('likecount'), mobj.group('dislikecount')) if mobj else (None, None)

        mobj = re.search(r'</label>Comments \((?P<commentcount>\d+)\)</div>', webpage)
        comment_count = mobj.group('commentcount') if mobj else 0

        categories_html = self._search_regex(
            r'(?s)<table.+?(<span>Categories:.+?)</table>', webpage,
            'categories', default=None)
        categories = [clean_html(category) for category in re.findall(
            r'<a[^>]+>(.+?)</a>', categories_html)] if categories_html else None

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': description,
            'upload_date': upload_date,
            'uploader': uploader,
            'thumbnail': thumbnail,
            'duration': duration,
            'view_count': view_count,
            'like_count': int_or_none(like_count),
            'dislike_count': int_or_none(dislike_count),
            'comment_count': int_or_none(comment_count),
            'age_limit': age_limit,
            'categories': categories,
            'formats': formats,
        }
0bbba43e S |
279 | |
280 | ||
class XHamsterEmbedIE(InfoExtractor):
    """Extractor for xhamster.com ``xembed.php`` player pages.

    It does not extract media itself: it locates the URL of the regular
    video page and hands it over to the 'XHamster' extractor.
    """

    _VALID_URL = r'https?://(?:.+?\.)?xhamster\.com/xembed\.php\?video=(?P<id>\d+)'
    _TEST = {
        'url': 'http://xhamster.com/xembed.php?video=3328539',
        'info_dict': {
            'id': '3328539',
            'ext': 'mp4',
            'title': 'Pen Masturbation',
            'timestamp': 1406581861,
            'upload_date': '20140728',
            'uploader': 'ManyakisArt',
            'duration': 5,
            'age_limit': 18,
        }
    }

    @staticmethod
    def _extract_urls(webpage):
        """Return the embed URLs of every matching <iframe> in *webpage*."""
        iframe_re = r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1'
        return [match.group('url') for match in re.finditer(iframe_re, webpage)]

    def _real_extract(self, url):
        """Resolve the embed page to the full video page URL and delegate."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # First choice: a direct link to the video page for this id.
        page_link_re = r'href="(https?://xhamster\.com/(?:movies/{0}/[^"]*\.html|videos/[^/]*-{0})[^"]*)"'.format(video_id)
        video_url = self._search_regex(
            page_link_re, webpage, 'xhamster url', default=None)

        if not video_url:
            # Otherwise take the first available link from the "vars" JSON
            # object found in the page.
            vars_json = self._search_regex(
                r'vars\s*:\s*({.+?})\s*,\s*\n', webpage, 'vars')
            player_vars = self._parse_json(vars_json, video_id)
            link_keys = ('downloadLink', 'homepageLink', 'commentsLink', 'shareUrl')
            video_url = dict_get(player_vars, link_keys)

        return self.url_result(video_url, 'XHamster')