]>
Commit | Line | Data |
---|---|---|
35409e11 PH |
1 | from __future__ import unicode_literals |
2 | ||
cb10cded PH |
3 | import re |
4 | ||
5 | from .common import InfoExtractor | |
6 | from ..utils import ( | |
cb10cded | 7 | ExtractorError, |
ccb079ee S |
8 | unified_strdate, |
9 | str_to_int, | |
10 | int_or_none, | |
11 | parse_duration, | |
cb10cded PH |
12 | ) |
13 | ||
14 | ||
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster movie pages."""

    _VALID_URL = r'http://(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
    _TESTS = [
        {
            'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
            'md5': '8281348b8d3c53d39fffb377d24eac4e',
            'info_dict': {
                'id': '1509445',
                'ext': 'mp4',
                'title': 'FemaleAgent Shy beauty takes the bait',
                'upload_date': '20121014',
                'uploader_id': 'Ruseful2011',
                'duration': 893,
                'age_limit': 18,
            }
        },
        {
            'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
            'md5': '4cbd8d56708ecb4fb4124c23e4acb81a',
            'info_dict': {
                'id': '2221348',
                'ext': 'mp4',
                'title': 'Britney Spears Sexy Booty',
                'upload_date': '20130914',
                'uploader_id': 'jojo747400',
                'duration': 200,
                'age_limit': 18,
            }
        }
    ]

    def _real_extract(self, url):
        """Extract metadata and formats for the movie page at *url*.

        The page is fetched without any query string; if that page is not
        marked as HD, the same page is fetched again with ``?hd`` appended
        and, when that variant is marked HD, its media URL is added as a
        second, higher-preference format.

        Returns the info dict for the video. Raises ExtractorError when a
        downloaded page contains no ``<video ... file="...">`` tag.
        """
        def extract_video_url(webpage):
            # The media URL is the `file` attribute of the <video> tag.
            mp4 = re.search(r'<video\s+.*?file="([^"]+)".*?>', webpage)
            if mp4 is None:
                raise ExtractorError('Unable to extract media URL')
            return mp4.group(1)

        def is_hd(webpage):
            # The page is treated as HD when it carries the HD icon markup.
            return '<div class=\'icon iconHD\'' in webpage

        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        seo = mobj.group('seo')
        # Rebuild the URL from its parts so any query string (e.g. "?hd")
        # present on the incoming URL is dropped for the first download.
        mrss_url = 'http://xhamster.com/movies/%s/%s.html' % (video_id, seo)
        webpage = self._download_webpage(mrss_url, video_id)

        title = self._html_search_regex(
            r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, 'title')

        # Only a few videos have a description
        mobj = re.search(r'<span>Description: </span>([^<]+)', webpage)
        description = mobj.group(1) if mobj else None

        upload_date = self._html_search_regex(
            r'hint=\'(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}\'',
            webpage, 'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        uploader_id = self._html_search_regex(
            r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, 'uploader id', default='anonymous')

        thumbnail = self._html_search_regex(
            r'<video\s+.*?poster="([^"]+)".*?>', webpage, 'thumbnail', fatal=False)

        duration = parse_duration(self._html_search_regex(
            r'<span>Runtime:</span> (\d+:\d+)</div>',
            webpage, 'duration', fatal=False))

        view_count = self._html_search_regex(
            r'<span>Views:</span> ([^<]+)</div>', webpage, 'view count', fatal=False)
        if view_count:
            view_count = str_to_int(view_count)

        mobj = re.search(
            r"hint='(?P<likecount>\d+) Likes / (?P<dislikecount>\d+) Dislikes'", webpage)
        (like_count, dislike_count) = (
            (mobj.group('likecount'), mobj.group('dislikecount'))
            if mobj else (None, None))

        mobj = re.search(r'</label>Comments \((?P<commentcount>\d+)\)</div>', webpage)
        # An unmatched pattern means the count is unknown, not zero; use None
        # so it is reported the same way as the like/dislike counts above.
        comment_count = mobj.group('commentcount') if mobj else None

        age_limit = self._rta_search(webpage)

        hd = is_hd(webpage)

        video_url = extract_video_url(webpage)
        formats = [{
            'url': video_url,
            'format_id': 'hd' if hd else 'sd',
            'preference': 1,
        }]

        if not hd:
            # The default page was not HD: probe the "?hd" variant of the page.
            webpage = self._download_webpage(
                mrss_url + '?hd', video_id, note='Downloading HD webpage')
            if is_hd(webpage):
                video_url = extract_video_url(webpage)
                formats.append({
                    'url': video_url,
                    'format_id': 'hd',
                    'preference': 2,
                })

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'upload_date': upload_date,
            'uploader_id': uploader_id,
            'thumbnail': thumbnail,
            'duration': duration,
            'view_count': view_count,
            'like_count': int_or_none(like_count),
            'dislike_count': int_or_none(dislike_count),
            'comment_count': int_or_none(comment_count),
            'age_limit': age_limit,
            'formats': formats,
        }