]>
Commit | Line | Data |
---|---|---|
9933b574 PH |
1 | from __future__ import unicode_literals |
2 | ||
125cfd78 | 3 | import os |
4 | import re | |
5 | ||
6 | from .common import InfoExtractor | |
1cc79574 | 7 | from ..compat import ( |
605cbef6 S |
8 | compat_urllib_parse_unquote, |
9 | compat_urllib_parse_unquote_plus, | |
125cfd78 | 10 | compat_urllib_parse_urlparse, |
11 | compat_urllib_request, | |
1cc79574 PH |
12 | ) |
13 | from ..utils import ( | |
50789175 | 14 | ExtractorError, |
0320ddc1 | 15 | str_to_int, |
125cfd78 | 16 | ) |
17 | from ..aes import ( | |
18 | aes_decrypt_text | |
19 | ) | |
20 | ||
9933b574 | 21 | |
class PornHubIE(InfoExtractor):
    """Information extractor for single PornHub videos (watch pages and embeds)."""

    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'
    _TESTS = [{
        'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
        'md5': '882f488fa1f0026f023f33576004a2ed',
        'info_dict': {
            'id': '648719015',
            'ext': 'mp4',
            "uploader": "Babes",
            "title": "Seductive Indian beauty strips down and fingers her pink pussy",
            "age_limit": 18
        }
    }, {
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
        'only_matching': True,
    }]

    @classmethod
    def _extract_url(cls, webpage):
        """Return the URL of the first PornHub embed iframe in webpage, or None.

        The returned URL may be protocol-relative (starting with "//"),
        because the scheme is optional in the pattern.
        """
        # The id character class mirrors _VALID_URL ([0-9a-z]+): video keys are
        # not always purely numeric (see the second entry of _TESTS).
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/[\da-z]+)\1', webpage)
        if mobj:
            return mobj.group('url')

    def _extract_count(self, pattern, webpage, name):
        """Search webpage for pattern and convert the match with str_to_int.

        The search is made with fatal=False, so a page lacking the counter is
        not treated as an extraction failure.  name is only used to label the
        field ('<name> count') for the search helper.
        """
        return str_to_int(self._search_regex(
            pattern, webpage, '%s count' % name, fatal=False))

    def _real_extract(self, url):
        """Download the watch page for url and build the video info dict.

        Raises ExtractorError (expected=True) when the page contains a
        "userMessageSection" block, using that block's text as the message.
        """
        video_id = self._match_id(url)

        # Whatever URL form matched, fetch the view_video.php page for the id
        # and send the age_verified=1 cookie with the request.
        req = compat_urllib_request.Request(
            'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        error_msg = self._html_search_regex(
            r'(?s)<div class="userMessageSection[^"]*".*?>(.*?)</div>',
            webpage, 'error message', default=None)
        if error_msg:
            # Collapse runs of whitespace so the message fits on one line.
            error_msg = re.sub(r'\s+', ' ', error_msg)
            raise ExtractorError(
                'PornHub said: %s' % error_msg,
                expected=True, video_id=video_id)

        video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
        video_uploader = self._html_search_regex(
            r'(?s)From: .+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
            webpage, 'uploader', fatal=False)
        thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
        if thumbnail:
            thumbnail = compat_urllib_parse_unquote(thumbnail)

        view_count = self._extract_count(
            r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
        like_count = self._extract_count(
            r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
        dislike_count = self._extract_count(
            r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
        comment_count = self._extract_count(
            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')

        video_urls = [
            compat_urllib_parse_unquote(quoted_url)
            for quoted_url in re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)]
        if '"encrypted":true' in webpage:
            # When the page flags the URLs as encrypted, they are decrypted
            # with the (URL-unquoted) "video_title" value as the password.
            password = compat_urllib_parse_unquote_plus(
                self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
            video_urls = [
                aes_decrypt_text(encrypted_url, password, 32).decode('utf-8')
                for encrypted_url in video_urls]

        formats = []
        for index, video_url in enumerate(video_urls):
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]

            # The format label is built from the first two "_"-separated
            # pieces of the sixth "/"-separated path component.  If the path
            # is shorter than that, fall back to the positional index instead
            # of failing the whole extraction with an IndexError.
            segments = path.split('/')
            if len(segments) > 5:
                format_id = "-".join(segments[5].split('_')[:2])
            else:
                format_id = '%d' % index

            # Labels shaped like "<height>P-<tbr>K" also yield numeric fields.
            m = re.match(r'^(?P<height>[0-9]+)P-(?P<tbr>[0-9]+)K$', format_id)
            if m is None:
                height = None
                tbr = None
            else:
                height = int(m.group('height'))
                tbr = int(m.group('tbr'))

            formats.append({
                'url': video_url,
                'ext': extension,
                'format': format_id,
                'format_id': format_id,
                'tbr': tbr,
                'height': height,
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'title': video_title,
            'thumbnail': thumbnail,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'comment_count': comment_count,
            'formats': formats,
            'age_limit': 18,
        }
e66e1a00 S |
127 | |
128 | ||
class PornHubPlaylistIE(InfoExtractor):
    """Information extractor for PornHub playlist pages."""

    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.pornhub.com/playlist/6201671',
        'info_dict': {
            'id': '6201671',
            'title': 'P0p4',
        },
        'playlist_mincount': 35,
    }]

    def _real_extract(self, url):
        """Download the playlist page and return a playlist of video URL results.

        Each distinct view_video.php link found on the page becomes one
        entry handled by the 'PornHub' extractor.  Title and description are
        read from the page's "playlistObject" JSON.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        # Raw string: the pattern contains regex escapes (\. \? \d).  The
        # viewkey class accepts lowercase letters as well as digits, since
        # keys are not always numeric (e.g. "ph557bbb6676d2d").
        video_paths = re.findall(
            r'href="/?(view_video\.php\?viewkey=[\da-z]+[^"]*)"', webpage)

        # Drop duplicate links while keeping the order in which they appear
        # on the page, so the resulting playlist order is deterministic.
        seen = set()
        entries = []
        for video_path in video_paths:
            if video_path in seen:
                continue
            seen.add(video_path)
            entries.append(self.url_result(
                'http://www.pornhub.com/%s' % video_path, 'PornHub'))

        playlist = self._parse_json(
            self._search_regex(
                r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'),
            playlist_id)

        return self.playlist_result(
            entries, playlist_id, playlist.get('title'), playlist.get('description'))
156 | entries, playlist_id, playlist.get('title'), playlist.get('description')) |