]>
Commit | Line | Data |
---|---|---|
9933b574 PH |
1 | from __future__ import unicode_literals |
2 | ||
125cfd78 | 3 | import os |
4 | import re | |
5 | ||
6 | from .common import InfoExtractor | |
1cc79574 | 7 | from ..compat import ( |
605cbef6 S |
8 | compat_urllib_parse_unquote, |
9 | compat_urllib_parse_unquote_plus, | |
125cfd78 | 10 | compat_urllib_parse_urlparse, |
1cc79574 PH |
11 | ) |
12 | from ..utils import ( | |
50789175 | 13 | ExtractorError, |
5c2266df | 14 | sanitized_Request, |
0320ddc1 | 15 | str_to_int, |
125cfd78 | 16 | ) |
17 | from ..aes import ( | |
18 | aes_decrypt_text | |
19 | ) | |
20 | ||
9933b574 | 21 | |
class PornHubIE(InfoExtractor):
    """Extractor for single pornhub.com videos (watch pages and embeds)."""

    _VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'
    _TESTS = [{
        'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
        'md5': '882f488fa1f0026f023f33576004a2ed',
        'info_dict': {
            'id': '648719015',
            'ext': 'mp4',
            "uploader": "Babes",
            "title": "Seductive Indian beauty strips down and fingers her pink pussy",
            "age_limit": 18
        }
    }, {
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
        'only_matching': True,
    }, {
        'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
        'only_matching': True,
    }]

    @classmethod
    def _extract_url(cls, webpage):
        """Return the URL of the first PornHub embed iframe in webpage, or None.

        The id part accepts the same alphabet as _VALID_URL ([0-9a-z]) so
        that embeds of 'ph...'-style ids are found as well as numeric ones.
        """
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/[\da-z]+)\1',
            webpage)
        return mobj.group('url') if mobj else None

    @staticmethod
    def _format_id_from_path(path):
        """Derive a format id from the URL path of a media file.

        Takes the '/'-separated piece at index 5, keeps its first two
        '_'-separated tokens and joins them with '-'.  Returns None when the
        path has fewer than six pieces instead of raising IndexError.
        """
        pieces = path.split('/')
        if len(pieces) <= 5:
            return None
        return '-'.join(pieces[5].split('_')[:2])

    def _extract_count(self, pattern, webpage, name):
        """Search webpage for pattern and convert the captured text via str_to_int.

        The search is non-fatal, so a missing counter does not abort extraction.
        """
        return str_to_int(self._search_regex(
            pattern, webpage, '%s count' % name, fatal=False))

    def _real_extract(self, url):
        """Download the watch page for url and build the video info dict.

        Raises ExtractorError (expected=True) when the page contains a
        'userMessageSection' block, relaying its text.
        """
        video_id = self._match_id(url)

        # Always fetch the canonical watch page, whatever URL form was given.
        req = sanitized_Request(
            'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        error_msg = self._html_search_regex(
            r'(?s)<div class="userMessageSection[^"]*".*?>(.*?)</div>',
            webpage, 'error message', default=None)
        if error_msg:
            # Collapse whitespace runs so the message fits on one line.
            error_msg = re.sub(r'\s+', ' ', error_msg)
            raise ExtractorError(
                'PornHub said: %s' % error_msg,
                expected=True, video_id=video_id)

        video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
        video_uploader = self._html_search_regex(
            r'(?s)From: .+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
            webpage, 'uploader', fatal=False)
        thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
        if thumbnail:
            thumbnail = compat_urllib_parse_unquote(thumbnail)

        view_count = self._extract_count(
            r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
        like_count = self._extract_count(
            r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
        dislike_count = self._extract_count(
            r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
        comment_count = self._extract_count(
            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')

        video_urls = [
            compat_urllib_parse_unquote(quoted_url)
            for quoted_url in re.findall(
                r"player_quality_[0-9]{3}p\s*=\s*'([^']+)'", webpage)]
        if '"encrypted":true' in webpage:
            # The URL-unquoted "video_title" value from the page is used as
            # the AES password for the encrypted media URLs.
            password = compat_urllib_parse_unquote_plus(
                self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
            video_urls = [
                aes_decrypt_text(encrypted_url, password, 32).decode('utf-8')
                for encrypted_url in video_urls]

        formats = []
        for video_url in video_urls:
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            format_id = self._format_id_from_path(path)

            # Height and bitrate are only known when the id looks like
            # '<height>p-<tbr>k'; otherwise both stay None.
            height = tbr = None
            if format_id is not None:
                m = re.match(r'^(?P<height>[0-9]+)[pP]-(?P<tbr>[0-9]+)[kK]$', format_id)
                if m is not None:
                    height = int(m.group('height'))
                    tbr = int(m.group('tbr'))

            formats.append({
                'url': video_url,
                'ext': extension,
                'format': format_id,
                'format_id': format_id,
                'tbr': tbr,
                'height': height,
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'title': video_title,
            'thumbnail': thumbnail,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'comment_count': comment_count,
            'formats': formats,
            'age_limit': 18,
        }
e66e1a00 S |
130 | |
131 | ||
class PornHubPlaylistIE(InfoExtractor):
    """Extractor for pornhub.com playlist pages."""

    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.pornhub.com/playlist/6201671',
        'info_dict': {
            'id': '6201671',
            'title': 'P0p4',
        },
        'playlist_mincount': 35,
    }]

    def _real_extract(self, url):
        """Download the playlist page and return a playlist result.

        Entries are the distinct view_video.php links found on the page, in
        the order they first appear.  Title and description are read from
        the page's 'playlistObject' JSON.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        # De-duplicate while keeping page order; a plain set() would return
        # the entries in arbitrary order.
        seen = set()
        entries = []
        # [^"]* (rather than .*) keeps each match inside a single href
        # attribute, even when several links share one line of markup.
        for video_url in re.findall(
                r'href="/?(view_video\.php\?[^"]*\bviewkey=[\da-z]+[^"]*)"',
                webpage):
            if video_url in seen:
                continue
            seen.add(video_url)
            entries.append(self.url_result(
                'http://www.pornhub.com/%s' % video_url, 'PornHub'))

        playlist = self._parse_json(
            self._search_regex(
                r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'),
            playlist_id)

        return self.playlist_result(
            entries, playlist_id, playlist.get('title'), playlist.get('description'))