]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/pornhub.py
[extractor/youtube] Extract `heatmap` data (#7100)
[yt-dlp.git] / yt_dlp / extractor / pornhub.py
CommitLineData
21fbf0f9 1import functools
34541395 2import itertools
df2a5633 3import math
21fbf0f9 4import operator
125cfd78 5import re
ac668111 6import urllib.request
125cfd78 7
8from .common import InfoExtractor
278d061a 9from .openload import PhantomJSwrapper
ac668111 10from ..compat import compat_HTTPError, compat_str
1cc79574 11from ..utils import (
ac668111 12 NO_DEFAULT,
13 ExtractorError,
d0fb4bd1 14 clean_html,
b8526c78 15 determine_ext,
e0ddbd02 16 format_field,
ed8648a3 17 int_or_none,
cd85a1bb 18 merge_dicts,
8f9a477e 19 orderedSet,
e1e35d1a 20 remove_quotes,
4f2a58c9 21 remove_start,
0320ddc1 22 str_to_int,
2181983a 23 update_url_query,
4938c8d5 24 url_or_none,
ac668111 25 urlencode_postdata,
125cfd78 26)
125cfd78 27
9933b574 28
class PornHubBaseIE(InfoExtractor):
    # Common base for the PornHub extractors: host matching, page download
    # with a PhantomJS retry, age-gate cookies and login.
    _NETRC_MACHINE = 'pornhub'
    # The named group `host` is only captured for the clearnet domains;
    # for the .onion address it is None.
    _PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd\.onion)'

    def _download_webpage_handle(self, *args, **kwargs):
        # Download a page via the parent implementation; when the response
        # matches one of the marker patterns below, run PhantomJS over it
        # and download the page once more.
        #
        # Returns whatever the parent returns: a (webpage, urlh) pair, or a
        # falsy value when the download did not succeed.
        def dl(*args, **kwargs):
            return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs)

        ret = dl(*args, **kwargs)

        if not ret:
            return ret

        webpage, urlh = ret

        # NOTE(review): these markers presumably identify a JavaScript
        # cookie challenge page (RNKEY cookie + reload) -- confirm.
        if any(re.search(p, webpage) for p in (
                r'<body\b[^>]+\bonload=["\']go\(\)',
                r'document\.cookie\s*=\s*["\']RNKEY=',
                r'document\.location\.reload\(true\)')):
            url_or_request = args[0]
            url = (url_or_request.get_full_url()
                   if isinstance(url_or_request, urllib.request.Request)
                   else url_or_request)
            phantom = PhantomJSwrapper(self, required_version='2.0')
            phantom.get(url, html=webpage)
            # Return the retry result as-is: like the first attempt it may
            # be falsy, and unpacking it here would raise TypeError.
            return dl(*args, **kwargs)

        return webpage, urlh

    def _real_initialize(self):
        self._logged_in = False

    def _set_age_cookies(self, host):
        # Set the three age-confirmation cookies for `host`.
        self._set_cookie(host, 'age_verified', '1')
        self._set_cookie(host, 'accessAgeDisclaimerPH', '1')
        self._set_cookie(host, 'accessPH', '1')

    def _login(self, host):
        # Log in to `host` unless already logged in or no credentials are
        # configured. Raises ExtractorError when the site rejects the login.
        if self._logged_in:
            return

        site = host.split('.')[0]

        # Both sites pornhub and pornhubpremium have separate accounts
        # so there should be an option to provide credentials for both.
        # At the same time some videos are available under the same video id
        # on both sites so that we have to identify them as the same video.
        # For that purpose we have to keep both in the same extractor
        # but under different netrc machines.
        username, password = self._get_login_info(netrc_machine=site)
        if username is None:
            return

        login_url = 'https://www.%s/%slogin' % (host, 'premium/' if 'premium' in host else '')
        login_page = self._download_webpage(
            login_url, None, 'Downloading %s login page' % site)

        def is_logged(webpage):
            return any(re.search(p, webpage) for p in (
                r'class=["\']signOut',
                r'>Sign\s+[Oo]ut\s*<'))

        # A sign-out control on the login page means the session (e.g. from
        # cookies) is already authenticated.
        if is_logged(login_page):
            self._logged_in = True
            return

        # Start from the form's hidden inputs and add the credentials.
        login_form = self._hidden_inputs(login_page)

        login_form.update({
            'username': username,
            'password': password,
        })

        response = self._download_json(
            'https://www.%s/front/authenticate' % host, None,
            'Logging in to %s' % site,
            data=urlencode_postdata(login_form),
            headers={
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Referer': login_url,
                'X-Requested-With': 'XMLHttpRequest',
            })

        if response.get('success') == '1':
            self._logged_in = True
            return

        message = response.get('message')
        if message is not None:
            raise ExtractorError(
                'Unable to login: %s' % message, expected=True)

        raise ExtractorError('Unable to log in')
122
71a1f617
S
123
class PornHubIE(PornHubBaseIE):
    # Extractor for single videos (view_video.php / video/show / embed URLs
    # on the PornHub hosts, and thumbzilla.com video URLs).
    IE_DESC = 'PornHub and Thumbzilla'
    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            (?:[^/]+\.)?
                            %s
                            /(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
                            (?:www\.)?thumbzilla\.com/video/
                        )
                        (?P<id>[\da-z]+)
                    ''' % PornHubBaseIE._PORNHUB_HOST_RE
    _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)']
    _TESTS = [{
        'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
        'md5': 'a6391306d050e4547f62b3f485dd9ba9',
        'info_dict': {
            'id': '648719015',
            'ext': 'mp4',
            'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
            'uploader': 'Babes',
            'upload_date': '20130628',
            'timestamp': 1372447216,
            'duration': 361,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
            'tags': list,
            'categories': list,
            'cast': list,
        },
    }, {
        # non-ASCII title
        'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
        'info_dict': {
            'id': '1331683002',
            'ext': 'mp4',
            'title': '重庆婷婷女王足交',
            'upload_date': '20150213',
            'timestamp': 1423804862,
            'duration': 1753,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
            'tags': list,
            'categories': list,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Video has been flagged for verification in accordance with our trust and safety policy',
    }, {
        # subtitles
        'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7',
        'info_dict': {
            'id': 'ph5af5fef7c2aa7',
            'ext': 'mp4',
            'title': 'BFFS - Cute Teen Girls Share Cock On the Floor',
            'uploader': 'BFFs',
            'duration': 622,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
            'tags': list,
            'categories': list,
            'subtitles': {
                'en': [{
                    "ext": 'srt'
                }]
            },
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'This video has been disabled',
    }, {
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph601dc30bae19a',
        'info_dict': {
            'id': 'ph601dc30bae19a',
            'uploader': 'Projekt Melody',
            'uploader_id': 'projekt-melody',
            'upload_date': '20210205',
            'title': '"Welcome to My Pussy Mansion" - CB Stream (02/03/21)',
            'thumbnail': r're:https?://.+',
        },
    }, {
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
        'only_matching': True,
    }, {
        # removed at the request of cam4.com
        'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
        'only_matching': True,
    }, {
        # removed at the request of the copyright owner
        'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
        'only_matching': True,
    }, {
        # removed by uploader
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
        'only_matching': True,
    }, {
        # private video
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
        'only_matching': True,
    }, {
        'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
        'only_matching': True,
    }, {
        'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.org/view_video.php?viewkey=203640933',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82',
        'only_matching': True,
    }, {
        # Some videos are available with the same id on both premium
        # and non-premium sites (e.g. this and the following test)
        'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5f75b0f4b18e3',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3',
        'only_matching': True,
    }, {
        # geo restricted
        'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156',
        'only_matching': True,
    }, {
        'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/view_video.php?viewkey=ph5a9813bfa7156',
        'only_matching': True,
    }]

    def _extract_count(self, pattern, webpage, name):
        # Find a counter with `pattern` and convert it with str_to_int;
        # the search is non-fatal (default=None).
        return str_to_int(self._search_regex(pattern, webpage, '%s count' % name, default=None))

    def _real_extract(self, url):
        # Build the info dict for one video page: site error / geo-block
        # detection, format URL collection from several page sources,
        # format construction, then page metadata merged with JSON-LD.
        mobj = self._match_valid_url(url)
        # `host` is not captured for thumbzilla / .onion URLs.
        host = mobj.group('host') or 'pornhub.com'
        video_id = mobj.group('id')

        self._login(host)
        self._set_age_cookies(host)

        def dl_webpage(platform):
            # The `platform` cookie selects which page variant is served.
            self._set_cookie(host, 'platform', platform)
            return self._download_webpage(
                'https://www.%s/view_video.php?viewkey=%s' % (host, video_id),
                video_id, 'Downloading %s webpage' % platform)

        webpage = dl_webpage('pc')

        error_msg = self._html_search_regex(
            (r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
             r'(?s)<section[^>]+class=["\']noVideo["\'][^>]*>(?P<error>.+?)</section>'),
            webpage, 'error message', default=None, group='error')
        if error_msg:
            error_msg = re.sub(r'\s+', ' ', error_msg)
            raise ExtractorError(
                'PornHub said: %s' % error_msg,
                expected=True, video_id=video_id)

        if any(re.search(p, webpage) for p in (
                r'class=["\']geoBlocked["\']',
                r'>\s*This content is unavailable in your country')):
            self.raise_geo_restricted()

        # video_title from flashvars contains whitespace instead of non-ASCII (see
        # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
        # on that anymore.
        title = self._html_search_meta(
            'twitter:title', webpage, default=None) or self._html_search_regex(
            (r'(?s)<h1[^>]+class=["\']title["\'][^>]*>(?P<title>.+?)</h1>',
             r'<div[^>]+data-video-title=(["\'])(?P<title>(?:(?!\1).)+)\1',
             r'shareTitle["\']\s*[=:]\s*(["\'])(?P<title>(?:(?!\1).)+)\1'),
            webpage, 'title', group='title')

        # (url, height-or-None) pairs in discovery order; the set is used
        # for de-duplication.
        video_urls = []
        video_urls_set = set()
        subtitles = {}

        flashvars = self._parse_json(
            self._search_regex(
                r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
            video_id)
        if flashvars:
            subtitle_url = url_or_none(flashvars.get('closedCaptionsFile'))
            if subtitle_url:
                subtitles.setdefault('en', []).append({
                    'url': subtitle_url,
                    'ext': 'srt',
                })
            thumbnail = flashvars.get('image_url')
            duration = int_or_none(flashvars.get('video_duration'))
            media_definitions = flashvars.get('mediaDefinitions')
            if isinstance(media_definitions, list):
                for definition in media_definitions:
                    if not isinstance(definition, dict):
                        continue
                    video_url = definition.get('videoUrl')
                    if not video_url or not isinstance(video_url, compat_str):
                        continue
                    if video_url in video_urls_set:
                        continue
                    video_urls_set.add(video_url)
                    video_urls.append(
                        (video_url, int_or_none(definition.get('quality'))))
        else:
            thumbnail, duration = [None] * 2

        def extract_js_vars(webpage, pattern, default=NO_DEFAULT):
            # Evaluate a run of simple `var name = 'a' + other + ...;`
            # statements and return a {name: string value} dict.
            assignments = self._search_regex(
                pattern, webpage, 'encoded url', default=default)
            if not assignments:
                return {}

            assignments = assignments.split(';')

            js_vars = {}

            def parse_js_value(inp):
                # Drop /* ... */ comments, then resolve concatenations,
                # earlier variables and quoted literals.
                inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
                if '+' in inp:
                    inps = inp.split('+')
                    return functools.reduce(
                        operator.concat, map(parse_js_value, inps))
                inp = inp.strip()
                if inp in js_vars:
                    return js_vars[inp]
                return remove_quotes(inp)

            for assn in assignments:
                assn = assn.strip()
                if not assn:
                    continue
                assn = re.sub(r'var\s+', '', assn)
                vname, sep, value = assn.partition('=')
                if not sep:
                    # Not an assignment; skip instead of failing to unpack.
                    continue
                # Store the stripped name: parse_js_value() looks variables
                # up by their stripped name.
                js_vars[vname.strip()] = parse_js_value(value)
            return js_vars

        def add_video_url(video_url):
            v_url = url_or_none(video_url)
            if not v_url:
                return
            if v_url in video_urls_set:
                return
            video_urls.append((v_url, None))
            video_urls_set.add(v_url)

        def parse_quality_items(quality_items):
            q_items = self._parse_json(quality_items, video_id, fatal=False)
            if not isinstance(q_items, list):
                return
            for item in q_items:
                if isinstance(item, dict):
                    add_video_url(item.get('url'))

        # Fallback 1: media_*/quality_*/qualityItems_* JS variables.
        if not video_urls:
            FORMAT_PREFIXES = ('media', 'quality', 'qualityItems')
            js_vars = extract_js_vars(
                webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES),
                default=None)
            if js_vars:
                for key, format_url in js_vars.items():
                    if key.startswith(FORMAT_PREFIXES[-1]):
                        parse_quality_items(format_url)
                    elif any(key.startswith(p) for p in FORMAT_PREFIXES[:2]):
                        add_video_url(format_url)
            if not video_urls and re.search(
                    r'<[^>]+\bid=["\']lockedPlayer', webpage):
                raise ExtractorError(
                    'Video %s is locked' % video_id, expected=True)

        # Fallback 2: the `mediastring` variable of the 'tv' page variant.
        if not video_urls:
            js_vars = extract_js_vars(
                dl_webpage('tv'), r'(var.+?mediastring.+?)</script>')
            # .get(): a missing variable simply adds nothing (add_video_url
            # ignores None) instead of raising KeyError.
            add_video_url(js_vars.get('mediastring'))

        # Download-button links are collected in addition to the above.
        for mobj in re.finditer(
                r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
                webpage):
            video_url = mobj.group('url')
            if video_url not in video_urls_set:
                video_urls.append((video_url, None))
                video_urls_set.add(video_url)

        upload_date = None
        formats = []

        def add_format(format_url, height=None):
            # Expand DASH/HLS manifests; anything else becomes a single
            # direct format.
            ext = determine_ext(format_url)
            if ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    format_url, video_id, mpd_id='dash', fatal=False))
                return
            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False))
                return
            if not height:
                # No height supplied: look for a `<height>P_<bitrate>K`
                # token in the URL.
                height = int_or_none(self._search_regex(
                    r'(?P<height>\d+)[pP]?_\d+[kK]', format_url, 'height',
                    default=None))
            formats.append({
                'url': format_url,
                'format_id': format_field(height, None, '%dp'),
                'height': height,
            })

        for video_url, height in video_urls:
            if not upload_date:
                # A /YYYYMM/DD/ path segment in the URL is used as the
                # upload date.
                upload_date = self._search_regex(
                    r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None)
                if upload_date:
                    upload_date = upload_date.replace('/', '')
            if '/video/get_media' in video_url:
                # This URL returns a JSON list describing the real media.
                medias = self._download_json(video_url, video_id, fatal=False)
                if isinstance(medias, list):
                    for media in medias:
                        if not isinstance(media, dict):
                            continue
                        media_url = url_or_none(media.get('videoUrl'))
                        if not media_url:
                            continue
                        add_format(media_url, int_or_none(media.get('quality')))
                continue
            # Pass on the height recorded with the URL (from
            # mediaDefinitions' `quality`); add_format() falls back to the
            # URL pattern when it is None.
            add_format(video_url, height)

        model_profile = self._search_json(
            r'var\s+MODEL_PROFILE\s*=', webpage, 'model profile', video_id, fatal=False)
        video_uploader = self._html_search_regex(
            r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
            webpage, 'uploader', default=None) or model_profile.get('username')

        def extract_vote_count(kind, name):
            return self._extract_count(
                (r'<span[^>]+\bclass="votes%s"[^>]*>([\d,\.]+)</span>' % kind,
                 r'<span[^>]+\bclass=["\']votes%s["\'][^>]*\bdata-rating=["\'](\d+)' % kind),
                webpage, name)

        view_count = self._extract_count(
            r'<span class="count">([\d,\.]+)</span> [Vv]iews', webpage, 'view')
        like_count = extract_vote_count('Up', 'like')
        dislike_count = extract_vote_count('Down', 'dislike')
        comment_count = self._extract_count(
            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')

        def extract_list(meta_key):
            # Text of every link inside the `<meta_key>Wrapper` div;
            # returns None when the div is not found.
            div = self._search_regex(
                r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>'
                % meta_key, webpage, meta_key, default=None)
            if div:
                return [clean_html(x).strip() for x in re.findall(r'(?s)<a[^>]+\bhref=[^>]+>.+?</a>', div)]

        info = self._search_json_ld(webpage, video_id, default={})
        # description provided in JSON-LD is irrelevant
        info['description'] = None

        return merge_dicts({
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': remove_start(model_profile.get('modelProfileLink'), '/model/'),
            'upload_date': upload_date,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'comment_count': comment_count,
            'formats': formats,
            'age_limit': 18,
            'tags': extract_list('tags'),
            'categories': extract_list('categories'),
            'cast': extract_list('pornstars'),
            'subtitles': subtitles,
        }, info)
e66e1a00
S
512
513
class PornHubPlaylistBaseIE(PornHubBaseIE):
    # Helpers shared by the listing extractors.

    def _extract_page(self, url):
        # Page number taken from a `page=<digits>` URL parameter, or None.
        page = self._search_regex(
            r'\bpage=(\d+)', url, 'page', default=None)
        return int_or_none(page)

    def _extract_entries(self, webpage, host):
        # Only process container div with main playlist content skipping
        # drop-down menu that uses similar pattern for videos (see
        # https://github.com/ytdl-org/youtube-dl/issues/11594).
        container = self._search_regex(
            r'(?s)(<div[^>]+class=["\']container.+)', webpage,
            'container', default=webpage)

        # De-duplicated (relative link, title) pairs in page order.
        links = orderedSet(re.findall(
            r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
            container))

        entries = []
        for video_path, video_title in links:
            entries.append(self.url_result(
                'http://www.%s/%s' % (host, video_path),
                PornHubIE.ie_key(), video_title=video_title))
        return entries
e66e1a00 535
40e146aa 536
class PornHubUserIE(PornHubPlaylistBaseIE):
    # Matches a user/channel/model/pornstar root page (not its /videos
    # sub-page) and redirects to the paged video list extractor.
    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' % PornHubBaseIE._PORNHUB_HOST_RE
    _TESTS = [{
        'url': 'https://www.pornhub.com/model/zoe_ph',
        'playlist_mincount': 118,
    }, {
        'url': 'https://www.pornhub.com/pornstar/liz-vicious',
        'info_dict': {
            'id': 'liz-vicious',
        },
        'playlist_mincount': 118,
    }, {
        'url': 'https://www.pornhub.com/users/russianveet69',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/channels/povd',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/model/zoe_ph?abc=1',
        'only_matching': True,
    }, {
        # Unavailable via /videos page, but available with direct pagination
        # on pornstar page (see [1]), requires premium
        # 1. https://github.com/ytdl-org/youtube-dl/issues/27853
        'url': 'https://www.pornhubpremium.com/pornstar/sienna-west',
        'only_matching': True,
    }, {
        # Same as before, multi page
        'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau',
        'only_matching': True,
    }, {
        'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        mobj = self._match_valid_url(url)
        user_id = mobj.group('id')
        videos_url = '%s/videos' % mobj.group('url')
        # The `host` group is None for .onion URLs; fall back to the main
        # domain (as PornHubIE does) rather than passing None as the
        # cookie domain.
        self._set_age_cookies(mobj.group('host') or 'pornhub.com')
        # Carry an explicit ?page=N over to the /videos URL.
        page = self._extract_page(url)
        if page:
            videos_url = update_url_query(videos_url, {'page': page})
        return self.url_result(
            videos_url, ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id)
21b08463
S
582
583
class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
    # Base for listings that are paginated with a `page` query parameter.

    @staticmethod
    def _has_more(webpage):
        # True when the page carries a next-page item, a rel="next" link or
        # a "more data" button.
        return re.search(
            r'''(?x)
                <li[^>]+\bclass=["\']page_next|
                <link[^>]+\brel=["\']next|
                <button[^>]+\bid=["\']moreDataBtn
            ''', webpage) is not None

    def _entries(self, url, host, item_id):
        # Yield the entries of every page. When `url` names a page
        # explicitly only that page is fetched; otherwise pages are walked
        # from 1 until a stop condition is hit.
        page = self._extract_page(url)

        VIDEOS = '/videos'

        def download_page(base_url, num, fallback=False):
            note = 'Downloading page %d%s' % (num, ' (switch to fallback)' if fallback else '')
            return self._download_webpage(
                base_url, item_id, note, query={'page': num})

        def is_404(e):
            return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404

        base_url = url
        has_page = page is not None
        first_page = page if has_page else 1
        for page_num in (first_page, ) if has_page else itertools.count(first_page):
            try:
                try:
                    webpage = download_page(base_url, page_num)
                except ExtractorError as e:
                    # Some sources may not be available via /videos page,
                    # trying to fallback to main page pagination (see [1])
                    # 1. https://github.com/ytdl-org/youtube-dl/issues/27853
                    if is_404(e) and page_num == first_page and VIDEOS in base_url:
                        base_url = base_url.replace(VIDEOS, '')
                        webpage = download_page(base_url, page_num, fallback=True)
                    else:
                        raise
            except ExtractorError as e:
                # A 404 past the first page ends the listing; any other
                # failure is re-raised.
                if is_404(e) and page_num != first_page:
                    break
                raise
            page_entries = self._extract_entries(webpage, host)
            if not page_entries:
                break
            yield from page_entries
            if not self._has_more(webpage):
                break

    def _real_extract(self, url):
        mobj = self._match_valid_url(url)
        # The `host` group is None for .onion URLs, and _login() calls
        # host.split(); fall back to the main domain as PornHubIE does.
        host = mobj.group('host') or 'pornhub.com'
        item_id = mobj.group('id')

        self._login(host)
        self._set_age_cookies(host)

        return self.playlist_result(self._entries(url, host, item_id), item_id)
21b08463
S
644
645
class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
    # Catch-all for paged listings on the PornHub hosts (anything that is
    # not under /playlist/); see suitable() for the URLs it gives up.
    _VALID_URL = r'https?://(?:[^/]+\.)?%s/(?!playlist/)(?P<id>(?:[^/]+/)*[^/?#&]+)' % PornHubBaseIE._PORNHUB_HOST_RE
    _TESTS = [{
        'url': 'https://www.pornhub.com/model/zoe_ph/videos',
        'only_matching': True,
    }, {
        'url': 'http://www.pornhub.com/users/rushandlia/videos',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos',
        'info_dict': {
            'id': 'pornstar/jenny-blighe/videos',
        },
        'playlist_mincount': 149,
    }, {
        'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos?page=3',
        'info_dict': {
            'id': 'pornstar/jenny-blighe/videos',
        },
        'playlist_mincount': 40,
    }, {
        # default sorting as Top Rated Videos
        'url': 'https://www.pornhub.com/channels/povd/videos',
        'info_dict': {
            'id': 'channels/povd/videos',
        },
        'playlist_mincount': 293,
    }, {
        # Top Rated Videos
        'url': 'https://www.pornhub.com/channels/povd/videos?o=ra',
        'only_matching': True,
    }, {
        # Most Recent Videos
        'url': 'https://www.pornhub.com/channels/povd/videos?o=da',
        'only_matching': True,
    }, {
        # Most Viewed Videos
        'url': 'https://www.pornhub.com/channels/povd/videos?o=vi',
        'only_matching': True,
    }, {
        'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
        'only_matching': True,
    }, {
        # Most Viewed Videos
        'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=mv',
        'only_matching': True,
    }, {
        # Top Rated Videos
        'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=tr',
        'only_matching': True,
    }, {
        # Longest Videos
        'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=lg',
        'only_matching': True,
    }, {
        # Newest Videos
        'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=cm',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/paid',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/fanonly',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/video',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/video?page=3',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/video/search?search=123',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/categories/teen',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/categories/teen?page=3',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/hd',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/hd?page=3',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/described-video',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/described-video?page=2',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn',
        'only_matching': True,
    }, {
        'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # _VALID_URL is very broad, so step aside whenever one of the more
        # specific extractors accepts the URL.
        more_specific = (PornHubIE, PornHubUserIE, PornHubUserVideosUploadIE)
        if any(ie.suitable(url) for ie in more_specific):
            return False
        return super(PornHubPagedVideoListIE, cls).suitable(url)
21b08463 750
34541395 751
class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
    # Purely declarative: matches the /videos/upload listing of a
    # user/channel/model/pornstar; all behaviour is inherited.
    _VALID_URL = (
        r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)'
        % PornHubBaseIE._PORNHUB_HOST_RE)
    _TESTS = [
        {
            'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
            'info_dict': {
                'id': 'jenny-blighe',
            },
            'playlist_mincount': 129,
        },
        {
            'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload',
            'only_matching': True,
        },
        {
            'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/pornstar/jenny-blighe/videos/upload',
            'only_matching': True,
        },
    ]
df2a5633 767
768
class PornHubPlaylistIE(PornHubPlaylistBaseIE):
    # Extractor for /playlist/<id> pages; pages after the first are
    # fetched from the playlist/viewChunked endpoint.
    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/playlist/(?P<id>[^/?#&]+))' % PornHubBaseIE._PORNHUB_HOST_RE
    _TESTS = [{
        'url': 'https://www.pornhub.com/playlist/44121572',
        'info_dict': {
            'id': '44121572',
        },
        'playlist_count': 77,
    }, {
        'url': 'https://www.pornhub.com/playlist/4667351',
        'only_matching': True,
    }, {
        'url': 'https://de.pornhub.com/playlist/4667351',
        'only_matching': True,
    }, {
        'url': 'https://de.pornhub.com/playlist/4667351?page=2',
        'only_matching': True,
    }]

    def _entries(self, url, host, item_id):
        # Yield the playlist's entries page by page, stopping early at the
        # first page that yields none.
        webpage = self._download_webpage(url, item_id, 'Downloading page 1')
        playlist_id = self._search_regex(r'var\s+playlistId\s*=\s*"([^"]+)"', webpage, 'playlist_id')
        video_count = int_or_none(
            self._search_regex(r'var\s+itemsCount\s*=\s*([0-9]+)\s*\|\|', webpage, 'video_count'))
        token = self._search_regex(r'var\s+token\s*=\s*"([^"]+)"', webpage, 'token')
        # NOTE(review): the arithmetic implies 36 items on the first page
        # and 40 on each later chunk -- confirm against the site.
        page_count = math.ceil((video_count - 36) / 40.) + 1

        def download_page(page_num):
            note = 'Downloading page {}'.format(page_num)
            page_url = 'https://www.{}/playlist/viewChunked'.format(host)
            return self._download_webpage(page_url, item_id, note, query={
                'id': playlist_id,
                'page': page_num,
                'token': token,
            })

        for page_num in range(1, page_count + 1):
            # Page 1 is the already-downloaded playlist page itself.
            if page_num > 1:
                webpage = download_page(page_num)
            page_entries = self._extract_entries(webpage, host)
            if not page_entries:
                break
            yield from page_entries

    def _real_extract(self, url):
        mobj = self._match_valid_url(url)
        # The `host` group is None for .onion URLs, and _login() calls
        # host.split(); fall back to the main domain as PornHubIE does.
        host = mobj.group('host') or 'pornhub.com'
        item_id = mobj.group('id')

        self._login(host)
        self._set_age_cookies(host)

        return self.playlist_result(self._entries(mobj.group('url'), host, item_id), item_id)