]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/pornhub.py
[ie/matchtv] Fix extractor (#10190)
[yt-dlp.git] / yt_dlp / extractor / pornhub.py
CommitLineData
21fbf0f9 1import functools
34541395 2import itertools
df2a5633 3import math
21fbf0f9 4import operator
125cfd78 5import re
6
7from .common import InfoExtractor
278d061a 8from .openload import PhantomJSwrapper
3d2623a8 9from ..networking import Request
10from ..networking.exceptions import HTTPError
1cc79574 11from ..utils import (
ac668111 12 NO_DEFAULT,
13 ExtractorError,
d0fb4bd1 14 clean_html,
b8526c78 15 determine_ext,
e0ddbd02 16 format_field,
ed8648a3 17 int_or_none,
cd85a1bb 18 merge_dicts,
8f9a477e 19 orderedSet,
e1e35d1a 20 remove_quotes,
4f2a58c9 21 remove_start,
0320ddc1 22 str_to_int,
2181983a 23 update_url_query,
4938c8d5 24 url_or_none,
ac668111 25 urlencode_postdata,
125cfd78 26)
125cfd78 27
9933b574 28
class PornHubBaseIE(InfoExtractor):
    """Common base for the PornHub extractors in this file.

    Provides a page downloader that retries through PhantomJS when the
    response looks like a JavaScript challenge, a helper that sets the
    age-confirmation cookies, and an optional account login.
    """

    _NETRC_MACHINE = 'pornhub'
    # The named group ``host`` only exists in the clearnet alternative, so it
    # is None whenever the .onion alternative matched.
    _PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd\.onion)'

    def _download_webpage_handle(self, *args, **kwargs):
        """Download a page, retrying once via PhantomJS on a JS challenge.

        Arguments are forwarded unchanged to the parent implementation.
        Returns whatever the parent returns: a ``(webpage, urlh)`` pair, or
        a falsy value when the parent download yields one.
        """
        def dl(*args, **kwargs):
            # Zero-argument super() is not usable inside this nested
            # function, hence the explicit two-argument form.
            return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs)

        ret = dl(*args, **kwargs)

        if not ret:
            return ret

        webpage, urlh = ret

        challenge_markers = (
            r'<body\b[^>]+\bonload=["\']go\(\)',
            r'document\.cookie\s*=\s*["\']RNKEY=',
            r'document\.location\.reload\(true\)',
        )
        if any(re.search(p, webpage) for p in challenge_markers):
            url_or_request = args[0]
            url = (url_or_request.url
                   if isinstance(url_or_request, Request)
                   else url_or_request)
            phantom = PhantomJSwrapper(self, required_version='2.0')
            phantom.get(url, html=webpage)
            # The retry can be falsy just like the first attempt, so hand the
            # result back untouched instead of unpacking it blindly.
            return dl(*args, **kwargs)

        return webpage, urlh

    def _real_initialize(self):
        """Reset the per-instance login flag."""
        self._logged_in = False

    def _set_age_cookies(self, host):
        """Set the age-confirmation cookies for *host*."""
        self._set_cookie(host, 'age_verified', '1')
        self._set_cookie(host, 'accessAgeDisclaimerPH', '1')
        self._set_cookie(host, 'accessAgeDisclaimerUK', '1')
        self._set_cookie(host, 'accessPH', '1')

    def _login(self, host):
        """Log in to *host* if credentials are available.

        Does nothing when already logged in or when no username is
        configured. Raises ExtractorError when the site rejects the login.
        """
        if self._logged_in:
            return

        site = host.split('.')[0]

        # Both sites pornhub and pornhubpremium have separate accounts
        # so there should be an option to provide credentials for both.
        # At the same time some videos are available under the same video id
        # on both sites so that we have to identify them as the same video.
        # For that purpose we have to keep both in the same extractor
        # but under different netrc machines.
        username, password = self._get_login_info(netrc_machine=site)
        if username is None:
            return

        login_url = 'https://www.{}/{}login'.format(host, 'premium/' if 'premium' in host else '')
        login_page = self._download_webpage(
            login_url, None, f'Downloading {site} login page')

        def is_logged(webpage):
            return any(re.search(p, webpage) for p in (
                r'id="profileMenuDropdown"',
                r'class="ph-icon-logout"'))

        if is_logged(login_page):
            self._logged_in = True
            return

        # Start from the hidden inputs of the login page and add credentials.
        login_form = self._hidden_inputs(login_page)

        login_form.update({
            'email': username,
            'password': password,
        })

        response = self._download_json(
            f'https://www.{host}/front/authenticate', None,
            f'Logging in to {site}',
            data=urlencode_postdata(login_form),
            headers={
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Referer': login_url,
                'X-Requested-With': 'XMLHttpRequest',
            })

        if response.get('success') == '1':
            self._logged_in = True
            return

        message = response.get('message')
        if message is not None:
            raise ExtractorError(
                f'Unable to login: {message}', expected=True)

        raise ExtractorError('Unable to log in')
123
71a1f617
S
124
class PornHubIE(PornHubBaseIE):
    """Extractor for single PornHub / Thumbzilla video pages and embeds."""

    IE_DESC = 'PornHub and Thumbzilla'
    _VALID_URL = rf'''(?x)
                    https?://
                        (?:
                            (?:[^/]+\.)?
                            {PornHubBaseIE._PORNHUB_HOST_RE}
                            /(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
                            (?:www\.)?thumbzilla\.com/video/
                        )
                        (?P<id>[\da-z]+)
                    '''
    _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)']
    _TESTS = [{
        'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
        'md5': 'a6391306d050e4547f62b3f485dd9ba9',
        'info_dict': {
            'id': '648719015',
            'ext': 'mp4',
            'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
            'uploader': 'Babes',
            'upload_date': '20130628',
            'timestamp': 1372447216,
            'duration': 361,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
            'tags': list,
            'categories': list,
            'cast': list,
        },
    }, {
        # non-ASCII title
        'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
        'info_dict': {
            'id': '1331683002',
            'ext': 'mp4',
            'title': '重庆婷婷女王足交',
            'upload_date': '20150213',
            'timestamp': 1423804862,
            'duration': 1753,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
            'tags': list,
            'categories': list,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Video has been flagged for verification in accordance with our trust and safety policy',
    }, {
        # subtitles
        'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7',
        'info_dict': {
            'id': 'ph5af5fef7c2aa7',
            'ext': 'mp4',
            'title': 'BFFS - Cute Teen Girls Share Cock On the Floor',
            'uploader': 'BFFs',
            'duration': 622,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
            'tags': list,
            'categories': list,
            'subtitles': {
                'en': [{
                    'ext': 'srt',
                }],
            },
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'This video has been disabled',
    }, {
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph601dc30bae19a',
        'info_dict': {
            'id': 'ph601dc30bae19a',
            'uploader': 'Projekt Melody',
            'uploader_id': 'projekt-melody',
            'upload_date': '20210205',
            'title': '"Welcome to My Pussy Mansion" - CB Stream (02/03/21)',
            'thumbnail': r're:https?://.+',
        },
    }, {
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
        'only_matching': True,
    }, {
        # removed at the request of cam4.com
        'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
        'only_matching': True,
    }, {
        # removed at the request of the copyright owner
        'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
        'only_matching': True,
    }, {
        # removed by uploader
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
        'only_matching': True,
    }, {
        # private video
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
        'only_matching': True,
    }, {
        'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
        'only_matching': True,
    }, {
        'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.org/view_video.php?viewkey=203640933',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82',
        'only_matching': True,
    }, {
        # Some videos are available with the same id on both premium
        # and non-premium sites (e.g. this and the following test)
        'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5f75b0f4b18e3',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3',
        'only_matching': True,
    }, {
        # geo restricted
        'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156',
        'only_matching': True,
    }, {
        'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/view_video.php?viewkey=ph5a9813bfa7156',
        'only_matching': True,
    }]

    def _extract_count(self, pattern, webpage, name):
        """Search *webpage* for *pattern* and convert the match with str_to_int."""
        return str_to_int(self._search_regex(pattern, webpage, f'{name} count', default=None))

    def _real_extract(self, url):
        """Build the info dict for a single video URL.

        Raises ExtractorError when the page carries an error message or the
        player is locked, and reports geo restriction via
        ``raise_geo_restricted``.
        """
        mobj = self._match_valid_url(url)
        # ``host`` is None for thumbzilla and .onion URLs; fall back to the
        # main site in that case.
        host = mobj.group('host') or 'pornhub.com'
        video_id = mobj.group('id')

        self._login(host)
        self._set_age_cookies(host)

        def dl_webpage(platform):
            # The 'platform' cookie is set before each download of the page.
            self._set_cookie(host, 'platform', platform)
            return self._download_webpage(
                f'https://www.{host}/view_video.php?viewkey={video_id}',
                video_id, f'Downloading {platform} webpage')

        webpage = dl_webpage('pc')

        error_msg = self._html_search_regex(
            (r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
             r'(?s)<section[^>]+class=["\']noVideo["\'][^>]*>(?P<error>.+?)</section>'),
            webpage, 'error message', default=None, group='error')
        if error_msg:
            error_msg = re.sub(r'\s+', ' ', error_msg)
            raise ExtractorError(
                f'PornHub said: {error_msg}',
                expected=True, video_id=video_id)

        if any(re.search(p, webpage) for p in (
                r'class=["\']geoBlocked["\']',
                r'>\s*This content is unavailable in your country')):
            self.raise_geo_restricted()

        # video_title from flashvars contains whitespace instead of non-ASCII (see
        # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
        # on that anymore.
        title = self._html_search_meta(
            'twitter:title', webpage, default=None) or self._html_search_regex(
            (r'(?s)<h1[^>]+class=["\']title["\'][^>]*>(?P<title>.+?)</h1>',
             r'<div[^>]+data-video-title=(["\'])(?P<title>(?:(?!\1).)+)\1',
             r'shareTitle["\']\s*[=:]\s*(["\'])(?P<title>(?:(?!\1).)+)\1'),
            webpage, 'title', group='title')

        # (url, height) pairs in discovery order; the set de-duplicates URLs.
        video_urls = []
        video_urls_set = set()
        subtitles = {}

        flashvars = self._parse_json(
            self._search_regex(
                r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
            video_id)
        if flashvars:
            subtitle_url = url_or_none(flashvars.get('closedCaptionsFile'))
            if subtitle_url:
                subtitles.setdefault('en', []).append({
                    'url': subtitle_url,
                    'ext': 'srt',
                })
            thumbnail = flashvars.get('image_url')
            duration = int_or_none(flashvars.get('video_duration'))
            media_definitions = flashvars.get('mediaDefinitions')
            if isinstance(media_definitions, list):
                for definition in media_definitions:
                    if not isinstance(definition, dict):
                        continue
                    video_url = definition.get('videoUrl')
                    if not video_url or not isinstance(video_url, str):
                        continue
                    if video_url in video_urls_set:
                        continue
                    video_urls_set.add(video_url)
                    video_urls.append(
                        (video_url, int_or_none(definition.get('quality'))))
        else:
            thumbnail, duration = [None] * 2

        def extract_js_vars(webpage, pattern, default=NO_DEFAULT):
            """Evaluate the ``var a = "x" + b;`` chain matched by *pattern*.

            Returns a dict mapping variable names to concatenated strings,
            or an empty dict when nothing matched.
            """
            assignments = self._search_regex(
                pattern, webpage, 'encoded url', default=default)
            if not assignments:
                return {}

            assignments = assignments.split(';')

            js_vars = {}

            def parse_js_value(inp):
                # Drop /* ... */ comments before interpreting the expression.
                inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
                if '+' in inp:
                    inps = inp.split('+')
                    return functools.reduce(
                        operator.concat, map(parse_js_value, inps))
                inp = inp.strip()
                if inp in js_vars:
                    return js_vars[inp]
                return remove_quotes(inp)

            for assn in assignments:
                assn = assn.strip()
                if not assn:
                    continue
                assn = re.sub(r'var\s+', '', assn)
                vname, sep, value = assn.partition('=')
                if not sep:
                    # Not an assignment (no '='); nothing to record.
                    continue
                # Store the bare name so that later references, which are
                # looked up stripped, actually resolve.
                js_vars[vname.strip()] = parse_js_value(value)
            return js_vars

        def add_video_url(video_url):
            v_url = url_or_none(video_url)
            if not v_url:
                return
            if v_url in video_urls_set:
                return
            video_urls.append((v_url, None))
            video_urls_set.add(v_url)

        def parse_quality_items(quality_items):
            q_items = self._parse_json(quality_items, video_id, fatal=False)
            if not isinstance(q_items, list):
                return
            for item in q_items:
                if isinstance(item, dict):
                    add_video_url(item.get('url'))

        if not video_urls:
            FORMAT_PREFIXES = ('media', 'quality', 'qualityItems')
            js_vars = extract_js_vars(
                webpage, r'(var\s+(?:{})_.+)'.format('|'.join(FORMAT_PREFIXES)),
                default=None)
            if js_vars:
                for key, format_url in js_vars.items():
                    # 'qualityItems' must be tested first: it also starts
                    # with the 'quality' prefix.
                    if key.startswith(FORMAT_PREFIXES[-1]):
                        parse_quality_items(format_url)
                    elif any(key.startswith(p) for p in FORMAT_PREFIXES[:2]):
                        add_video_url(format_url)
            if not video_urls and re.search(
                    r'<[^>]+\bid=["\']lockedPlayer', webpage):
                raise ExtractorError(
                    f'Video {video_id} is locked', expected=True)

        if not video_urls:
            # Last resort: the 'tv' platform page.
            js_vars = extract_js_vars(
                dl_webpage('tv'), r'(var.+?mediastring.+?)</script>')
            # .get(): a missing variable should not surface as a KeyError;
            # add_video_url ignores a None value.
            add_video_url(js_vars.get('mediastring'))

        for link in re.finditer(
                r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
                webpage):
            video_url = link.group('url')
            if video_url not in video_urls_set:
                video_urls.append((video_url, None))
                video_urls_set.add(video_url)

        upload_date = None
        formats = []

        def add_format(format_url, height=None):
            ext = determine_ext(format_url)
            if ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    format_url, video_id, mpd_id='dash', fatal=False))
                return
            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False))
                return
            if not height:
                # Fall back to a '<height>P_<bitrate>K'-style URL component.
                height = int_or_none(self._search_regex(
                    r'(?P<height>\d+)[pP]?_\d+[kK]', format_url, 'height',
                    default=None))
            formats.append({
                'url': format_url,
                'format_id': format_field(height, None, '%dp'),
                'height': height,
            })

        for video_url, height in video_urls:
            if not upload_date:
                # Take the date from a /YYYYMM/DD/ path component of the URL.
                upload_date = self._search_regex(
                    r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None)
                if upload_date:
                    upload_date = upload_date.replace('/', '')
            if '/video/get_media' in video_url:
                medias = self._download_json(video_url, video_id, fatal=False)
                if isinstance(medias, list):
                    for media in medias:
                        if not isinstance(media, dict):
                            continue
                        video_url = url_or_none(media.get('videoUrl'))
                        if not video_url:
                            continue
                        height = int_or_none(media.get('quality'))
                        add_format(video_url, height)
                continue
            add_format(video_url)

        model_profile = self._search_json(
            r'var\s+MODEL_PROFILE\s*=', webpage, 'model profile', video_id, fatal=False)
        video_uploader = self._html_search_regex(
            r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
            webpage, 'uploader', default=None) or model_profile.get('username')

        def extract_vote_count(kind, name):
            return self._extract_count(
                (rf'<span[^>]+\bclass="votes{kind}"[^>]*>([\d,\.]+)</span>',
                 rf'<span[^>]+\bclass=["\']votes{kind}["\'][^>]*\bdata-rating=["\'](\d+)'),
                webpage, name)

        view_count = self._extract_count(
            r'<span class="count">([\d,\.]+)</span> [Vv]iews', webpage, 'view')
        like_count = extract_vote_count('Up', 'like')
        dislike_count = extract_vote_count('Down', 'dislike')
        comment_count = self._extract_count(
            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')

        def extract_list(meta_key):
            # Returns None (implicitly) when the wrapper div is absent.
            div = self._search_regex(
                rf'(?s)<div[^>]+\bclass=["\'].*?\b{meta_key}Wrapper[^>]*>(.+?)</div>',
                webpage, meta_key, default=None)
            if div:
                return [clean_html(x).strip() for x in re.findall(r'(?s)<a[^>]+\bhref=[^>]+>.+?</a>', div)]

        info = self._search_json_ld(webpage, video_id, default={})
        # description provided in JSON-LD is irrelevant
        info['description'] = None

        return merge_dicts({
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': remove_start(model_profile.get('modelProfileLink'), '/model/'),
            'upload_date': upload_date,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'comment_count': comment_count,
            'formats': formats,
            'age_limit': 18,
            'tags': extract_list('tags'),
            'categories': extract_list('categories'),
            'cast': extract_list('pornstars'),
            'subtitles': subtitles,
        }, info)
e66e1a00
S
513
514
class PornHubPlaylistBaseIE(PornHubBaseIE):
    """Helpers shared by the listing/playlist extractors."""

    def _extract_page(self, url):
        """Return the ``page=<n>`` number found in *url* as an int, else None."""
        page = self._search_regex(r'\bpage=(\d+)', url, 'page', default=None)
        return int_or_none(page)

    def _extract_entries(self, webpage, host):
        """Return url_result entries for the video links found in *webpage*."""
        # Only process container div with main playlist content skipping
        # drop-down menu that uses similar pattern for videos (see
        # https://github.com/ytdl-org/youtube-dl/issues/11594).
        container = self._search_regex(
            r'(?s)(<div[^>]+class=["\']container.+)', webpage,
            'container', default=webpage)

        links = orderedSet(re.findall(
            r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
            container))

        entries = []
        for path, title in links:
            entries.append(self.url_result(
                f'http://www.{host}/{path}',
                PornHubIE.ie_key(), video_title=title))
        return entries
e66e1a00 536
40e146aa 537
class PornHubUserIE(PornHubPlaylistBaseIE):
    """Redirect a user/channel/model/pornstar page to its ``/videos`` listing."""

    _VALID_URL = rf'(?P<url>https?://(?:[^/]+\.)?{PornHubBaseIE._PORNHUB_HOST_RE}/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)'
    _TESTS = [{
        'url': 'https://www.pornhub.com/model/zoe_ph',
        'playlist_mincount': 118,
    }, {
        'url': 'https://www.pornhub.com/pornstar/liz-vicious',
        'info_dict': {
            'id': 'liz-vicious',
        },
        'playlist_mincount': 118,
    }, {
        'url': 'https://www.pornhub.com/users/russianveet69',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/channels/povd',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/model/zoe_ph?abc=1',
        'only_matching': True,
    }, {
        # Unavailable via /videos page, but available with direct pagination
        # on pornstar page (see [1]), requires premium
        # 1. https://github.com/ytdl-org/youtube-dl/issues/27853
        'url': 'https://www.pornhubpremium.com/pornstar/sienna-west',
        'only_matching': True,
    }, {
        # Same as before, multi page
        'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau',
        'only_matching': True,
    }, {
        'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a url_result pointing at the profile's ``/videos`` page.

        A ``page=<n>`` parameter present in *url* is carried over.
        """
        mobj = self._match_valid_url(url)
        user_id = mobj.group('id')
        videos_url = f'{mobj.group("url")}/videos'
        # The 'host' group is None for .onion URLs; use the same fallback as
        # PornHubIE so the cookie helper always receives a string.
        self._set_age_cookies(mobj.group('host') or 'pornhub.com')
        page = self._extract_page(url)
        if page:
            videos_url = update_url_query(videos_url, {'page': page})
        return self.url_result(
            videos_url, ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id)
21b08463
S
583
584
class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
    """Base for listings that are paginated through a ``page`` query parameter."""

    @staticmethod
    def _has_more(webpage):
        """Return True when *webpage* contains a next-page / load-more marker."""
        return re.search(
            r'''(?x)
                <li[^>]+\bclass=["\']page_next|
                <link[^>]+\brel=["\']next|
                <button[^>]+\bid=["\']moreDataBtn
            ''', webpage) is not None

    def _entries(self, url, host, item_id):
        """Yield entries page by page.

        When *url* names an explicit page only that page is fetched;
        otherwise pages are walked from 1 until one is empty, has no
        next-page marker, or a later page answers 404.
        """
        page = self._extract_page(url)

        VIDEOS = '/videos'

        def download_page(base_url, num, fallback=False):
            suffix = ' (switch to fallback)' if fallback else ''
            note = f'Downloading page {num}{suffix}'
            return self._download_webpage(
                base_url, item_id, note, query={'page': num})

        def is_404(e):
            return isinstance(e.cause, HTTPError) and e.cause.status == 404

        base_url = url
        has_page = page is not None
        first_page = page if has_page else 1
        for page_num in (first_page, ) if has_page else itertools.count(first_page):
            try:
                try:
                    webpage = download_page(base_url, page_num)
                except ExtractorError as e:
                    # Some sources may not be available via /videos page,
                    # trying to fallback to main page pagination (see [1])
                    # 1. https://github.com/ytdl-org/youtube-dl/issues/27853
                    if is_404(e) and page_num == first_page and VIDEOS in base_url:
                        base_url = base_url.replace(VIDEOS, '')
                        webpage = download_page(base_url, page_num, fallback=True)
                    else:
                        raise
            except ExtractorError as e:
                # A 404 past the first page just means the listing ended.
                if is_404(e) and page_num != first_page:
                    break
                raise
            page_entries = self._extract_entries(webpage, host)
            if not page_entries:
                break
            yield from page_entries
            if not self._has_more(webpage):
                break

    def _real_extract(self, url):
        """Log in, set the age cookies and return the paginated playlist."""
        mobj = self._match_valid_url(url)
        # The 'host' group is None for .onion URLs, which would break
        # _login (host.split) and the entry URLs built from it; use the same
        # fallback as PornHubIE.
        host = mobj.group('host') or 'pornhub.com'
        item_id = mobj.group('id')

        self._login(host)
        self._set_age_cookies(host)

        return self.playlist_result(self._entries(url, host, item_id), item_id)
21b08463
S
645
646
class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
    """Catch-all extractor for paginated video listings (anything but /playlist/)."""

    _VALID_URL = rf'https?://(?:[^/]+\.)?{PornHubBaseIE._PORNHUB_HOST_RE}/(?!playlist/)(?P<id>(?:[^/]+/)*[^/?#&]+)'
    _TESTS = [{
        'url': 'https://www.pornhub.com/model/zoe_ph/videos',
        'only_matching': True,
    }, {
        'url': 'http://www.pornhub.com/users/rushandlia/videos',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos',
        'info_dict': {
            'id': 'pornstar/jenny-blighe/videos',
        },
        'playlist_mincount': 149,
    }, {
        'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos?page=3',
        'info_dict': {
            'id': 'pornstar/jenny-blighe/videos',
        },
        'playlist_mincount': 40,
    }, {
        # default sorting as Top Rated Videos
        'url': 'https://www.pornhub.com/channels/povd/videos',
        'info_dict': {
            'id': 'channels/povd/videos',
        },
        'playlist_mincount': 293,
    }, {
        # Top Rated Videos
        'url': 'https://www.pornhub.com/channels/povd/videos?o=ra',
        'only_matching': True,
    }, {
        # Most Recent Videos
        'url': 'https://www.pornhub.com/channels/povd/videos?o=da',
        'only_matching': True,
    }, {
        # Most Viewed Videos
        'url': 'https://www.pornhub.com/channels/povd/videos?o=vi',
        'only_matching': True,
    }, {
        'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
        'only_matching': True,
    }, {
        # Most Viewed Videos
        'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=mv',
        'only_matching': True,
    }, {
        # Top Rated Videos
        'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=tr',
        'only_matching': True,
    }, {
        # Longest Videos
        'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=lg',
        'only_matching': True,
    }, {
        # Newest Videos
        'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=cm',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/paid',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/fanonly',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/video',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/video?page=3',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/video/search?search=123',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/categories/teen',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/categories/teen?page=3',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/hd',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/hd?page=3',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/described-video',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/described-video?page=2',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn',
        'only_matching': True,
    }, {
        'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Match *url* only if none of the more specific extractors claims it."""
        if PornHubIE.suitable(url):
            return False
        if PornHubUserIE.suitable(url):
            return False
        if PornHubUserVideosUploadIE.suitable(url):
            return False
        return super().suitable(url)
21b08463 751
34541395 752
class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
    """Paginated ``/videos/upload`` listing of a user/channel/model/pornstar."""

    _VALID_URL = rf'(?P<url>https?://(?:[^/]+\.)?{PornHubBaseIE._PORNHUB_HOST_RE}/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)'
    _TESTS = [
        {
            'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
            'info_dict': {
                'id': 'jenny-blighe',
            },
            'playlist_mincount': 129,
        },
        {
            'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload',
            'only_matching': True,
        },
        {
            'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/pornstar/jenny-blighe/videos/upload',
            'only_matching': True,
        },
    ]
df2a5633 768
769
class PornHubPlaylistIE(PornHubPlaylistBaseIE):
    """Extractor for ``/playlist/<id>`` pages, loaded chunk by chunk."""

    _VALID_URL = rf'(?P<url>https?://(?:[^/]+\.)?{PornHubBaseIE._PORNHUB_HOST_RE}/playlist/(?P<id>[^/?#&]+))'
    _TESTS = [{
        'url': 'https://www.pornhub.com/playlist/44121572',
        'info_dict': {
            'id': '44121572',
        },
        'playlist_count': 77,
    }, {
        'url': 'https://www.pornhub.com/playlist/4667351',
        'only_matching': True,
    }, {
        'url': 'https://de.pornhub.com/playlist/4667351',
        'only_matching': True,
    }, {
        'url': 'https://de.pornhub.com/playlist/4667351?page=2',
        'only_matching': True,
    }]

    def _entries(self, url, host, item_id):
        """Yield the playlist entries.

        Page 1 comes from the playlist page itself; later pages are fetched
        from the ``/playlist/viewChunked`` endpoint using the playlist id and
        token read from page 1.
        """
        webpage = self._download_webpage(url, item_id, 'Downloading page 1')
        playlist_id = self._search_regex(r'var\s+playlistId\s*=\s*"([^"]+)"', webpage, 'playlist_id')
        video_count = int_or_none(
            self._search_regex(r'var\s+itemsCount\s*=\s*([0-9]+)\s*\|\|', webpage, 'video_count'))
        token = self._search_regex(r'var\s+token\s*=\s*"([^"]+)"', webpage, 'token')
        # The formula assumes 36 items on the first page and 40 on each
        # following chunk — presumably the site's page sizes; verify if the
        # layout changes.
        page_count = math.ceil((video_count - 36) / 40.) + 1

        def download_page(page_num):
            note = f'Downloading page {page_num}'
            page_url = f'https://www.{host}/playlist/viewChunked'
            return self._download_webpage(page_url, item_id, note, query={
                'id': playlist_id,
                'page': page_num,
                'token': token,
            })

        for page_num in range(1, page_count + 1):
            # Page 1 is the already-downloaded playlist page.
            if page_num > 1:
                webpage = download_page(page_num)
            page_entries = self._extract_entries(webpage, host)
            if not page_entries:
                break
            yield from page_entries

    def _real_extract(self, url):
        """Log in, set the age cookies and return the playlist result."""
        mobj = self._match_valid_url(url)
        # The 'host' group is None for .onion URLs, which would break
        # _login (host.split) and the URLs built from it; use the same
        # fallback as PornHubIE.
        host = mobj.group('host') or 'pornhub.com'
        item_id = mobj.group('id')

        self._login(host)
        self._set_age_cookies(host)

        return self.playlist_result(self._entries(mobj.group('url'), host, item_id), item_id)