X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/7a5c1cfe93924351387b44919b3c0b2f66c4b883..61edf57f8f13f6dfd81154174e647eb5fdd26089:/yt_dlp/extractor/pornhub.py diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py index b7631e4e1..679dc6323 100644 --- a/yt_dlp/extractor/pornhub.py +++ b/yt_dlp/extractor/pornhub.py @@ -1,35 +1,34 @@ -# coding: utf-8 -from __future__ import unicode_literals - import functools import itertools +import math import operator import re from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, - compat_urllib_request, -) from .openload import PhantomJSwrapper +from ..networking import Request +from ..networking.exceptions import HTTPError from ..utils import ( - determine_ext, + NO_DEFAULT, ExtractorError, + clean_html, + determine_ext, + format_field, int_or_none, merge_dicts, - NO_DEFAULT, orderedSet, remove_quotes, + remove_start, str_to_int, update_url_query, - urlencode_postdata, url_or_none, + urlencode_postdata, ) class PornHubBaseIE(InfoExtractor): _NETRC_MACHINE = 'pornhub' + _PORNHUB_HOST_RE = r'(?:(?Ppornhub(?:premium)?\.(?:com|net|org))|pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd\.onion)' def _download_webpage_handle(self, *args, **kwargs): def dl(*args, **kwargs): @@ -47,8 +46,8 @@ def dl(*args, **kwargs): r'document\.cookie\s*=\s*["\']RNKEY=', r'document\.location\.reload\(true\)')): url_or_request = args[0] - url = (url_or_request.get_full_url() - if isinstance(url_or_request, compat_urllib_request.Request) + url = (url_or_request.url + if isinstance(url_or_request, Request) else url_or_request) phantom = PhantomJSwrapper(self, required_version='2.0') phantom.get(url, html=webpage) @@ -59,6 +58,12 @@ def dl(*args, **kwargs): def _real_initialize(self): self._logged_in = False + def _set_age_cookies(self, host): + self._set_cookie(host, 'age_verified', '1') + self._set_cookie(host, 'accessAgeDisclaimerPH', '1') + self._set_cookie(host, 'accessAgeDisclaimerUK', '1') + self._set_cookie(host, 'accessPH', '1') + def _login(self, host): if self._logged_in: return @@ -75,14 +80,14 @@ def _login(self, host): if username is None: return - login_url = 'https://www.%s/%slogin' % (host, 'premium/' if 'premium' in host else '') + login_url = 'https://www.{}/{}login'.format(host, 'premium/' if 'premium' in host else '') login_page = self._download_webpage( - login_url, None, 'Downloading %s login page' % site) + login_url, None, f'Downloading {site} login page') def is_logged(webpage): return any(re.search(p, webpage) for p in ( - r'class=["\']signOut', - r'>Sign\s+[Oo]ut\s*<')) + r'id="profileMenuDropdown"', + r'class="ph-icon-logout"')) if is_logged(login_page): self._logged_in = True @@ -91,13 +96,13 @@ def is_logged(webpage): login_form = self._hidden_inputs(login_page) login_form.update({ - 'username': username, + 'email': username, 'password': password, }) response = self._download_json( - 'https://www.%s/front/authenticate' % host, None, - 'Logging in to %s' % site, + f'https://www.{host}/front/authenticate', None, + f'Logging in to {site}', data=urlencode_postdata(login_form), headers={ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', @@ -112,21 +117,24 @@ def is_logged(webpage): message = response.get('message') if message is not None: raise ExtractorError( - 'Unable to login: %s' % message, expected=True) + f'Unable to login: {message}', expected=True) raise ExtractorError('Unable to log in') class PornHubIE(PornHubBaseIE): IE_DESC = 'PornHub and Thumbzilla' - _VALID_URL = r'''(?x) + _VALID_URL = rf'''(?x) https?:// (?: - (?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:[^/]+\.)? + {PornHubBaseIE._PORNHUB_HOST_RE} + /(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P[\da-z]+) ''' + _EMBED_REGEX = [r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)'] _TESTS = [{ 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', 'md5': 'a6391306d050e4547f62b3f485dd9ba9', @@ -145,6 +153,7 @@ class PornHubIE(PornHubBaseIE): 'age_limit': 18, 'tags': list, 'categories': list, + 'cast': list, }, }, { # non-ASCII title @@ -167,6 +176,7 @@ class PornHubIE(PornHubBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'Video has been flagged for verification in accordance with our trust and safety policy', }, { # subtitles 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7', @@ -185,14 +195,24 @@ class PornHubIE(PornHubBaseIE): 'categories': list, 'subtitles': { 'en': [{ - "ext": 'srt' - }] + 'ext': 'srt', + }], }, }, 'params': { 'skip_download': True, }, 'skip': 'This video has been disabled', + }, { + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph601dc30bae19a', + 'info_dict': { + 'id': 'ph601dc30bae19a', + 'uploader': 'Projekt Melody', + 'uploader_id': 'projekt-melody', + 'upload_date': '20210205', + 'title': '"Welcome to My Pussy Mansion" - CB Stream (02/03/21)', + 'thumbnail': r're:https?://.+', + }, }, { 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 'only_matching': True, @@ -235,44 +255,49 @@ class PornHubIE(PornHubBaseIE): }, { 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3', 'only_matching': True, + }, { + # geo restricted + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156', + 'only_matching': True, + }, { + 'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/view_video.php?viewkey=ph5a9813bfa7156', + 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)', - webpage) - def _extract_count(self, pattern, webpage, name): - return str_to_int(self._search_regex( - pattern, webpage, '%s count' % name, fatal=False)) + return str_to_int(self._search_regex(pattern, webpage, f'{name} count', default=None)) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + mobj = self._match_valid_url(url) host = mobj.group('host') or 'pornhub.com' video_id = mobj.group('id') self._login(host) - - self._set_cookie(host, 'age_verified', '1') + self._set_age_cookies(host) def dl_webpage(platform): self._set_cookie(host, 'platform', platform) return self._download_webpage( - 'https://www.%s/view_video.php?viewkey=%s' % (host, video_id), - video_id, 'Downloading %s webpage' % platform) + f'https://www.{host}/view_video.php?viewkey={video_id}', + video_id, f'Downloading {platform} webpage') webpage = dl_webpage('pc') error_msg = self._html_search_regex( - r'(?s)]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P.+?)', + (r'(?s)]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P.+?)', + r'(?s)]+class=["\']noVideo["\'][^>]*>(?P.+?)'), webpage, 'error message', default=None, group='error') if error_msg: error_msg = re.sub(r'\s+', ' ', error_msg) raise ExtractorError( - 'PornHub said: %s' % error_msg, + f'PornHub said: {error_msg}', expected=True, video_id=video_id) + if any(re.search(p, webpage) for p in ( + r'class=["\']geoBlocked["\']', + r'>\s*This content is unavailable in your country')): + self.raise_geo_restricted() + # video_title from flashvars contains whitespace instead of non-ASCII (see # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying # on that anymore. @@ -306,7 +331,7 @@ def dl_webpage(platform): if not isinstance(definition, dict): continue video_url = definition.get('videoUrl') - if not video_url or not isinstance(video_url, compat_str): + if not video_url or not isinstance(video_url, str): continue if video_url in video_urls_set: continue @@ -366,7 +391,7 @@ def parse_quality_items(quality_items): if not video_urls: FORMAT_PREFIXES = ('media', 'quality', 'qualityItems') js_vars = extract_js_vars( - webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES), + webpage, r'(var\s+(?:{})_.+)'.format('|'.join(FORMAT_PREFIXES)), default=None) if js_vars: for key, format_url in js_vars.items(): @@ -377,7 +402,7 @@ def parse_quality_items(quality_items): if not video_urls and re.search( r'<[^>]+\bid=["\']lockedPlayer', webpage): raise ExtractorError( - 'Video %s is locked' % video_id, expected=True) + f'Video {video_id} is locked', expected=True) if not video_urls: js_vars = extract_js_vars( @@ -394,44 +419,58 @@ def parse_quality_items(quality_items): upload_date = None formats = [] - for video_url, height in video_urls: - if not upload_date: - upload_date = self._search_regex( - r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None) - if upload_date: - upload_date = upload_date.replace('/', '') - ext = determine_ext(video_url) + + def add_format(format_url, height=None): + ext = determine_ext(format_url) if ext == 'mpd': formats.extend(self._extract_mpd_formats( - video_url, video_id, mpd_id='dash', fatal=False)) - continue - elif ext == 'm3u8': + format_url, video_id, mpd_id='dash', fatal=False)) + return + if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', + format_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - continue - tbr = None - mobj = re.search(r'(?P\d+)[pP]?_(?P\d+)[kK]', video_url) - if mobj: - if not height: - height = int(mobj.group('height')) - tbr = int(mobj.group('tbr')) + return + if not height: + height = int_or_none(self._search_regex( + r'(?P\d+)[pP]?_\d+[kK]', format_url, 'height', + default=None)) formats.append({ - 'url': video_url, - 'format_id': '%dp' % height if height else None, + 'url': format_url, + 'format_id': format_field(height, None, '%dp'), 'height': height, - 'tbr': tbr, }) - self._sort_formats(formats) + for video_url, height in video_urls: + if not upload_date: + upload_date = self._search_regex( + r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None) + if upload_date: + upload_date = upload_date.replace('/', '') + if '/video/get_media' in video_url: + medias = self._download_json(video_url, video_id, fatal=False) + if isinstance(medias, list): + for media in medias: + if not isinstance(media, dict): + continue + video_url = url_or_none(media.get('videoUrl')) + if not video_url: + continue + height = int_or_none(media.get('quality')) + add_format(video_url, height) + continue + add_format(video_url) + + model_profile = self._search_json( + r'var\s+MODEL_PROFILE\s*=', webpage, 'model profile', video_id, fatal=False) video_uploader = self._html_search_regex( r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', - webpage, 'uploader', default=None) + webpage, 'uploader', default=None) or model_profile.get('username') def extract_vote_count(kind, name): return self._extract_count( - (r']+\bclass="votes%s"[^>]*>([\d,\.]+)' % kind, - r']+\bclass=["\']votes%s["\'][^>]*\bdata-rating=["\'](\d+)' % kind), + (rf']+\bclass="votes{kind}"[^>]*>([\d,\.]+)', + rf']+\bclass=["\']votes{kind}["\'][^>]*\bdata-rating=["\'](\d+)'), webpage, name) view_count = self._extract_count( @@ -443,10 +482,10 @@ def extract_vote_count(kind, name): def extract_list(meta_key): div = self._search_regex( - r'(?s)]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)' - % meta_key, webpage, meta_key, default=None) + rf'(?s)]+\bclass=["\'].*?\b{meta_key}Wrapper[^>]*>(.+?)', + webpage, meta_key, default=None) if div: - return re.findall(r']+\bhref=[^>]+>([^<]+)', div) + return [clean_html(x).strip() for x in re.findall(r'(?s)]+\bhref=[^>]+>.+?', div)] info = self._search_json_ld(webpage, video_id, default={}) # description provided in JSON-LD is irrelevant @@ -455,6 +494,7 @@ def extract_list(meta_key): return merge_dicts({ 'id': video_id, 'uploader': video_uploader, + 'uploader_id': remove_start(model_profile.get('modelProfileLink'), '/model/'), 'upload_date': upload_date, 'title': title, 'thumbnail': thumbnail, @@ -467,6 +507,7 @@ def extract_list(meta_key): 'age_limit': 18, 'tags': extract_list('tags'), 'categories': extract_list('categories'), + 'cast': extract_list('pornstars'), 'subtitles': subtitles, }, info) @@ -486,7 +527,7 @@ def _extract_entries(self, webpage, host): return [ self.url_result( - 'http://www.%s/%s' % (host, video_url), + f'http://www.{host}/{video_url}', PornHubIE.ie_key(), video_title=title) for video_url, title in orderedSet(re.findall( r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"', @@ -495,7 +536,7 @@ def _extract_entries(self, webpage, host): class PornHubUserIE(PornHubPlaylistBaseIE): - _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P[^/?#&]+))(?:[?#&]|/(?!videos)|$)' + _VALID_URL = rf'(?Phttps?://(?:[^/]+\.)?{PornHubBaseIE._PORNHUB_HOST_RE}/(?:(?:user|channel)s|model|pornstar)/(?P[^/?#&]+))(?:[?#&]|/(?!videos)|$)' _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph', 'playlist_mincount': 118, @@ -524,12 +565,16 @@ class PornHubUserIE(PornHubPlaylistBaseIE): # Same as before, multi page 'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau', 'only_matching': True, + }, { + 'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + mobj = self._match_valid_url(url) user_id = mobj.group('id') - videos_url = '%s/videos' % mobj.group('url') + videos_url = '{}/videos'.format(mobj.group('url')) + self._set_age_cookies(mobj.group('host')) page = self._extract_page(url) if page: videos_url = update_url_query(videos_url, {'page': page}) @@ -553,12 +598,12 @@ def _entries(self, url, host, item_id): VIDEOS = '/videos' def download_page(base_url, num, fallback=False): - note = 'Downloading page %d%s' % (num, ' (switch to fallback)' if fallback else '') + note = 'Downloading page {}{}'.format(num, ' (switch to fallback)' if fallback else '') return self._download_webpage( base_url, item_id, note, query={'page': num}) def is_404(e): - return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404 + return isinstance(e.cause, HTTPError) and e.cause.status == 404 base_url = url has_page = page is not None @@ -589,17 +634,18 @@ def is_404(e): break def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + mobj = self._match_valid_url(url) host = mobj.group('host') item_id = mobj.group('id') self._login(host) + self._set_age_cookies(host) return self.playlist_result(self._entries(url, host, item_id), item_id) class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?P(?:[^/]+/)*[^/?#&]+)' + _VALID_URL = rf'https?://(?:[^/]+\.)?{PornHubBaseIE._PORNHUB_HOST_RE}/(?!playlist/)(?P(?:[^/]+/)*[^/?#&]+)' _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph/videos', 'only_matching': True, @@ -693,16 +739,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): 'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn', 'only_matching': True, }, { - 'url': 'https://www.pornhub.com/playlist/44121572', - 'info_dict': { - 'id': 'playlist/44121572', - }, - 'playlist_mincount': 132, - }, { - 'url': 'https://www.pornhub.com/playlist/4667351', - 'only_matching': True, - }, { - 'url': 'https://de.pornhub.com/playlist/4667351', + 'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph/videos', 'only_matching': True, }] @@ -710,11 +747,11 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): def suitable(cls, url): return (False if PornHubIE.suitable(url) or PornHubUserIE.suitable(url) or PornHubUserVideosUploadIE.suitable(url) - else super(PornHubPagedVideoListIE, cls).suitable(url)) + else super().suitable(url)) class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P[^/]+)/videos/upload)' + _VALID_URL = rf'(?Phttps?://(?:[^/]+\.)?{PornHubBaseIE._PORNHUB_HOST_RE}/(?:(?:user|channel)s|model|pornstar)/(?P[^/]+)/videos/upload)' _TESTS = [{ 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', 'info_dict': { @@ -724,4 +761,63 @@ class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): }, { 'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload', 'only_matching': True, + }, { + 'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/pornstar/jenny-blighe/videos/upload', + 'only_matching': True, }] + + +class PornHubPlaylistIE(PornHubPlaylistBaseIE): + _VALID_URL = rf'(?Phttps?://(?:[^/]+\.)?{PornHubBaseIE._PORNHUB_HOST_RE}/playlist/(?P[^/?#&]+))' + _TESTS = [{ + 'url': 'https://www.pornhub.com/playlist/44121572', + 'info_dict': { + 'id': '44121572', + }, + 'playlist_count': 77, + }, { + 'url': 'https://www.pornhub.com/playlist/4667351', + 'only_matching': True, + }, { + 'url': 'https://de.pornhub.com/playlist/4667351', + 'only_matching': True, + }, { + 'url': 'https://de.pornhub.com/playlist/4667351?page=2', + 'only_matching': True, + }] + + def _entries(self, url, host, item_id): + webpage = self._download_webpage(url, item_id, 'Downloading page 1') + playlist_id = self._search_regex(r'var\s+playlistId\s*=\s*"([^"]+)"', webpage, 'playlist_id') + video_count = int_or_none( + self._search_regex(r'var\s+itemsCount\s*=\s*([0-9]+)\s*\|\|', webpage, 'video_count')) + token = self._search_regex(r'var\s+token\s*=\s*"([^"]+)"', webpage, 'token') + page_count = math.ceil((video_count - 36) / 40.) + 1 + page_entries = self._extract_entries(webpage, host) + + def download_page(page_num): + note = f'Downloading page {page_num}' + page_url = f'https://www.{host}/playlist/viewChunked' + return self._download_webpage(page_url, item_id, note, query={ + 'id': playlist_id, + 'page': page_num, + 'token': token, + }) + + for page_num in range(1, page_count + 1): + if page_num > 1: + webpage = download_page(page_num) + page_entries = self._extract_entries(webpage, host) + if not page_entries: + break + yield from page_entries + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + host = mobj.group('host') + item_id = mobj.group('id') + + self._login(host) + self._set_age_cookies(host) + + return self.playlist_result(self._entries(mobj.group('url'), host, item_id), item_id)