[extractor/Netverse] Improve playlist extractor (#3854)

[yt-dlp.git] / yt_dlp / extractor / pornhub.py
diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py

index c2b20ecfdde8d34c36e2ba1605ca51dcc44a001f..35468b4fc801f7577fae6565b2663bd471365f1a 100644 (file)
--- a/yt_dlp/extractor/pornhub.py
+++ b/yt_dlp/extractor/pornhub.py
@@ -1,38 +1,34 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
  import functools
  import itertools
  import math
  import operator
  import re
+import urllib.request
  
  from .common import InfoExtractor
-from ..compat import (
-    compat_HTTPError,
-    compat_str,
-    compat_urllib_request,
-)
  from .openload import PhantomJSwrapper
+from ..compat import compat_HTTPError, compat_str
  from ..utils import (
+    NO_DEFAULT,
+    ExtractorError,
      clean_html,
      determine_ext,
-    ExtractorError,
+    format_field,
      int_or_none,
      merge_dicts,
-    NO_DEFAULT,
      orderedSet,
      remove_quotes,
+    remove_start,
      str_to_int,
      update_url_query,
-    urlencode_postdata,
      url_or_none,
+    urlencode_postdata,
  )
  
  
  class PornHubBaseIE(InfoExtractor):
      _NETRC_MACHINE = 'pornhub'
-    _PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubthbh7ap3u\.onion)'
+    _PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd\.onion)'
  
      def _download_webpage_handle(self, *args, **kwargs):
          def dl(*args, **kwargs):
@@ -51,7 +47,7 @@ def dl(*args, **kwargs):
                  r'document\.location\.reload\(true\)')):
              url_or_request = args[0]
              url = (url_or_request.get_full_url()
-                   if isinstance(url_or_request, compat_urllib_request.Request)
+                   if isinstance(url_or_request, urllib.request.Request)
                     else url_or_request)
              phantom = PhantomJSwrapper(self, required_version='2.0')
              phantom.get(url, html=webpage)
@@ -200,6 +196,16 @@ class PornHubIE(PornHubBaseIE):
              'skip_download': True,
          },
          'skip': 'This video has been disabled',
+    }, {
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph601dc30bae19a',
+        'info_dict': {
+            'id': 'ph601dc30bae19a',
+            'uploader': 'Projekt Melody',
+            'uploader_id': 'projekt-melody',
+            'upload_date': '20210205',
+            'title': '"Welcome to My Pussy Mansion" - CB Stream (02/03/21)',
+            'thumbnail': r're:https?://.+',
+        },
      }, {
          'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
          'only_matching': True,
@@ -247,7 +253,7 @@ class PornHubIE(PornHubBaseIE):
          'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156',
          'only_matching': True,
      }, {
-        'url': 'http://pornhubthbh7ap3u.onion/view_video.php?viewkey=ph5a9813bfa7156',
+        'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/view_video.php?viewkey=ph5a9813bfa7156',
          'only_matching': True,
      }]
  
@@ -258,11 +264,10 @@ def _extract_urls(webpage):
              webpage)
  
      def _extract_count(self, pattern, webpage, name):
-        return str_to_int(self._search_regex(
-            pattern, webpage, '%s count' % name, fatal=False))
+        return str_to_int(self._search_regex(pattern, webpage, '%s count' % name, default=None))
  
      def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
+        mobj = self._match_valid_url(url)
          host = mobj.group('host') or 'pornhub.com'
          video_id = mobj.group('id')
  
@@ -432,7 +437,7 @@ def add_format(format_url, height=None):
                      default=None))
              formats.append({
                  'url': format_url,
-                'format_id': '%dp' % height if height else None,
+                'format_id': format_field(height, None, '%dp'),
                  'height': height,
              })
  
@@ -460,9 +465,11 @@ def add_format(format_url, height=None):
          self._sort_formats(
              formats, field_preference=('height', 'width', 'fps', 'format_id'))
  
+        model_profile = self._search_json(
+            r'var\s+MODEL_PROFILE\s*=', webpage, 'model profile', video_id, fatal=False)
          video_uploader = self._html_search_regex(
              r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
-            webpage, 'uploader', default=None)
+            webpage, 'uploader', default=None) or model_profile.get('username')
  
          def extract_vote_count(kind, name):
              return self._extract_count(
@@ -491,6 +498,7 @@ def extract_list(meta_key):
          return merge_dicts({
              'id': video_id,
              'uploader': video_uploader,
+            'uploader_id': remove_start(model_profile.get('modelProfileLink'), '/model/'),
              'upload_date': upload_date,
              'title': title,
              'thumbnail': thumbnail,
@@ -562,12 +570,12 @@ class PornHubUserIE(PornHubPlaylistBaseIE):
          'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau',
          'only_matching': True,
      }, {
-        'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph',
+        'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph',
          'only_matching': True,
      }]
  
      def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
+        mobj = self._match_valid_url(url)
          user_id = mobj.group('id')
          videos_url = '%s/videos' % mobj.group('url')
          page = self._extract_page(url)
@@ -629,7 +637,7 @@ def is_404(e):
                  break
  
      def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
+        mobj = self._match_valid_url(url)
          host = mobj.group('host')
          item_id = mobj.group('id')
  
@@ -733,7 +741,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
          'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn',
          'only_matching': True,
      }, {
-        'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph/videos',
+        'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph/videos',
          'only_matching': True,
      }]
  
@@ -756,7 +764,7 @@ class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
          'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload',
          'only_matching': True,
      }, {
-        'url': 'http://pornhubthbh7ap3u.onion/pornstar/jenny-blighe/videos/upload',
+        'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/pornstar/jenny-blighe/videos/upload',
          'only_matching': True,
      }]
  
@@ -808,7 +816,7 @@ def download_page(page_num):
                  yield e
  
      def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
+        mobj = self._match_valid_url(url)
          host = mobj.group('host')
          item_id = mobj.group('id')