[cleanup, docs] Minor fixes

[yt-dlp.git] / yt_dlp / extractor / pornhub.py
diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py

index 2a7818e41bcc63ae1876f350e28c4e1c14732c88..d712e0036f21338320b648a2758e5d834f8215d0 100644 (file)
--- a/yt_dlp/extractor/pornhub.py
+++ b/yt_dlp/extractor/pornhub.py
@@ -3,6 +3,7 @@
  
  import functools
  import itertools
+import math
  import operator
  import re
  
@@ -14,6 +15,7 @@
  )
  from .openload import PhantomJSwrapper
  from ..utils import (
+    clean_html,
      determine_ext,
      ExtractorError,
      int_or_none,
@@ -30,6 +32,7 @@
  
  class PornHubBaseIE(InfoExtractor):
      _NETRC_MACHINE = 'pornhub'
+    _PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd\.onion)'
  
      def _download_webpage_handle(self, *args, **kwargs):
          def dl(*args, **kwargs):
@@ -122,11 +125,13 @@ class PornHubIE(PornHubBaseIE):
      _VALID_URL = r'''(?x)
                      https?://
                          (?:
-                            (?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
+                            (?:[^/]+\.)?
+                            %s
+                            /(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
                              (?:www\.)?thumbzilla\.com/video/
                          )
                          (?P<id>[\da-z]+)
-                    '''
+                    ''' % PornHubBaseIE._PORNHUB_HOST_RE
      _TESTS = [{
          'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
          'md5': 'a6391306d050e4547f62b3f485dd9ba9',
@@ -145,6 +150,7 @@ class PornHubIE(PornHubBaseIE):
              'age_limit': 18,
              'tags': list,
              'categories': list,
+            'cast': list,
          },
      }, {
          # non-ASCII title
@@ -236,6 +242,13 @@ class PornHubIE(PornHubBaseIE):
      }, {
          'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3',
          'only_matching': True,
+    }, {
+        # geo restricted
+        'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156',
+        'only_matching': True,
+    }, {
+        'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/view_video.php?viewkey=ph5a9813bfa7156',
+        'only_matching': True,
      }]
  
      @staticmethod
@@ -245,11 +258,10 @@ def _extract_urls(webpage):
              webpage)
  
      def _extract_count(self, pattern, webpage, name):
-        return str_to_int(self._search_regex(
-            pattern, webpage, '%s count' % name, fatal=False))
+        return str_to_int(self._search_regex(pattern, webpage, '%s count' % name, default=None))
  
      def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
+        mobj = self._match_valid_url(url)
          host = mobj.group('host') or 'pornhub.com'
          video_id = mobj.group('id')
  
@@ -275,6 +287,11 @@ def dl_webpage(platform):
                  'PornHub said: %s' % error_msg,
                  expected=True, video_id=video_id)
  
+        if any(re.search(p, webpage) for p in (
+                r'class=["\']geoBlocked["\']',
+                r'>\s*This content is unavailable in your country')):
+            self.raise_geo_restricted()
+
          # video_title from flashvars contains whitespace instead of non-ASCII (see
          # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
          # on that anymore.
@@ -398,17 +415,24 @@ def parse_quality_items(quality_items):
          formats = []
  
          def add_format(format_url, height=None):
-            tbr = None
-            mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', format_url)
-            if mobj:
-                if not height:
-                    height = int(mobj.group('height'))
-                tbr = int(mobj.group('tbr'))
+            ext = determine_ext(format_url)
+            if ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    format_url, video_id, mpd_id='dash', fatal=False))
+                return
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+                return
+            if not height:
+                height = int_or_none(self._search_regex(
+                    r'(?P<height>\d+)[pP]?_\d+[kK]', format_url, 'height',
+                    default=None))
              formats.append({
                  'url': format_url,
                  'format_id': '%dp' % height if height else None,
                  'height': height,
-                'tbr': tbr,
              })
  
          for video_url, height in video_urls:
@@ -417,16 +441,6 @@ def add_format(format_url, height=None):
                      r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None)
                  if upload_date:
                      upload_date = upload_date.replace('/', '')
-            ext = determine_ext(video_url)
-            if ext == 'mpd':
-                formats.extend(self._extract_mpd_formats(
-                    video_url, video_id, mpd_id='dash', fatal=False))
-                continue
-            elif ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    video_url, video_id, 'mp4', entry_protocol='m3u8_native',
-                    m3u8_id='hls', fatal=False))
-                continue
              if '/video/get_media' in video_url:
                  medias = self._download_json(video_url, video_id, fatal=False)
                  if isinstance(medias, list):
@@ -440,7 +454,10 @@ def add_format(format_url, height=None):
                          add_format(video_url, height)
                  continue
              add_format(video_url)
-        self._sort_formats(formats)
+
+        # field_preference is unnecessary here, but kept for code-similarity with youtube-dl
+        self._sort_formats(
+            formats, field_preference=('height', 'width', 'fps', 'format_id'))
  
          video_uploader = self._html_search_regex(
              r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
@@ -464,7 +481,7 @@ def extract_list(meta_key):
                  r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>'
                  % meta_key, webpage, meta_key, default=None)
              if div:
-                return re.findall(r'<a[^>]+\bhref=[^>]+>([^<]+)', div)
+                return [clean_html(x).strip() for x in re.findall(r'(?s)<a[^>]+\bhref=[^>]+>.+?</a>', div)]
  
          info = self._search_json_ld(webpage, video_id, default={})
          # description provided in JSON-LD is irrelevant
@@ -485,6 +502,7 @@ def extract_list(meta_key):
              'age_limit': 18,
              'tags': extract_list('tags'),
              'categories': extract_list('categories'),
+            'cast': extract_list('pornstars'),
              'subtitles': subtitles,
          }, info)
  
@@ -513,7 +531,7 @@ def _extract_entries(self, webpage, host):
  
  
  class PornHubUserIE(PornHubPlaylistBaseIE):
-    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)'
+    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' % PornHubBaseIE._PORNHUB_HOST_RE
      _TESTS = [{
          'url': 'https://www.pornhub.com/model/zoe_ph',
          'playlist_mincount': 118,
@@ -542,10 +560,13 @@ class PornHubUserIE(PornHubPlaylistBaseIE):
          # Same as before, multi page
          'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau',
          'only_matching': True,
+    }, {
+        'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph',
+        'only_matching': True,
      }]
  
      def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
+        mobj = self._match_valid_url(url)
          user_id = mobj.group('id')
          videos_url = '%s/videos' % mobj.group('url')
          page = self._extract_page(url)
@@ -607,7 +628,7 @@ def is_404(e):
                  break
  
      def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
+        mobj = self._match_valid_url(url)
          host = mobj.group('host')
          item_id = mobj.group('id')
  
@@ -617,7 +638,7 @@ def _real_extract(self, url):
  
  
  class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
-    _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?P<id>(?:[^/]+/)*[^/?#&]+)'
+    _VALID_URL = r'https?://(?:[^/]+\.)?%s/(?!playlist/)(?P<id>(?:[^/]+/)*[^/?#&]+)' % PornHubBaseIE._PORNHUB_HOST_RE
      _TESTS = [{
          'url': 'https://www.pornhub.com/model/zoe_ph/videos',
          'only_matching': True,
@@ -711,16 +732,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
          'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn',
          'only_matching': True,
      }, {
-        'url': 'https://www.pornhub.com/playlist/44121572',
-        'info_dict': {
-            'id': 'playlist/44121572',
-        },
-        'playlist_mincount': 132,
-    }, {
-        'url': 'https://www.pornhub.com/playlist/4667351',
-        'only_matching': True,
-    }, {
-        'url': 'https://de.pornhub.com/playlist/4667351',
+        'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph/videos',
          'only_matching': True,
      }]
  
@@ -732,7 +744,7 @@ def suitable(cls, url):
  
  
  class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
-    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)'
+    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' % PornHubBaseIE._PORNHUB_HOST_RE
      _TESTS = [{
          'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
          'info_dict': {
@@ -742,4 +754,63 @@ class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
      }, {
          'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload',
          'only_matching': True,
+    }, {
+        'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/pornstar/jenny-blighe/videos/upload',
+        'only_matching': True,
      }]
+
+
+class PornHubPlaylistIE(PornHubPlaylistBaseIE):
+    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/playlist/(?P<id>[^/?#&]+))' % PornHubBaseIE._PORNHUB_HOST_RE
+    _TESTS = [{
+        'url': 'https://www.pornhub.com/playlist/44121572',
+        'info_dict': {
+            'id': '44121572',
+        },
+        'playlist_count': 77,
+    }, {
+        'url': 'https://www.pornhub.com/playlist/4667351',
+        'only_matching': True,
+    }, {
+        'url': 'https://de.pornhub.com/playlist/4667351',
+        'only_matching': True,
+    }, {
+        'url': 'https://de.pornhub.com/playlist/4667351?page=2',
+        'only_matching': True,
+    }]
+
+    def _entries(self, url, host, item_id):
+        webpage = self._download_webpage(url, item_id, 'Downloading page 1')
+        playlist_id = self._search_regex(r'var\s+playlistId\s*=\s*"([^"]+)"', webpage, 'playlist_id')
+        video_count = int_or_none(
+            self._search_regex(r'var\s+itemsCount\s*=\s*([0-9]+)\s*\|\|', webpage, 'video_count'))
+        token = self._search_regex(r'var\s+token\s*=\s*"([^"]+)"', webpage, 'token')
+        page_count = math.ceil((video_count - 36) / 40.) + 1
+        page_entries = self._extract_entries(webpage, host)
+
+        def download_page(page_num):
+            note = 'Downloading page {}'.format(page_num)
+            page_url = 'https://www.{}/playlist/viewChunked'.format(host)
+            return self._download_webpage(page_url, item_id, note, query={
+                'id': playlist_id,
+                'page': page_num,
+                'token': token,
+            })
+
+        for page_num in range(1, page_count + 1):
+            if page_num > 1:
+                webpage = download_page(page_num)
+                page_entries = self._extract_entries(webpage, host)
+            if not page_entries:
+                break
+            for e in page_entries:
+                yield e
+
+    def _real_extract(self, url):
+        mobj = self._match_valid_url(url)
+        host = mobj.group('host')
+        item_id = mobj.group('id')
+
+        self._login(host)
+
+        return self.playlist_result(self._entries(mobj.group('url'), host, item_id), item_id)