[pornhub] Extract `cast`

author pukkandan <redacted>

Sun, 13 Jun 2021 16:06:47 +0000 (21:36 +0530)

committer pukkandan <redacted>

Sun, 13 Jun 2021 16:08:08 +0000 (21:38 +0530)
author pukkandan <redacted>
Sun, 13 Jun 2021 16:06:47 +0000 (21:36 +0530)
committer pukkandan <redacted>
Sun, 13 Jun 2021 16:08:08 +0000 (21:38 +0530)
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py

index 1524fcb15692685934175a7c5e6385faab1b58fc..b14cf0fc9ba75981dd8b9475044255dc41e14751 100644 (file)
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -290,6 +290,7 @@ class InfoExtractor(object):
      categories:     A list of categories that the video falls in, for example
                      ["Sports", "Berlin"]
      tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
+    cast:           A list of the video cast
      is_live:        True, False, or None (=unknown). Whether this video is a
                      live stream that goes on instead of a fixed-length video.
      was_live:       True, False, or None (=unknown). Whether this video was
diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py

index 031454600266de40f0ef8343ba4ce5b9aecc8c4f..cf407a81323a2b775c906cf75e09e01fbef3f7fc 100644 (file)
--- a/yt_dlp/extractor/pornhub.py
+++ b/yt_dlp/extractor/pornhub.py
@@ -14,6 +14,7 @@
  )
  from .openload import PhantomJSwrapper
  from ..utils import (
+    clean_html,
      determine_ext,
      ExtractorError,
      int_or_none,
@@ -145,6 +146,7 @@ class PornHubIE(PornHubBaseIE):
              'age_limit': 18,
              'tags': list,
              'categories': list,
+            'cast': list,
          },
      }, {
          # non-ASCII title
@@ -464,7 +466,7 @@ def extract_list(meta_key):
                  r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>'
                  % meta_key, webpage, meta_key, default=None)
              if div:
-                return re.findall(r'<a[^>]+\bhref=[^>]+>([^<]+)', div)
+                return [clean_html(x).strip() for x in re.findall(r'(?s)<a[^>]+\bhref=[^>]+>.+?</a>', div)]
  
          info = self._search_json_ld(webpage, video_id, default={})
          # description provided in JSON-LD is irrelevant
@@ -485,6 +487,7 @@ def extract_list(meta_key):
              'age_limit': 18,
              'tags': extract_list('tags'),
              'categories': extract_list('categories'),
+            'cast': extract_list('pornstars'),
              'subtitles': subtitles,
          }, info)
author	pukkandan <redacted>
	Sun, 13 Jun 2021 16:06:47 +0000 (21:36 +0530)
committer	pukkandan <redacted>
	Sun, 13 Jun 2021 16:08:08 +0000 (21:38 +0530)
yt_dlp/extractor/common.py		patch | blob | blame | history
yt_dlp/extractor/pornhub.py		patch | blob | blame | history