jfr.im git - yt-dlp.git/commitdiff
[extractor] Framework for embed detection (#4307)
author pukkandan <redacted>
Mon, 1 Aug 2022 01:22:03 +0000 (06:52 +0530)
committer pukkandan <redacted>
Mon, 1 Aug 2022 19:38:16 +0000 (01:08 +0530)
devscripts/lazy_load_template.py
devscripts/make_lazy_extractors.py
yt_dlp/YoutubeDL.py
yt_dlp/extractor/brightcove.py
yt_dlp/extractor/common.py
yt_dlp/extractor/generic.py
yt_dlp/extractor/spotify.py
yt_dlp/utils.py

diff --git a/devscripts/lazy_load_template.py b/devscripts/lazy_load_template.py
index cdafaf1ef6799dedc2bac12f97582411da530d87..a6e26b6f63c33a1c044eb90bedb1cab872c5f1b1 100644
--- a/devscripts/lazy_load_template.py
+++ b/devscripts/lazy_load_template.py
@@ -9,11 +9,13 @@
     write_string,
 )
 
+# These bloat the lazy_extractors, so allow them to passthrough silently
+ALLOWED_CLASSMETHODS = {'get_testcases', 'extract_from_webpage'}
+
 
 class LazyLoadMetaClass(type):
     def __getattr__(cls, name):
-        # "_TESTS" bloat the lazy_extractors
-        if '_real_class' not in cls.__dict__ and name != 'get_testcases':
+        if '_real_class' not in cls.__dict__ and name not in ALLOWED_CLASSMETHODS:
             write_string(
                 'WARNING: Falling back to normal extractor since lazy extractor '
                 f'{cls.__name__} does not have attribute {name}{bug_reports_message()}\n')
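The allowlist matters because the embed-detection loop in generic.py (below) calls extract_from_webpage on every extractor class, lazy ones included; without the passthrough, each page scan would emit the fallback warning and force-import the real extractor module. A minimal, self-contained sketch of the pattern, with a hypothetical load_real_class() standing in for the generated loader:

ALLOWED_CLASSMETHODS = {'get_testcases', 'extract_from_webpage'}


class LazyLoadMeta(type):
    def __getattr__(cls, name):
        # only reached when normal attribute lookup fails on the lazy class
        if name not in ALLOWED_CLASSMETHODS:
            print(f'WARNING: lazy class {cls.__name__} has no attribute {name}')
        return getattr(cls.load_real_class(), name)


class RealFooIE:
    @classmethod
    def get_testcases(cls):
        return []


class LazyFooIE(metaclass=LazyLoadMeta):
    @staticmethod
    def load_real_class():  # hypothetical stand-in for the generated import hook
        return RealFooIE


LazyFooIE.get_testcases()  # resolves via the real class, with no warning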
diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py
index 60fcc5ef0208d3885d38e9b4b42f07957fb2588c..c9fdfb56235a532484d4938d6957333e53bd7975 100644
--- a/devscripts/make_lazy_extractors.py
+++ b/devscripts/make_lazy_extractors.py
@@ -11,7 +11,7 @@
 from inspect import getsource
 
 NO_ATTR = object()
-STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_WORKING', '_NETRC_MACHINE', 'age_limit']
+STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_VALID_URL', '_WORKING', '_NETRC_MACHINE', 'age_limit']
 CLASS_METHODS = [
     'ie_key', 'working', 'description', 'suitable', '_match_valid_url', '_match_id', 'get_temp_id', 'is_suitable'
 ]
@@ -116,11 +116,6 @@ def build_lazy_ie(ie, name, attr_base):
     }.get(base.__name__, base.__name__) for base in ie.__bases__)
 
     s = IE_TEMPLATE.format(name=name, module=ie.__module__, bases=bases)
-    valid_url = getattr(ie, '_VALID_URL', None)
-    if not valid_url and hasattr(ie, '_make_valid_url'):
-        valid_url = ie._make_valid_url()
-    if valid_url:
-        s += f'    _VALID_URL = {valid_url!r}\n'
     return s + '\n'.join(extra_ie_code(ie, attr_base))
 
 
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
index ce8ac2e89f7fab7feeefa0e0a021653180f1fd4d..f6f97b8ece9628e549769b8d2b613fef0267bbde 100644
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -1566,7 +1566,8 @@ def process_ie_result(self, ie_result, download=True, extra_info=None):
         result_type = ie_result.get('_type', 'video')
 
         if result_type in ('url', 'url_transparent'):
-            ie_result['url'] = sanitize_url(ie_result['url'])
+            ie_result['url'] = sanitize_url(
+                ie_result['url'], scheme='http' if self.params.get('prefer_insecure') else 'https')
             if ie_result.get('original_url'):
                 extra_info.setdefault('original_url', ie_result['original_url'])
 
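With --prefer-insecure, result URLs now default to 'http'; otherwise 'https'. The scheme parameter (added to sanitize_url in utils.py at the end of this diff) only touches protocol-relative URLs, so absolute URLs pass through unchanged. A short illustration with made-up URLs:

from yt_dlp.utils import sanitize_url

sanitize_url('//example.com/embed/1', scheme='https')       # 'https://example.com/embed/1'
sanitize_url('//example.com/embed/1')                       # 'http://example.com/embed/1' (default scheme)
sanitize_url('https://example.com/embed/1', scheme='http')  # unchanged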
diff --git a/yt_dlp/extractor/brightcove.py b/yt_dlp/extractor/brightcove.py
index a5412897d6b191f5283e5de78b52f4ce02c56df3..99a216fb49818dfd74010a3bf23551201bac1b13 100644
--- a/yt_dlp/extractor/brightcove.py
+++ b/yt_dlp/extractor/brightcove.py
@@ -402,11 +402,11 @@ class BrightcoveNewIE(AdobePassIE):
 
     @staticmethod
     def _extract_url(ie, webpage):
-        urls = BrightcoveNewIE._extract_urls(ie, webpage)
+        urls = BrightcoveNewIE._extract_brightcove_urls(ie, webpage)
         return urls[0] if urls else None
 
     @staticmethod
-    def _extract_urls(ie, webpage):
+    def _extract_brightcove_urls(ie, webpage):
         # Reference:
         # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe
         # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index d168763e0bcfa2165ee2ed188e763fa4252253ca..b8347fe4cf1767d7d7338afea8e242c02d499356 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -14,6 +14,7 @@
 import re
 import sys
 import time
+import types
 import urllib.parse
 import urllib.request
 import xml.etree.ElementTree
@@ -23,6 +24,7 @@
 from ..downloader import FileDownloader
 from ..downloader.f4m import get_base_url, remove_encrypted_media
 from ..utils import (
+    IDENTITY,
     JSON_LD_RE,
     NO_DEFAULT,
     ExtractorError,
@@ -59,6 +61,7 @@
     parse_m3u8_attributes,
     parse_resolution,
     sanitize_filename,
+    sanitize_url,
     sanitized_Request,
     str_or_none,
     str_to_int,
@@ -431,14 +434,26 @@ class InfoExtractor:
     title, description etc.
 
 
-    Subclasses of this should define a _VALID_URL regexp and, re-define the
-    _real_extract() and (optionally) _real_initialize() methods.
-    Probably, they should also be added to the list of extractors.
+    Subclasses of this should also be added to the list of extractors and
+    should define a _VALID_URL regexp and, re-define the _real_extract() and
+    (optionally) _real_initialize() methods.
 
     Subclasses may also override suitable() if necessary, but ensure the function
     signature is preserved and that this function imports everything it needs
     (except other extractors), so that lazy_extractors works correctly.
 
+    Subclasses can define a list of _EMBED_REGEX, which will be searched for in
+    the HTML of Generic webpages. It may also override _extract_embed_urls
+    or _extract_from_webpage as necessary. While these are normally classmethods,
+    _extract_from_webpage is allowed to be an instance method.
+
+    _extract_from_webpage may raise self.StopExtraction() to stop further
+    processing of the webpage and obtain exclusive rights to it. This is useful
+    when the extractor cannot reliably be matched using just the URL.
+    Eg: invidious/peertube instances
+
+    Embed-only extractors can be defined by setting _VALID_URL = False.
+
     To support username + password (or netrc) login, the extractor must define a
     _NETRC_MACHINE and re-define _perform_login(username, password) and
     (optionally) _initialize_pre_login() methods. The _perform_login method will
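A sketch of what the new docstring describes, for a made-up player.example.com site; the class shape follows the framework methods added further down (each _EMBED_REGEX entry needs exactly one (?P<url>...) group, and _VALID_URL = False makes the extractor reachable only through embeds):

from yt_dlp.extractor.common import InfoExtractor


class ExamplePlayerIE(InfoExtractor):  # hypothetical, for illustration only
    _VALID_URL = False  # embed-only: never matched against user-supplied URLs
    _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>https?://player\.example\.com/embed/[^"\']+)']

    def _real_extract(self, url):
        video_id = self._generic_id(url)
        return {'id': video_id, 'title': video_id, 'url': url}

The inherited _extract_embed_urls applies each regex to the page, and because _VALID_URL is False, every match is yielded without a suitable() check.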
@@ -476,6 +491,8 @@ class InfoExtractor:
     _NETRC_MACHINE = None
     IE_DESC = None
     SEARCH_KEY = None
+    _VALID_URL = None
+    _EMBED_REGEX = []
 
     def _login_hint(self, method=NO_DEFAULT, netrc=None):
         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
@@ -499,12 +516,12 @@ def __init__(self, downloader=None):
 
     @classmethod
     def _match_valid_url(cls, url):
+        if cls._VALID_URL is False:
+            return None
         # This does not use has/getattr intentionally - we want to know whether
         # we have cached the regexp for *this* class, whereas getattr would also
         # match the superclass
         if '_VALID_URL_RE' not in cls.__dict__:
-            if '_VALID_URL' not in cls.__dict__:
-                cls._VALID_URL = cls._make_valid_url()
             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
         return cls._VALID_URL_RE.match(url)
 
@@ -1143,10 +1160,12 @@ def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent
             'url': url,
         }
 
-    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
-        urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
-                for m in orderedSet(map(getter, matches) if getter else matches))
-        return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
+    @classmethod
+    def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
+                              getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
+        return cls.playlist_result(
+            (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
+            playlist_id, playlist_title, **kwargs)
 
     @staticmethod
     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
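Protocol-relative URLs are still handled, since process_ie_result passes every result URL through sanitize_url (see the YoutubeDL.py hunk above), which is presumably why the per-match _proto_relative_url wrapper could be dropped and the method become a classmethod. Typical call shapes, with hypothetical names:

urls = ['https://example.com/v/1', 'https://example.com/v/2']
self.playlist_from_matches(urls, playlist_id, title, ie='Foo')
# or, when `matches` are regex match objects:
self.playlist_from_matches(matches, playlist_id, title, getter=lambda m: m.group('url'))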
@@ -1353,12 +1372,20 @@ def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs
     def _dc_search_uploader(self, html):
         return self._html_search_meta('dc.creator', html, 'uploader')
 
-    def _rta_search(self, html):
+    @staticmethod
+    def _rta_search(html):
         # See http://www.rtalabel.org/index.php?content=howtofaq#single
         if re.search(r'(?ix)<meta\s+name="rating"\s+'
                      r'     content="RTA-5042-1996-1400-1577-RTA"',
                      html):
             return 18
+
+        # And then there are the jokers who advertise that they use RTA, but actually don't.
+        AGE_LIMIT_MARKERS = [
+            r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
+        ]
+        if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
+            return 18
         return 0
 
     def _media_rating_search(self, html):
@@ -1965,14 +1992,9 @@ def http_scheme(self):
             else 'https:')
 
     def _proto_relative_url(self, url, scheme=None):
-        if url is None:
-            return url
-        if url.startswith('//'):
-            if scheme is None:
-                scheme = self.http_scheme()
-            return scheme + url
-        else:
-            return url
+        scheme = scheme or self.http_scheme()
+        assert scheme.endswith(':')
+        return sanitize_url(url, scheme=scheme[:-1])
 
     def _sleep(self, timeout, video_id, msg_template=None):
         if msg_template is None:
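With the delegation, None handling and the '//' check live in one place. Expected behavior for sketch inputs, assuming the default http_scheme() of 'https:' (i.e. --prefer-insecure not set):

self._proto_relative_url('//cdn.example.com/v.mp4')  # 'https://cdn.example.com/v.mp4'
self._proto_relative_url(None)                       # None (sanitize_url passes it through)
self._proto_relative_url('http://a/b')               # unchanged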
@@ -3767,10 +3789,12 @@ def geo_verification_headers(self):
             headers['Ytdl-request-proxy'] = geo_verification_proxy
         return headers
 
-    def _generic_id(self, url):
+    @staticmethod
+    def _generic_id(url):
         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
 
-    def _generic_title(self, url):
+    @staticmethod
+    def _generic_title(url):
         return urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
 
     @staticmethod
@@ -3816,6 +3840,37 @@ def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_l
         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
         return True
 
+    @classmethod
+    def extract_from_webpage(cls, ydl, url, webpage):
+        ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
+              else ydl.get_info_extractor(cls.ie_key()))
+        yield from ie._extract_from_webpage(url, webpage) or []
+
+    @classmethod
+    def _extract_from_webpage(cls, url, webpage):
+        for embed_url in orderedSet(
+                cls._extract_embed_urls(url, webpage) or [], lazy=True):
+            yield cls.url_result(embed_url, cls)
+
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        """@returns all the embed urls on the webpage"""
+        if '_EMBED_URL_RE' not in cls.__dict__:
+            assert isinstance(cls._EMBED_REGEX, (list, tuple))
+            for idx, regex in enumerate(cls._EMBED_REGEX):
+                assert regex.count('(?P<url>') == 1, \
+                    f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
+            cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
+
+        for regex in cls._EMBED_URL_RE:
+            for mobj in regex.finditer(webpage):
+                embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
+                if cls._VALID_URL is False or cls.suitable(embed_url):
+                    yield embed_url
+
+    class StopExtraction(Exception):
+        pass
+
 
 class SearchInfoExtractor(InfoExtractor):
     """
@@ -3826,8 +3881,8 @@ class SearchInfoExtractor(InfoExtractor):
 
     _MAX_RESULTS = float('inf')
 
-    @classmethod
-    def _make_valid_url(cls):
+    @classproperty
+    def _VALID_URL(cls):
         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 
     def _real_extract(self, query):
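Replacing _make_valid_url with a _VALID_URL classproperty is what lets make_lazy_extractors.py (above) treat _VALID_URL as a static class property: reading the attribute evaluates it per class at build time. A runnable sketch with a hypothetical search key, using the classproperty helper that common.py itself uses:

from yt_dlp.utils import classproperty


class FooSearchIE:  # hypothetical
    _SEARCH_KEY = 'foosearch'

    @classproperty
    def _VALID_URL(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY


print(FooSearchIE._VALID_URL)
# foosearch(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)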
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index f8311820e6b9d5a357fde60421374445c095340b..d6a6166a0a717be40195a5996a1849c587e10013 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -3,6 +3,8 @@
 import urllib.parse
 import xml.etree.ElementTree
 
+from . import gen_extractor_classes
+from .common import InfoExtractor  # isort: split
 from .ant1newsgr import Ant1NewsGrEmbedIE
 from .anvato import AnvatoIE
 from .apa import APAIE
@@ -14,7 +16,6 @@
 from .brightcove import BrightcoveLegacyIE, BrightcoveNewIE
 from .channel9 import Channel9IE
 from .cloudflarestream import CloudflareStreamIE
-from .common import InfoExtractor
 from .commonprotocols import RtmpIE
 from .condenast import CondeNastIE
 from .dailymail import DailyMailIE
     determine_ext,
     dict_get,
     float_or_none,
+    format_field,
     int_or_none,
     is_html,
     js_to_json,
@@ -2641,8 +2643,15 @@ def report_following_redirect(self, new_url):
         """Report information extraction."""
         self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
 
-    def report_detected(self, name):
-        self._downloader.write_debug(f'Identified a {name}')
+    def report_detected(self, name, num=1, note=None):
+        if num > 1:
+            name += 's'
+        elif not num:
+            return
+        else:
+            num = 'a'
+
+        self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}')
 
     def _extract_rss(self, url, video_id, doc):
         NS_MAP = {
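Expected debug output (only visible with --verbose) for a few hypothetical calls:

self.report_detected('Foo embed')       # Identified a Foo embed
self.report_detected('Foo embed', 3)    # Identified 3 Foo embeds
self.report_detected('Foo embed', 0)    # (no output)
self.report_detected('Foo exclusive embed', 2, 'discarding other embeds')
# Identified 2 Foo exclusive embeds; discarding other embeds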
@@ -2854,8 +2863,7 @@ def _real_extract(self, url):
 
         if not self.get_param('test', False) and not is_intentional:
             force = self.get_param('force_generic_extractor', False)
-            self.report_warning(
-                '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
+            self.report_warning('%s generic information extractor' % ('Forcing' if force else 'Falling back on'))
 
         first_bytes = full_response.read(512)
 
@@ -2933,6 +2941,22 @@ def _real_extract(self, url):
             self.report_detected('Camtasia video')
             return camtasia_res
 
+        info_dict.update({
+            # it's tempting to parse this further, but you would
+            # have to take into account all the variations like
+            #   Video Title - Site Name
+            #   Site Name | Video Title
+            #   Video Title - Tagline | Site Name
+            # and so on and so forth; it's just not practical
+            'title': (self._og_search_title(webpage, default=None)
+                      or self._html_extract_title(webpage, 'video title', default='video')),
+            'description': self._og_search_description(webpage, default=None),
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'age_limit': self._rta_search(webpage),
+        })
+
+        domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
+
         # Sometimes embedded video player is hidden behind percent encoding
         # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
         # Unescaping the whole page allows to handle those cases in a generic way
@@ -2946,40 +2970,12 @@ def _real_extract(self, url):
             r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>',
             lambda x: unescapeHTML(x.group(0)), webpage)
 
-        # it's tempting to parse this further, but you would
-        # have to take into account all the variations like
-        #   Video Title - Site Name
-        #   Site Name | Video Title
-        #   Video Title - Tagline | Site Name
-        # and so on and so forth; it's just not practical
-        video_title = (self._og_search_title(webpage, default=None)
-                       or self._html_extract_title(webpage, 'video title', default='video'))
-
-        # Try to detect age limit automatically
-        age_limit = self._rta_search(webpage)
-        # And then there are the jokers who advertise that they use RTA,
-        # but actually don't.
-        AGE_LIMIT_MARKERS = [
-            r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
-        ]
-        if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
-            age_limit = 18
-
-        # video uploader is domain name
-        video_uploader = self._search_regex(
-            r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
-
-        video_description = self._og_search_description(webpage, default=None)
-        video_thumbnail = self._og_search_thumbnail(webpage, default=None)
-
-        info_dict.update({
-            'title': video_title,
-            'description': video_description,
-            'thumbnail': video_thumbnail,
-            'age_limit': age_limit,
-        })
+        # TODO: Remove
+        video_title, video_description, video_thumbnail, age_limit, video_uploader = \
+            info_dict['title'], info_dict['description'], info_dict['thumbnail'], info_dict['age_limit'], domain_name
 
-        self._downloader.write_debug('Looking for video embeds')
+        # TODO: Move Embeds
+        self._downloader.write_debug('Looking for single embeds')
 
         # Look for Brightcove Legacy Studio embeds
         bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
@@ -2998,7 +2994,7 @@ def _real_extract(self, url):
             }
 
         # Look for Brightcove New Studio embeds
-        bc_urls = BrightcoveNewIE._extract_urls(self, webpage)
+        bc_urls = BrightcoveNewIE._extract_brightcove_urls(self, webpage)
         if bc_urls:
             return self.playlist_from_matches(
                 bc_urls, video_id, video_title,
@@ -3246,7 +3242,7 @@ def _real_extract(self, url):
             return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key())
 
         # Look for embedded Spotify player
-        spotify_urls = SpotifyBaseIE._extract_embed_urls(webpage)
+        spotify_urls = SpotifyBaseIE._extract_urls(webpage)
         if spotify_urls:
             return self.playlist_from_matches(spotify_urls, video_id, video_title)
 
@@ -3837,6 +3833,30 @@ def _real_extract(self, url):
         tiktok_urls = TikTokIE._extract_urls(webpage)
         if tiktok_urls:
             return self.playlist_from_matches(tiktok_urls, video_id, video_title)
+        # TODO: END: Move Embeds
+
+        self._downloader.write_debug('Looking for embeds')
+        embeds = []
+        for ie in gen_extractor_classes():
+            gen = ie.extract_from_webpage(self._downloader, url, webpage)
+            current_embeds = []
+            try:
+                while True:
+                    current_embeds.append(next(gen))
+            except self.StopExtraction:
+                self.report_detected(f'{ie.IE_NAME} exclusive embed', len(current_embeds),
+                                     embeds and 'discarding other embeds')
+                embeds = current_embeds
+                break
+            except StopIteration:
+                self.report_detected(f'{ie.IE_NAME} embed', len(current_embeds))
+                embeds.extend(current_embeds)
+
+        del current_embeds
+        if len(embeds) == 1:
+            return {**info_dict, **embeds[0]}
+        elif embeds:
+            return self.playlist_result(embeds, **info_dict)
 
         # Look for HTML5 media
         entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
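The manual next() loop keeps whatever an exclusive extractor yielded before raising StopExtraction; a toy demonstration of that property, independent of yt-dlp:

class StopExtraction(Exception):
    pass


def fake_extractor():
    yield 'embed-1'
    yield 'embed-2'
    raise StopExtraction()


gen, found = fake_extractor(), []
try:
    while True:
        found.append(next(gen))
except StopExtraction:
    pass
print(found)  # ['embed-1', 'embed-2'] - yielded items survive the exception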
@@ -4119,7 +4139,6 @@ def filter_video(urls):
                 entries.append(self.url_result(video_url, 'Youtube'))
                 continue
 
-            # here's a fun little line of code for you:
             video_id = os.path.splitext(video_id)[0]
             headers = {
                 'referer': full_response.geturl()
diff --git a/yt_dlp/extractor/spotify.py b/yt_dlp/extractor/spotify.py
index fef8d8dd20d5011fd9f3f6d0dc924afe986c1e31..f476b7022c7e787724a8f27502f6039af86fe688 100644
--- a/yt_dlp/extractor/spotify.py
+++ b/yt_dlp/extractor/spotify.py
@@ -98,7 +98,7 @@ def _extract_episode(self, episode, series):
         }
 
     @classmethod
-    def _extract_embed_urls(cls, webpage):
+    def _extract_urls(cls, webpage):
         return re.findall(
             r'<iframe[^>]+src="(https?://open\.spotify.com/embed/[^"]+)"',
             webpage)
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 57c9961c1a8af34478d20e41f1a00eefbd03754b..545c027635da2213809c01b746155ca0e3e436d4 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -705,13 +705,13 @@ def sanitize_path(s, force=False):
     return os.path.join(*sanitized_path)
 
 
-def sanitize_url(url):
+def sanitize_url(url, *, scheme='http'):
     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
     # the number of unwanted failures due to missing protocol
     if url is None:
         return
     elif url.startswith('//'):
-        return 'http:%s' % url
+        return f'{scheme}:{url}'
     # Fix some common typos seen so far
     COMMON_TYPOS = (
         # https://github.com/ytdl-org/youtube-dl/issues/15649