[extractor] Framework for embed detection (#4307)

[yt-dlp.git] / yt_dlp / extractor / generic.py
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py

index 36e82ca04b6129b736e3ecf34e9e659226895463..d6a6166a0a717be40195a5996a1849c587e10013 100644 (file)
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -1,7 +1,10 @@
  import os
  import re
+import urllib.parse
  import xml.etree.ElementTree
  
+from . import gen_extractor_classes
+from .common import InfoExtractor  # isort: split
  from .ant1newsgr import Ant1NewsGrEmbedIE
  from .anvato import AnvatoIE
  from .apa import APAIE
@@ -13,7 +16,6 @@
  from .brightcove import BrightcoveLegacyIE, BrightcoveNewIE
  from .channel9 import Channel9IE
  from .cloudflarestream import CloudflareStreamIE
-from .common import InfoExtractor
  from .commonprotocols import RtmpIE
  from .condenast import CondeNastIE
  from .dailymail import DailyMailIE
@@ -69,6 +71,7 @@
  from .sportbox import SportBoxIE
  from .spotify import SpotifyBaseIE
  from .springboardplatform import SpringboardPlatformIE
+from .substack import SubstackIE
  from .svt import SVTIE
  from .teachable import TeachableIE
  from .ted import TedEmbedIE
@@ -105,20 +108,15 @@
  from .youporn import YouPornIE
  from .youtube import YoutubeIE
  from .zype import ZypeIE
-from ..compat import (
-    compat_etree_fromstring,
-    compat_str,
-    compat_urllib_parse_unquote,
-    compat_urlparse,
-)
+from ..compat import compat_etree_fromstring
  from ..utils import (
      KNOWN_EXTENSIONS,
      ExtractorError,
-    HEADRequest,
      UnsupportedError,
      determine_ext,
      dict_get,
      float_or_none,
+    format_field,
      int_or_none,
      is_html,
      js_to_json,
@@ -127,7 +125,6 @@
      orderedSet,
      parse_duration,
      parse_resolution,
-    sanitized_Request,
      smuggle_url,
      str_or_none,
      try_call,
@@ -145,7 +142,7 @@ class GenericIE(InfoExtractor):
      IE_DESC = 'Generic downloader that works on some sites'
      _VALID_URL = r'.*'
      IE_NAME = 'generic'
-    _NETRC_MACHINE = False  # Supress username warning
+    _NETRC_MACHINE = False  # Suppress username warning
      _TESTS = [
          # Direct link to a video
          {
@@ -2542,7 +2539,34 @@ class GenericIE(InfoExtractor):
                  'timestamp': 1652833414,
                  'age_limit': 0,
              }
-        }, {
+        },
+        {
+            'url': 'https://www.mollymovieclub.com/p/interstellar?s=r#details',
+            'md5': '198bde8bed23d0b23c70725c83c9b6d9',
+            'info_dict': {
+                'id': '53602801',
+                'ext': 'mpga',
+                'title': 'Interstellar',
+                'description': 'Listen now | Episode One',
+                'thumbnail': 'md5:c30d9c83f738e16d8551d7219d321538',
+                'uploader': 'Molly Movie Club',
+                'uploader_id': '839621',
+            },
+        },
+        {
+            'url': 'https://www.blockedandreported.org/p/episode-117-lets-talk-about-depp?s=r',
+            'md5': 'c0cc44ee7415daeed13c26e5b56d6aa0',
+            'info_dict': {
+                'id': '57962052',
+                'ext': 'mpga',
+                'title': 'md5:855b2756f0ee10f6723fa00b16266f8d',
+                'description': 'md5:fe512a5e94136ad260c80bde00ea4eef',
+                'thumbnail': 'md5:2218f27dfe517bb5ac16c47d0aebac59',
+                'uploader': 'Blocked and Reported',
+                'uploader_id': '500230',
+            },
+        },
+        {
              'url': 'https://www.skimag.com/video/ski-people-1980/',
              'info_dict': {
                  'id': 'ski-people-1980',
@@ -2563,14 +2587,71 @@ class GenericIE(InfoExtractor):
                  }
              }]
          },
+        {
+            'note': 'Rumble embed',
+            'url': 'https://rumble.com/vdmum1-moose-the-dog-helps-girls-dig-a-snow-fort.html',
+            'md5': '53af34098a7f92c4e51cf0bd1c33f009',
+            'info_dict': {
+                'id': 'vb0ofn',
+                'ext': 'mp4',
+                'timestamp': 1612662578,
+                'uploader': 'LovingMontana',
+                'channel': 'LovingMontana',
+                'upload_date': '20210207',
+                'title': 'Winter-loving dog helps girls dig a snow fort ',
+                'channel_url': 'https://rumble.com/c/c-546523',
+                'thumbnail': 'https://sp.rmbl.ws/s8/1/5/f/x/x/5fxxb.OvCc.1-small-Moose-The-Dog-Helps-Girls-D.jpg',
+                'duration': 103,
+            }
+        },
+        {
+            'note': 'Rumble JS embed',
+            'url': 'https://therightscoop.com/what-does-9-plus-1-plus-1-equal-listen-to-this-audio-of-attempted-kavanaugh-assassins-call-and-youll-get-it',
+            'md5': '4701209ac99095592e73dbba21889690',
+            'info_dict': {
+                'id': 'v15eqxl',
+                'ext': 'mp4',
+                'channel': 'Mr Producer Media',
+                'duration': 92,
+                'title': '911 Audio From The Man Who Wanted To Kill Supreme Court Justice Kavanaugh',
+                'channel_url': 'https://rumble.com/c/RichSementa',
+                'thumbnail': 'https://sp.rmbl.ws/s8/1/P/j/f/A/PjfAe.OvCc-small-911-Audio-From-The-Man-Who-.jpg',
+                'timestamp': 1654892716,
+                'uploader': 'Mr Producer Media',
+                'upload_date': '20220610',
+            }
+        },
+        {
+            'note': 'JSON LD with multiple @type',
+            'url': 'https://www.nu.nl/280161/video/hoe-een-bladvlo-dit-verwoestende-japanse-onkruid-moet-vernietigen.html',
+            'md5': 'c7949f34f57273013fb7ccb1156393db',
+            'info_dict': {
+                'id': 'ipy2AcGL',
+                'ext': 'mp4',
+                'description': 'md5:6a9d644bab0dc2dc06849c2505d8383d',
+                'thumbnail': r're:https://media\.nu\.nl/m/.+\.jpg',
+                'title': 'Hoe een bladvlo dit verwoestende Japanse onkruid moet vernietigen',
+                'timestamp': 1586577474,
+                'upload_date': '20200411',
+                'age_limit': 0,
+                'duration': 111.0,
+            }
+        },
      ]
  
      def report_following_redirect(self, new_url):
          """Report information extraction."""
          self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
  
-    def report_detected(self, name):
-        self._downloader.write_debug(f'Identified a {name}')
+    def report_detected(self, name, num=1, note=None):
+        if num > 1:
+            name += 's'
+        elif not num:
+            return
+        else:
+            num = 'a'
+
+        self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}')
  
      def _extract_rss(self, url, video_id, doc):
          NS_MAP = {
@@ -2625,7 +2706,7 @@ def _extract_camtasia(self, url, video_id, webpage):
  
          title = self._html_search_meta('DC.title', webpage, fatal=True)
  
-        camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
+        camtasia_url = urllib.parse.urljoin(url, camtasia_cfg)
          camtasia_cfg = self._download_xml(
              camtasia_url, video_id,
              note='Downloading camtasia configuration',
@@ -2641,7 +2722,7 @@ def _extract_camtasia(self, url, video_id, webpage):
              entries.append({
                  'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
                  'title': f'{title} - {n.tag}',
-                'url': compat_urlparse.urljoin(url, url_n.text),
+                'url': urllib.parse.urljoin(url, url_n.text),
                  'duration': float_or_none(n.find('./duration').text),
              })
  
@@ -2693,7 +2774,7 @@ def _real_extract(self, url):
          if url.startswith('//'):
              return self.url_result(self.http_scheme() + url)
  
-        parsed_url = compat_urlparse.urlparse(url)
+        parsed_url = urllib.parse.urlparse(url)
          if not parsed_url.scheme:
              default_search = self.get_param('default_search')
              if default_search is None:
@@ -2733,43 +2814,34 @@ def _real_extract(self, url):
          else:
              video_id = self._generic_id(url)
  
-        self.to_screen('%s: Requesting header' % video_id)
-
-        head_req = HEADRequest(url)
-        head_response = self._request_webpage(
-            head_req, video_id,
-            note=False, errnote='Could not send HEAD request to %s' % url,
-            fatal=False)
-
-        if head_response is not False:
-            # Check for redirect
-            new_url = head_response.geturl()
-            if url != new_url:
-                self.report_following_redirect(new_url)
-                if force_videoid:
-                    new_url = smuggle_url(
-                        new_url, {'force_videoid': force_videoid})
-                return self.url_result(new_url)
-
-        full_response = None
-        if head_response is False:
-            request = sanitized_Request(url)
-            request.add_header('Accept-Encoding', '*')
-            full_response = self._request_webpage(request, video_id)
-            head_response = full_response
+        # Some webservers may serve compressed content of rather big size (e.g. gzipped flac),
+        # making it impossible to download only a chunk of the file (yet we need only the first
+        # 512 bytes to test whether it's HTML or not). With yt-dlp's default Accept-Encoding,
+        # that would always result in downloading the whole file, which is not desirable.
+        # Therefore, for the extraction pass we have to override Accept-Encoding to accept any
+        # encoding, so that we receive raw bytes and are able to download only a chunk.
+        # It would probably be better to solve this by checking Content-Type for application/octet-stream
+        # after a HEAD request, but it is not certain that we can rely on this.
+        full_response = self._request_webpage(url, video_id, headers={'Accept-Encoding': '*'})
+        new_url = full_response.geturl()
+        if url != new_url:
+            self.report_following_redirect(new_url)
+            if force_videoid:
+                new_url = smuggle_url(new_url, {'force_videoid': force_videoid})
+            return self.url_result(new_url)
  
          info_dict = {
              'id': video_id,
              'title': self._generic_title(url),
-            'timestamp': unified_timestamp(head_response.headers.get('Last-Modified'))
+            'timestamp': unified_timestamp(full_response.headers.get('Last-Modified'))
          }
  
          # Check for direct link to a video
-        content_type = head_response.headers.get('Content-Type', '').lower()
+        content_type = full_response.headers.get('Content-Type', '').lower()
          m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
          if m:
              self.report_detected('direct video link')
-            format_id = compat_str(m.group('format_id'))
+            format_id = str(m.group('format_id'))
              subtitles = {}
              if format_id.endswith('mpegurl'):
                  formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
@@ -2791,21 +2863,7 @@ def _real_extract(self, url):
  
          if not self.get_param('test', False) and not is_intentional:
              force = self.get_param('force_generic_extractor', False)
-            self.report_warning(
-                '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
-
-        if not full_response:
-            request = sanitized_Request(url)
-            # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
-            # making it impossible to download only chunk of the file (yet we need only 512kB to
-            # test whether it's HTML or not). According to yt-dlp default Accept-Encoding
-            # that will always result in downloading the whole file that is not desirable.
-            # Therefore for extraction pass we have to override Accept-Encoding to any in order
-            # to accept raw bytes and being able to download only a chunk.
-            # It may probably better to solve this by checking Content-Type for application/octet-stream
-            # after HEAD request finishes, but not sure if we can rely on this.
-            request.add_header('Accept-Encoding', '*')
-            full_response = self._request_webpage(request, video_id)
+            self.report_warning('%s generic information extractor' % ('Forcing' if force else 'Falling back on'))
  
          first_bytes = full_response.read(512)
  
@@ -2883,12 +2941,28 @@ def _real_extract(self, url):
              self.report_detected('Camtasia video')
              return camtasia_res
  
+        info_dict.update({
+            # it's tempting to parse this further, but you would
+            # have to take into account all the variations like
+            #   Video Title - Site Name
+            #   Site Name | Video Title
+            #   Video Title - Tagline | Site Name
+            # and so on and so forth; it's just not practical
+            'title': (self._og_search_title(webpage, default=None)
+                      or self._html_extract_title(webpage, 'video title', default='video')),
+            'description': self._og_search_description(webpage, default=None),
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'age_limit': self._rta_search(webpage),
+        })
+
+        domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
+
          # Sometimes embedded video player is hidden behind percent encoding
          # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
          # Unescaping the whole page allows to handle those cases in a generic way
          # FIXME: unescaping the whole page may break URLs, commenting out for now.
          # There probably should be a second run of generic extractor on unescaped webpage.
-        # webpage = compat_urllib_parse_unquote(webpage)
+        # webpage = urllib.parse.unquote(webpage)
  
          # Unescape squarespace embeds to be detected by generic extractor,
          # see https://github.com/ytdl-org/youtube-dl/issues/21294
@@ -2896,40 +2970,12 @@ def _real_extract(self, url):
              r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>',
              lambda x: unescapeHTML(x.group(0)), webpage)
  
-        # it's tempting to parse this further, but you would
-        # have to take into account all the variations like
-        #   Video Title - Site Name
-        #   Site Name | Video Title
-        #   Video Title - Tagline | Site Name
-        # and so on and so forth; it's just not practical
-        video_title = (self._og_search_title(webpage, default=None)
-                       or self._html_extract_title(webpage, 'video title', default='video'))
-
-        # Try to detect age limit automatically
-        age_limit = self._rta_search(webpage)
-        # And then there are the jokers who advertise that they use RTA,
-        # but actually don't.
-        AGE_LIMIT_MARKERS = [
-            r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
-        ]
-        if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
-            age_limit = 18
-
-        # video uploader is domain name
-        video_uploader = self._search_regex(
-            r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
-
-        video_description = self._og_search_description(webpage, default=None)
-        video_thumbnail = self._og_search_thumbnail(webpage, default=None)
-
-        info_dict.update({
-            'title': video_title,
-            'description': video_description,
-            'thumbnail': video_thumbnail,
-            'age_limit': age_limit,
-        })
+        # TODO: Remove these local aliases once the embed checks below use info_dict/domain_name directly
+        video_title, video_description, video_thumbnail, age_limit, video_uploader = \
+            info_dict['title'], info_dict['description'], info_dict['thumbnail'], info_dict['age_limit'], domain_name
  
-        self._downloader.write_debug('Looking for video embeds')
+        # TODO: Move Embeds
+        self._downloader.write_debug('Looking for single embeds')
  
          # Look for Brightcove Legacy Studio embeds
          bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
@@ -2948,7 +2994,7 @@ def _real_extract(self, url):
              }
  
          # Look for Brightcove New Studio embeds
-        bc_urls = BrightcoveNewIE._extract_urls(self, webpage)
+        bc_urls = BrightcoveNewIE._extract_brightcove_urls(self, webpage)
          if bc_urls:
              return self.playlist_from_matches(
                  bc_urls, video_id, video_title,
@@ -3038,6 +3084,7 @@ def _real_extract(self, url):
          wistia_urls = WistiaIE._extract_urls(webpage)
          if wistia_urls:
              playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key())
+            playlist['entries'] = list(playlist['entries'])
              for entry in playlist['entries']:
                  entry.update({
                      '_type': 'url_transparent',
@@ -3057,6 +3104,11 @@ def _real_extract(self, url):
              # Don't set the extractor because it can be a track url or an album
              return self.url_result(burl)
  
+        # Check for Substack custom domains
+        substack_url = SubstackIE._extract_url(webpage, url)
+        if substack_url:
+            return self.url_result(substack_url, SubstackIE)
+
          # Look for embedded Vevo player
          mobj = re.search(
              r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
@@ -3155,7 +3207,7 @@ def _real_extract(self, url):
              return self.url_result(mobj.group('url'))
          mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
          if mobj is not None:
-            return self.url_result(compat_urllib_parse_unquote(mobj.group('url')))
+            return self.url_result(urllib.parse.unquote(mobj.group('url')))
  
          # Look for funnyordie embed
          matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
@@ -3190,7 +3242,7 @@ def _real_extract(self, url):
              return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key())
  
          # Look for embedded Spotify player
-        spotify_urls = SpotifyBaseIE._extract_embed_urls(webpage)
+        spotify_urls = SpotifyBaseIE._extract_urls(webpage)
          if spotify_urls:
              return self.playlist_from_matches(spotify_urls, video_id, video_title)
  
@@ -3408,7 +3460,7 @@ def _real_extract(self, url):
              r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage)
          if mobj is not None:
              return self.url_result(
-                compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
+                urllib.parse.urljoin(url, mobj.group('url')), 'UDNEmbed')
  
          # Look for Senate ISVP iframe
          senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
@@ -3641,7 +3693,7 @@ def _real_extract(self, url):
          if mediasite_urls:
              entries = [
                  self.url_result(smuggle_url(
-                    compat_urlparse.urljoin(url, mediasite_url),
+                    urllib.parse.urljoin(url, mediasite_url),
                      {'UrlReferrer': url}), ie=MediasiteIE.ie_key())
                  for mediasite_url in mediasite_urls]
              return self.playlist_result(entries, video_id, video_title)
@@ -3781,6 +3833,30 @@ def _real_extract(self, url):
          tiktok_urls = TikTokIE._extract_urls(webpage)
          if tiktok_urls:
              return self.playlist_from_matches(tiktok_urls, video_id, video_title)
+        # TODO: END: Move Embeds
+
+        self._downloader.write_debug('Looking for embeds')
+        embeds = []
+        for ie in gen_extractor_classes():
+            gen = ie.extract_from_webpage(self._downloader, url, webpage)
+            current_embeds = []
+            try:
+                while True:
+                    current_embeds.append(next(gen))
+            except self.StopExtraction:
+                self.report_detected(f'{ie.IE_NAME} exclusive embed', len(current_embeds),
+                                     embeds and 'discarding other embeds')
+                embeds = current_embeds
+                break
+            except StopIteration:
+                self.report_detected(f'{ie.IE_NAME} embed', len(current_embeds))
+                embeds.extend(current_embeds)
+
+        del current_embeds
+        if len(embeds) == 1:
+            return {**info_dict, **embeds[0]}
+        elif embeds:
+            return self.playlist_result(embeds, **info_dict)
  
          # Look for HTML5 media
          entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
@@ -3836,11 +3912,11 @@ def _real_extract(self, url):
              subtitles = {}
              for source in sources:
                  src = source.get('src')
-                if not src or not isinstance(src, compat_str):
+                if not src or not isinstance(src, str):
                      continue
-                src = compat_urlparse.urljoin(url, src)
+                src = urllib.parse.urljoin(url, src)
                  src_type = source.get('type')
-                if isinstance(src_type, compat_str):
+                if isinstance(src_type, str):
                      src_type = src_type.lower()
                  ext = determine_ext(src).lower()
                  if src_type == 'video/youtube':
@@ -3874,7 +3950,7 @@ def _real_extract(self, url):
                  if not src:
                      continue
                  subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({
-                    'url': compat_urlparse.urljoin(url, src),
+                    'url': urllib.parse.urljoin(url, src),
                      'name': sub.get('label'),
                      'http_headers': {
                          'Referer': full_response.geturl(),
@@ -3891,22 +3967,17 @@ def _real_extract(self, url):
          json_ld = self._search_json_ld(webpage, video_id, default={})
          if json_ld.get('url') not in (url, None):
              self.report_detected('JSON LD')
-            if determine_ext(json_ld['url']) == 'm3u8':
-                json_ld['formats'], json_ld['subtitles'] = self._extract_m3u8_formats_and_subtitles(
-                    json_ld['url'], video_id, 'mp4')
-                json_ld.pop('url')
-                self._sort_formats(json_ld['formats'])
-            else:
-                json_ld['_type'] = 'url_transparent'
-                json_ld['url'] = smuggle_url(json_ld['url'], {'force_videoid': video_id, 'to_generic': True})
-            return merge_dicts(json_ld, info_dict)
+            return merge_dicts({
+                '_type': 'url_transparent',
+                'url': smuggle_url(json_ld['url'], {'force_videoid': video_id, 'to_generic': True}),
+            }, json_ld, info_dict)
  
          def check_video(vurl):
              if YoutubeIE.suitable(vurl):
                  return True
              if RtmpIE.suitable(vurl):
                  return True
-            vpath = compat_urlparse.urlparse(vurl).path
+            vpath = urllib.parse.urlparse(vurl).path
              vext = determine_ext(vpath, None)
              return vext not in (None, 'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml')
  
@@ -4030,11 +4101,11 @@ def filter_video(urls):
                  webpage)
              if not found:
                  # Look also in Refresh HTTP header
-                refresh_header = head_response.headers.get('Refresh')
+                refresh_header = full_response.headers.get('Refresh')
                  if refresh_header:
                      found = re.search(REDIRECT_REGEX, refresh_header)
              if found:
-                new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1)))
+                new_url = urllib.parse.urljoin(url, unescapeHTML(found.group(1)))
                  if new_url != url:
                      self.report_following_redirect(new_url)
                      return {
@@ -4060,15 +4131,14 @@ def filter_video(urls):
          for video_url in orderedSet(found):
              video_url = unescapeHTML(video_url)
              video_url = video_url.replace('\\/', '/')
-            video_url = compat_urlparse.urljoin(url, video_url)
-            video_id = compat_urllib_parse_unquote(os.path.basename(video_url))
+            video_url = urllib.parse.urljoin(url, video_url)
+            video_id = urllib.parse.unquote(os.path.basename(video_url))
  
              # Sometimes, jwplayer extraction will result in a YouTube URL
              if YoutubeIE.suitable(video_url):
                  entries.append(self.url_result(video_url, 'Youtube'))
                  continue
  
-            # here's a fun little line of code for you:
              video_id = os.path.splitext(video_id)[0]
              headers = {
                  'referer': full_response.geturl()