[extractors] Use new framework for existing embeds (#4307)

author pukkandan <redacted>

Mon, 1 Aug 2022 01:23:25 +0000 (06:53 +0530)

committer pukkandan <redacted>

Mon, 1 Aug 2022 19:38:16 +0000 (01:08 +0530)
author pukkandan <redacted>
Mon, 1 Aug 2022 01:23:25 +0000 (06:53 +0530)
committer pukkandan <redacted>
Mon, 1 Aug 2022 19:38:16 +0000 (01:08 +0530)
diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py

index 221c1598dfb5c3dccfa0826e809f6b007268fc2f..5ca92f18b2e77a6b38bf5bdc74d1abb9c9f6d51b 100644 (file)
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -446,7 +446,7 @@
      DWIE,
      DWArticleIE,
  )
-from .eagleplatform import EaglePlatformIE
+from .eagleplatform import EaglePlatformIE, ClipYouEmbedIE
  from .ebaumsworld import EbaumsWorldIE
  from .echomsk import EchoMskIE
  from .egghead import (
@@ -1555,6 +1555,7 @@
      SharedIE,
      VivoIE,
  )
+from .sharevideos import ShareVideosEmbedIE
  from .shemaroome import ShemarooMeIE
  from .showroomlive import ShowRoomLiveIE
  from .simplecast import (
diff --git a/yt_dlp/extractor/adobetv.py b/yt_dlp/extractor/adobetv.py

index 941254243f2124305db3cc64c226ebfb9ea07456..d8e07b3a176c4d6d5dd63bac0e09ed2b730d0268 100644 (file)
--- a/yt_dlp/extractor/adobetv.py
+++ b/yt_dlp/extractor/adobetv.py
@@ -232,6 +232,7 @@ def _real_extract(self, url):
  class AdobeTVVideoIE(AdobeTVBaseIE):
      IE_NAME = 'adobetv:video'
      _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)'
+    _EMBED_REGEX = [r'<iframe[^>]+src=[\'"](?P<url>(?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]']
  
      _TEST = {
          # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners
diff --git a/yt_dlp/extractor/ant1newsgr.py b/yt_dlp/extractor/ant1newsgr.py

index cd0f3685691708090a5ec87e6042dc90624da8d3..fac476e21a4ba7a222b5ef0d536558139df5853a 100644 (file)
--- a/yt_dlp/extractor/ant1newsgr.py
+++ b/yt_dlp/extractor/ant1newsgr.py
@@ -1,4 +1,3 @@
-import re
  import urllib.parse
  
  from .common import InfoExtractor
@@ -7,7 +6,6 @@
      ExtractorError,
      determine_ext,
      scale_thumbnails_to_max_format_width,
-    unescapeHTML,
  )
  
  
@@ -91,7 +89,7 @@ def _real_extract(self, url):
          video_id = self._match_id(url)
          webpage = self._download_webpage(url, video_id)
          info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle')
-        embed_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage))
+        embed_urls = list(Ant1NewsGrEmbedIE._extract_embed_urls(url, webpage))
          if not embed_urls:
              raise ExtractorError('no videos found for %s' % video_id, expected=True)
          return self.playlist_from_matches(
@@ -104,6 +102,7 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE):
      IE_DESC = 'ant1news.gr embedded videos'
      _BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player'
      _VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)'
+    _EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)']
      _API_PATH = '/news/templates/data/jsonPlayer'
  
      _TESTS = [{
@@ -117,16 +116,6 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE):
          },
      }]
  
-    @classmethod
-    def _extract_urls(cls, webpage):
-        _EMBED_URL_RE = rf'{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'
-        _EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_EMBED_URL_RE})(?P=_q1)'
-        for mobj in re.finditer(_EMBED_RE, webpage):
-            url = unescapeHTML(mobj.group('url'))
-            if not cls.suitable(url):
-                continue
-            yield url
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/anvato.py b/yt_dlp/extractor/anvato.py

index 09dfffdb098c48c10bf057e011b65c41c4f18709..cb94835693a56b9f101cf1ba071089770dbeefeb 100644 (file)
--- a/yt_dlp/extractor/anvato.py
+++ b/yt_dlp/extractor/anvato.py
@@ -340,30 +340,16 @@ def _get_anvato_videos(self, access_key, video_id):
              'subtitles': subtitles,
          }
  
-    @staticmethod
-    def _extract_urls(ie, webpage, video_id):
-        entries = []
-        for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage):
-            anvplayer_data = ie._parse_json(
-                mobj.group('anvp'), video_id, transform_source=unescapeHTML,
-                fatal=False)
-            if not anvplayer_data:
-                continue
-            video = anvplayer_data.get('video')
-            if not isinstance(video, compat_str) or not video.isdigit():
-                continue
-            access_key = anvplayer_data.get('accessKey')
-            if not access_key:
-                mcp = anvplayer_data.get('mcp')
-                if mcp:
-                    access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get(
-                        mcp.lower())
+    @classmethod
+    def _extract_from_webpage(cls, url, webpage):
+        """Yield an 'anvato:<access_key>:<video_id>' url_result for every usable player config matched by _ANVP_RE in webpage"""
+        for mobj in re.finditer(cls._ANVP_RE, webpage):
+            # The config may be HTML-escaped inside the page, so unescape the raw
+            # text BEFORE parsing it (unescapeHTML works on text, not on parsed JSON)
+            try:
+                anvplayer_data = json.loads(unescapeHTML(mobj.group('anvp'))) or {}
+            except ValueError:
+                continue  # Malformed config: skip it, as the previous non-fatal parse did
+            video_id, access_key = anvplayer_data.get('video'), anvplayer_data.get('accessKey')
              if not access_key:
+                # No explicit key: fall back to the key registered for this MCP id, if any
+                access_key = cls._MCP_TO_ACCESS_KEY_TABLE.get((anvplayer_data.get('mcp') or '').lower())
+            if not (video_id or '').isdigit() or not access_key:
                  continue
-            entries.append(ie.url_result(
-                'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(),
-                video_id=video))
-        return entries
+            yield cls.url_result(f'anvato:{access_key}:{video_id}', AnvatoIE, video_id)
  
      def _extract_anvato_videos(self, webpage, video_id):
          anvplayer_data = self._parse_json(
diff --git a/yt_dlp/extractor/apa.py b/yt_dlp/extractor/apa.py

index 847be6edf7269cd583b8fd8fedcdd656d9d7c12b..c9147e855a365840e814b9779da22f5e5affd7cb 100644 (file)
--- a/yt_dlp/extractor/apa.py
+++ b/yt_dlp/extractor/apa.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import (
      determine_ext,
@@ -10,6 +8,7 @@
  
  class APAIE(InfoExtractor):
      _VALID_URL = r'(?P<base_url>https?://[^/]+\.apa\.at)/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+    _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1']
      _TESTS = [{
          'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029',
          'md5': '2b12292faeb0a7d930c778c7a5b4759b',
@@ -30,14 +29,6 @@ class APAIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [
-            mobj.group('url')
-            for mobj in re.finditer(
-                r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1',
-                webpage)]
-
      def _real_extract(self, url):
          mobj = self._match_valid_url(url)
          video_id, base_url = mobj.group('id', 'base_url')
diff --git a/yt_dlp/extractor/aparat.py b/yt_dlp/extractor/aparat.py

index cd6cd1c7907a0fa8ac343f0d0ab71beacac4a422..90464556dbb41757649de545828c479de6b25063 100644 (file)
--- a/yt_dlp/extractor/aparat.py
+++ b/yt_dlp/extractor/aparat.py
@@ -10,6 +10,7 @@
  
  class AparatIE(InfoExtractor):
      _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'
+    _EMBED_REGEX = [r'<iframe .*?src="(?P<url>http://www\.aparat\.com/video/[^"]+)"']
  
      _TESTS = [{
          'url': 'http://www.aparat.com/v/wP8On',
diff --git a/yt_dlp/extractor/arcpublishing.py b/yt_dlp/extractor/arcpublishing.py

index 2e3f3cc5fbc1fd7d9beec20a1e2bdd63c3de904a..de9ccc538304ed52c122b09e386e4c81ede9c3f5 100644 (file)
--- a/yt_dlp/extractor/arcpublishing.py
+++ b/yt_dlp/extractor/arcpublishing.py
@@ -70,8 +70,8 @@ class ArcPublishingIE(InfoExtractor):
          ], 'video-api-cdn.%s.arcpublishing.com/api'),
      ]
  
-    @staticmethod
-    def _extract_urls(webpage):
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
          entries = []
          # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview
          for powa_el in re.findall(r'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage):
diff --git a/yt_dlp/extractor/arkena.py b/yt_dlp/extractor/arkena.py

index 9da2bfd5e0591de219c401e2aab04cc493d01adf..9a0273e2c32d79d06fa036332c43d80e259e49e4 100644 (file)
--- a/yt_dlp/extractor/arkena.py
+++ b/yt_dlp/extractor/arkena.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import (
      ExtractorError,
@@ -19,6 +17,8 @@ class ArkenaIE(InfoExtractor):
                                  play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P<id>[^/]+)/[^/]+/(?P<account_id>\d+)
                              )
                          '''
+    # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video
+    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1']
      _TESTS = [{
          'url': 'https://video.qbrick.com/play2/embed/player?accountId=1034090&mediaId=d8ab4607-00090107-aab86310',
          'md5': '97f117754e5f3c020f5f26da4a44ebaf',
@@ -50,15 +50,6 @@ class ArkenaIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_url(webpage):
-        # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video
-        mobj = re.search(
-            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1',
-            webpage)
-        if mobj:
-            return mobj.group('url')
-
      def _real_extract(self, url):
          mobj = self._match_valid_url(url)
          video_id = mobj.group('id')
diff --git a/yt_dlp/extractor/arte.py b/yt_dlp/extractor/arte.py

index 9ec5203f192bf7b4a8ca365553340dd03446f2ce..980d37849fdeed87ff91b0f2bdea1264dd67091f 100644 (file)
--- a/yt_dlp/extractor/arte.py
+++ b/yt_dlp/extractor/arte.py
@@ -204,6 +204,7 @@ def _real_extract(self, url):
  
  class ArteTVEmbedIE(InfoExtractor):
      _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
+    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
      _TESTS = [{
          'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
          'info_dict': {
@@ -219,12 +220,6 @@ class ArteTVEmbedIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [url for _, url in re.findall(
-            r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1',
-            webpage)]
-
      def _real_extract(self, url):
          qs = parse_qs(url)
          json_url = qs['json_url'][0]
diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py

index 6f806d84ee2432cbfc228f1223465573c5995681..b34fcb10817b7f9554a6b600f22f2df4ae546788 100644 (file)
--- a/yt_dlp/extractor/bandcamp.py
+++ b/yt_dlp/extractor/bandcamp.py
@@ -22,6 +22,7 @@
  
  class BandcampIE(InfoExtractor):
      _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
+    _EMBED_REGEX = [r'<meta property="og:url"[^>]*?content="(?P<url>.*?bandcamp\.com.*?)"']
      _TESTS = [{
          'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
          'md5': 'c557841d5e50261777a6585648adf439',
diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py

index 4413a299a4353f2d5a4293ec9c581a609852609a..9a0a4414e7e6465b5d4f94b5d0b64f33e85cac81 100644 (file)
--- a/yt_dlp/extractor/bbc.py
+++ b/yt_dlp/extractor/bbc.py
@@ -46,6 +46,7 @@ class BBCCoUkIE(InfoExtractor):
                          )
                          (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
                      ''' % _ID_REGEX
+    _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']
  
      _LOGIN_URL = 'https://account.bbc.com/signin'
      _NETRC_MACHINE = 'bbc'
diff --git a/yt_dlp/extractor/bitchute.py b/yt_dlp/extractor/bitchute.py

index c831092d4c4d0e0986975521247d3f099f200f18..24d321566a8bdc7103879e6157b55da604e9d502 100644 (file)
--- a/yt_dlp/extractor/bitchute.py
+++ b/yt_dlp/extractor/bitchute.py
@@ -13,6 +13,7 @@
  
  class BitChuteIE(InfoExtractor):
      _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)'
+    _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
      _TESTS = [{
          'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',
          'md5': '7e427d7ed7af5a75b5855705ec750e2b',
@@ -33,14 +34,6 @@ class BitChuteIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [
-            mobj.group('url')
-            for mobj in re.finditer(
-                r'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>%s)' % BitChuteIE._VALID_URL,
-                webpage)]
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/blogger.py b/yt_dlp/extractor/blogger.py

index d7aa7f94ebb915cdba9f9ad28249ff7fe8bbbbc0..3d6e0330429d925342434402d0d61a94a9d3d9a9 100644 (file)
--- a/yt_dlp/extractor/blogger.py
+++ b/yt_dlp/extractor/blogger.py
@@ -1,5 +1,3 @@
-import re
-
  from ..utils import (
      mimetype2ext,
      parse_duration,
@@ -13,7 +11,7 @@
  class BloggerIE(InfoExtractor):
      IE_NAME = 'blogger.com'
      _VALID_URL = r'https?://(?:www\.)?blogger\.com/video\.g\?token=(?P<id>.+)'
-    _VALID_EMBED = r'''<iframe[^>]+src=["']((?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']'''
+    _EMBED_REGEX = [r'''<iframe[^>]+src=["'](?P<url>(?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']''']
      _TESTS = [{
          'url': 'https://www.blogger.com/video.g?token=AD6v5dzEe9hfcARr5Hlq1WTkYy6t-fXH3BBahVhGvVHe5szdEUBEloSEDSTA8-b111089KbfWuBvTN7fnbxMtymsHhXAXwVvyzHH4Qch2cfLQdGxKQrrEuFpC1amSl_9GuLWODjPgw',
          'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac',
@@ -26,10 +24,6 @@ class BloggerIE(InfoExtractor):
          }
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(BloggerIE._VALID_EMBED, webpage)
-
      def _real_extract(self, url):
          token_id = self._match_id(url)
          webpage = self._download_webpage(url, token_id)
diff --git a/yt_dlp/extractor/buzzfeed.py b/yt_dlp/extractor/buzzfeed.py

index 1b4cba63e2b0527e73b5eea0032d9fac8f9e3c13..b30a3b7ae25324a1c4f6a802e17f94cfbf69e4f4 100644 (file)
--- a/yt_dlp/extractor/buzzfeed.py
+++ b/yt_dlp/extractor/buzzfeed.py
@@ -81,7 +81,7 @@ def _real_extract(self, url):
                  continue
              entries.append(self.url_result(video['url']))
  
-        facebook_urls = FacebookIE._extract_urls(webpage)
+        facebook_urls = FacebookIE._extract_embed_urls(url, webpage)
          entries.extend([
              self.url_result(facebook_url)
              for facebook_url in facebook_urls])
diff --git a/yt_dlp/extractor/channel9.py b/yt_dlp/extractor/channel9.py

index 90a1ab2bee5225acd8e0ce6c9f35e56c6f11021d..d0390d93716ef7a8628a0e7e1c937893c6221c2a 100644 (file)
--- a/yt_dlp/extractor/channel9.py
+++ b/yt_dlp/extractor/channel9.py
@@ -14,6 +14,7 @@ class Channel9IE(InfoExtractor):
      IE_DESC = 'Channel 9'
      IE_NAME = 'channel9'
      _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
+    _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b']
  
      _TESTS = [{
          'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
@@ -78,12 +79,6 @@ class Channel9IE(InfoExtractor):
  
      _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(
-            r'<iframe[^>]+src=["\'](https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b',
-            webpage)
-
      def _extract_list(self, video_id, rss_url=None):
          if not rss_url:
              rss_url = self._RSS_URL % video_id
diff --git a/yt_dlp/extractor/cinchcast.py b/yt_dlp/extractor/cinchcast.py

index 393df36984146e54bfcffcf0f6e7827ee279662a..ff962aad17dbbf2aa28b046fd4c26ed74992a67f 100644 (file)
--- a/yt_dlp/extractor/cinchcast.py
+++ b/yt_dlp/extractor/cinchcast.py
@@ -7,6 +7,8 @@
  
  class CinchcastIE(InfoExtractor):
      _VALID_URL = r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P<id>[0-9]+)'
+    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1']
+
      _TESTS = [{
          'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single',
          'info_dict': {
diff --git a/yt_dlp/extractor/cloudflarestream.py b/yt_dlp/extractor/cloudflarestream.py

index 0a6073403439d0f388b2ec6451e97a62aa51274f..8bc0ad883d705f4e62a1f204c6494727d64fdba5 100644 (file)
--- a/yt_dlp/extractor/cloudflarestream.py
+++ b/yt_dlp/extractor/cloudflarestream.py
@@ -1,5 +1,4 @@
  import base64
-import re
  
  from .common import InfoExtractor
  
@@ -16,6 +15,7 @@ class CloudflareStreamIE(InfoExtractor):
                          )
                          (?P<id>%s)
                      ''' % (_DOMAIN_RE, _EMBED_RE, _ID_RE)
+    _EMBED_REGEX = [fr'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE}).*?)\1']
      _TESTS = [{
          'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717',
          'info_dict': {
@@ -37,14 +37,6 @@ class CloudflareStreamIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [
-            mobj.group('url')
-            for mobj in re.finditer(
-                r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//%s(?:%s).*?)\1' % (CloudflareStreamIE._EMBED_RE, CloudflareStreamIE._ID_RE),
-                webpage)]
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
          domain = 'bytehighway.net' if 'bytehighway.net/' in url else 'videodelivery.net'
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py

index c91260cb0a6869dce3cd3c084b2e08103172050d..a6933e738b1c41316f76694d62e87853ac619ad3 100644 (file)
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -3882,6 +3882,11 @@ def _extract_embed_urls(cls, url, webpage):
      class StopExtraction(Exception):
          pass
  
+    @classmethod
+    def _extract_url(cls, webpage):  # TODO: Remove
+        """Only for compatibility with some older extractors
+
+        Returns the first item produced by _extract_embed_urls for this webpage,
+        or None when it produces nothing (or returns a falsy value).
+        The page URL is not known to callers of this method, so None is passed in its place.
+        """
+        return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
+
  
  class SearchInfoExtractor(InfoExtractor):
      """
diff --git a/yt_dlp/extractor/condenast.py b/yt_dlp/extractor/condenast.py

index cf6e40cb835c34d11406f9388433d04c41f86179..ffdd820e2345d980c6c9f54147e8868a58af20ba 100644 (file)
--- a/yt_dlp/extractor/condenast.py
+++ b/yt_dlp/extractor/condenast.py
@@ -58,7 +58,10 @@ class CondeNastIE(InfoExtractor):
          )''' % '|'.join(_SITES.keys())
      IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))
  
-    EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' % '|'.join(_SITES.keys())
+    _EMBED_REGEX = [r'''(?x)
+        <(?:iframe|script)[^>]+?src=(["\'])(?P<url>
+            (?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?
+        )\1''' % '|'.join(_SITES.keys())]
  
      _TESTS = [{
          'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',
diff --git a/yt_dlp/extractor/crooksandliars.py b/yt_dlp/extractor/crooksandliars.py

index c831a3ae08109e627b42e05e25853ff4bd54061f..85c145e12a41df41aa70d7a8848b0d73e6e7cfa9 100644 (file)
--- a/yt_dlp/extractor/crooksandliars.py
+++ b/yt_dlp/extractor/crooksandliars.py
@@ -7,6 +7,8 @@
  
  class CrooksAndLiarsIE(InfoExtractor):
      _VALID_URL = r'https?://embed\.crooksandliars\.com/(?:embed|v)/(?P<id>[A-Za-z0-9]+)'
+    _EMBED_REGEX = [r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1']
+
      _TESTS = [{
          'url': 'https://embed.crooksandliars.com/embed/8RUoRhRi',
          'info_dict': {
diff --git a/yt_dlp/extractor/cspan.py b/yt_dlp/extractor/cspan.py

index cb15236179eb5a999510d7f8d8ca6cdc7dec2397..84393627a530a46b25e998e16253bebe63186fb6 100644 (file)
--- a/yt_dlp/extractor/cspan.py
+++ b/yt_dlp/extractor/cspan.py
@@ -163,7 +163,7 @@ def add_referer(formats):
                  video_id = m.group('id')
                  video_type = 'program' if m.group('type') == 'prog' else 'clip'
              else:
-                senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
+                senate_isvp_url = SenateISVPIE._extract_url(webpage)
                  if senate_isvp_url:
                      title = self._og_search_title(webpage)
                      surl = smuggle_url(senate_isvp_url, {'force_title': title})
diff --git a/yt_dlp/extractor/dailymail.py b/yt_dlp/extractor/dailymail.py

index 5451dbf00082ed2ae36014b6c23a87fefffe41bb..f25d7a8c6d15bf70fe8e751498bbef06929ff4ad 100644 (file)
--- a/yt_dlp/extractor/dailymail.py
+++ b/yt_dlp/extractor/dailymail.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..compat import compat_str
  from ..utils import (
@@ -12,6 +10,7 @@
  
  class DailyMailIE(InfoExtractor):
      _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/(?:video/[^/]+/video-|embed/video/)(?P<id>[0-9]+)'
+    _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)']
      _TESTS = [{
          'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html',
          'md5': 'f6129624562251f628296c3a9ffde124',
@@ -26,12 +25,6 @@ class DailyMailIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(
-            r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)',
-            webpage)
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
          webpage = self._download_webpage(url, video_id)
diff --git a/yt_dlp/extractor/dailymotion.py b/yt_dlp/extractor/dailymotion.py

index 46438891f9aec3620f3627add4a34dd92995e7bf..65a9feec5bc039f11466ce9f11a9c4d68d4b239c 100644 (file)
--- a/yt_dlp/extractor/dailymotion.py
+++ b/yt_dlp/extractor/dailymotion.py
@@ -99,6 +99,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
                          [/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))?
                      '''
      IE_NAME = 'dailymotion'
+    _EMBED_REGEX = [r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1']
      _TESTS = [{
          'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news',
          'md5': '074b95bdee76b9e3654137aee9c79dfe',
@@ -208,18 +209,13 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
        }
        xid'''
  
-    @staticmethod
-    def _extract_urls(webpage):
-        urls = []
-        # Look for embedded Dailymotion player
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        """Yield embedded Dailymotion video URLs: the _EMBED_REGEX matches first, then DM.player() script embeds"""
          # https://developer.dailymotion.com/player#player-parameters
-        for mobj in re.finditer(
-                r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage):
-            urls.append(unescapeHTML(mobj.group('url')))
+        yield from super()._extract_embed_urls(url, webpage)
          for mobj in re.finditer(
                  r'(?s)DM\.player\([^,]+,\s*{.*?video[\'"]?\s*:\s*["\']?(?P<id>[0-9a-zA-Z]+).+?}\s*\);', webpage):
-            urls.append('https://www.dailymotion.com/embed/video/' + mobj.group('id'))
-        return urls
+            # Plain `yield`: `yield from` on a string would emit it one character at a time
+            yield 'https://www.dailymotion.com/embed/video/' + mobj.group('id')
  
      def _real_extract(self, url):
          url, smuggled_data = unsmuggle_url(url)
@@ -378,6 +374,15 @@ class DailymotionPlaylistIE(DailymotionPlaylistBaseIE):
      }]
      _OBJECT_TYPE = 'collection'
  
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        """Yield a protocol-relative playlist URL for every playlist referenced by a jukebox widget iframe in webpage"""
+        # Look for embedded Dailymotion playlist player (#3822)
+        for mobj in re.finditer(
+                r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1',
+                webpage):
+            # One widget URL can carry several `list[]=/playlist/<id>/` parameters; emit each of them
+            for p in re.findall(r'list\[\]=/playlist/([^/]+)/', unescapeHTML(mobj.group('url'))):
+                yield '//dailymotion.com/playlist/%s' % p
+
  
  class DailymotionUserIE(DailymotionPlaylistBaseIE):
      IE_NAME = 'dailymotion:user'
diff --git a/yt_dlp/extractor/dbtv.py b/yt_dlp/extractor/dbtv.py

index 2beccd8b51566f1912c9c39b6a42eff9b1332898..18be46f7e89ed3694dae77016bfd232df14bb99f 100644 (file)
--- a/yt_dlp/extractor/dbtv.py
+++ b/yt_dlp/extractor/dbtv.py
@@ -1,10 +1,9 @@
-import re
-
  from .common import InfoExtractor
  
  
  class DBTVIE(InfoExtractor):
      _VALID_URL = r'https?://(?:www\.)?dagbladet\.no/video/(?:(?:embed|(?P<display_id>[^/]+))/)?(?P<id>[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8})'
+    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dagbladet\.no/video/embed/(?:[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8}).*?)\1']
      _TESTS = [{
          'url': 'https://www.dagbladet.no/video/PynxJnNWChE/',
          'md5': 'b8f850ba1860adbda668d367f9b77699',
@@ -28,12 +27,6 @@ class DBTVIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [url for _, url in re.findall(
-            r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?dagbladet\.no/video/embed/(?:[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8}).*?)\1',
-            webpage)]
-
      def _real_extract(self, url):
          display_id, video_id = self._match_valid_url(url).groups()
          info = {
diff --git a/yt_dlp/extractor/digiteka.py b/yt_dlp/extractor/digiteka.py

index 5d244cb08a0952174ac8f7e13b2e99c672c55409..5fbc42ffe9ef23c512fce4c508a34b9887c8d7f9 100644 (file)
--- a/yt_dlp/extractor/digiteka.py
+++ b/yt_dlp/extractor/digiteka.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import int_or_none
  
@@ -25,6 +23,7 @@ class DigitekaIE(InfoExtractor):
              )
              /id
          )/(?P<id>[\d+a-z]+)'''
+    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=["\'](?P<url>(?:https?:)?//(?:www\.)?ultimedia\.com/deliver/(?:generic|musique)(?:/[^/]+)*/(?:src|article)/[\d+a-z]+)']
      _TESTS = [{
          # news
          'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r',
@@ -58,14 +57,6 @@ class DigitekaIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<(?:iframe|script)[^>]+src=["\'](?P<url>(?:https?:)?//(?:www\.)?ultimedia\.com/deliver/(?:generic|musique)(?:/[^/]+)*/(?:src|article)/[\d+a-z]+)',
-            webpage)
-        if mobj:
-            return mobj.group('url')
-
      def _real_extract(self, url):
          mobj = self._match_valid_url(url)
          video_id = mobj.group('id')
diff --git a/yt_dlp/extractor/drtuber.py b/yt_dlp/extractor/drtuber.py

index 3149e319f0b00b4b4c11ed318f014a16e4880216..824c2be120577afb46a830441737daef473dd614 100644 (file)
--- a/yt_dlp/extractor/drtuber.py
+++ b/yt_dlp/extractor/drtuber.py
@@ -11,6 +11,7 @@
  
  class DrTuberIE(InfoExtractor):
      _VALID_URL = r'https?://(?:(?:www|m)\.)?drtuber\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[\w-]+))?'
+    _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?drtuber\.com/embed/\d+)']
      _TESTS = [{
          'url': 'http://www.drtuber.com/video/1740434/hot-perky-blonde-naked-golf',
          'md5': '93e680cf2536ad0dfb7e74d94a89facd',
@@ -33,12 +34,6 @@ class DrTuberIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(
-            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?drtuber\.com/embed/\d+)',
-            webpage)
-
      def _real_extract(self, url):
          mobj = self._match_valid_url(url)
          video_id = mobj.group('id')
diff --git a/yt_dlp/extractor/eagleplatform.py b/yt_dlp/extractor/eagleplatform.py

index e2ecd4b7cf7d80492795d3fdf4408746fc58a6bc..7e5047b560f8ae4808c3d69dea7076aff84deb0e 100644 (file)
--- a/yt_dlp/extractor/eagleplatform.py
+++ b/yt_dlp/extractor/eagleplatform.py
@@ -1,3 +1,4 @@
+import functools
  import re
  
  from .common import InfoExtractor
@@ -5,6 +6,7 @@
  from ..utils import (
      ExtractorError,
      int_or_none,
+    smuggle_url,
      unsmuggle_url,
      url_or_none,
  )
@@ -18,6 +20,7 @@ class EaglePlatformIE(InfoExtractor):
                      )
                      (?P<id>\d+)
                  '''
+    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1']
      _TESTS = [{
          # http://lenta.ru/news/2015/03/06/navalny/
          'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201',
@@ -52,14 +55,14 @@ class EaglePlatformIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_url(webpage):
-        # Regular iframe embedding
-        mobj = re.search(
-            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1',
-            webpage)
-        if mobj is not None:
-            return mobj.group('url')
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        add_referer = functools.partial(smuggle_url, data={'referrer': url})
+
+        res = tuple(super()._extract_embed_urls(url, webpage))
+        if res:
+            return map(add_referer, res)
+
          PLAYER_JS_RE = r'''
                          <script[^>]+
                              src=(?P<qjs>["\'])(?:https?:)?//(?P<host>(?:(?!(?P=qjs)).)+\.media\.eagleplatform\.com)/player/player\.js(?P=qjs)
@@ -74,7 +77,7 @@ def _extract_url(webpage):
                          data-id=["\'](?P<id>\d+)
              ''' % PLAYER_JS_RE, webpage)
          if mobj is not None:
-            return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict()
+            return [add_referer('eagleplatform:%(host)s:%(id)s' % mobj.groupdict())]
          # Generalization of "Javascript code usage", "Combined usage" and
          # "Usage without attaching to DOM" embeddings (see
          # http://dultonmedia.github.io/eplayer/)
@@ -95,7 +98,7 @@ def _extract_url(webpage):
                      </script>
              ''' % PLAYER_JS_RE, webpage)
          if mobj is not None:
-            return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict()
+            return [add_referer('eagleplatform:%(host)s:%(id)s' % mobj.groupdict())]
  
      @staticmethod
      def _handle_error(response):
@@ -201,3 +204,14 @@ def _real_extract(self, url):
              'age_limit': age_limit,
              'formats': formats,
          }
+
+
+class ClipYouEmbedIE(InfoExtractor):
+    _VALID_URL = False
+
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        mobj = re.search(
+            r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
+        if mobj is not None:
+            yield smuggle_url('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), {'referrer': url})
diff --git a/yt_dlp/extractor/embedly.py b/yt_dlp/extractor/embedly.py

index a8d1f3c55cda06fb68f014da651308c2eb0b2a61..483d018bb44c8be047acc3ed1e2f2c514cdf7387 100644 (file)
--- a/yt_dlp/extractor/embedly.py
+++ b/yt_dlp/extractor/embedly.py
@@ -1,3 +1,5 @@
+import re
+import urllib.parse
  from .common import InfoExtractor
  from ..compat import compat_urllib_parse_unquote
  
@@ -9,5 +11,14 @@ class EmbedlyIE(InfoExtractor):
          'only_matching': True,
      }]
  
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        # Bypass suitable check
+        for mobj in re.finditer(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage):
+            yield mobj.group('url')
+
+        for mobj in re.finditer(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage):
+            yield urllib.parse.unquote(mobj.group('url'))
+
      def _real_extract(self, url):
          return self.url_result(compat_urllib_parse_unquote(self._match_id(url)))
diff --git a/yt_dlp/extractor/ertgr.py b/yt_dlp/extractor/ertgr.py

index 276543653d516b6c1b6b8e9164b51afc601de965..eb52ad0310ceb1cc2fee406527a21401a895c599 100644 (file)
--- a/yt_dlp/extractor/ertgr.py
+++ b/yt_dlp/extractor/ertgr.py
@@ -15,7 +15,6 @@
      parse_iso8601,
      str_or_none,
      try_get,
-    unescapeHTML,
      url_or_none,
      variadic,
  )
@@ -275,6 +274,7 @@ class ERTWebtvEmbedIE(InfoExtractor):
      IE_DESC = 'ert.gr webtv embedded videos'
      _BASE_PLAYER_URL_RE = re.escape('//www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php')
      _VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?f=(?P<id>[^#&]+)'
+    _EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>(?:https?:)?{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)']
  
      _TESTS = [{
          'url': 'https://www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php?f=trailers/E2251_TO_DIKTYO_E09_16-01_1900.mp4&bgimg=/photos/2022/1/to_diktio_ep09_i_istoria_tou_diadiktiou_stin_Ellada_1021x576.jpg',
@@ -287,17 +287,6 @@ class ERTWebtvEmbedIE(InfoExtractor):
          },
      }]
  
-    @classmethod
-    def _extract_urls(cls, webpage):
-        EMBED_URL_RE = rf'(?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'
-        EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{EMBED_URL_RE})(?P=_q1)'
-
-        for mobj in re.finditer(EMBED_RE, webpage):
-            url = unescapeHTML(mobj.group('url'))
-            if not cls.suitable(url):
-                continue
-            yield url
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
          formats, subs = self._extract_m3u8_formats_and_subtitles(
diff --git a/yt_dlp/extractor/expressen.py b/yt_dlp/extractor/expressen.py

index 5aba21ba7333450960440551b79540ecd2493332..5381e988046624639cccac3aa84e49df68fdeaa7 100644 (file)
--- a/yt_dlp/extractor/expressen.py
+++ b/yt_dlp/extractor/expressen.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import (
      determine_ext,
@@ -17,6 +15,7 @@ class ExpressenIE(InfoExtractor):
                          tv/(?:[^/]+/)*
                          (?P<id>[^/?#&]+)
                      '''
+    _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?(?:expressen|di)\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1']
      _TESTS = [{
          'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/',
          'md5': 'deb2ca62e7b1dcd19fa18ba37523f66e',
@@ -45,13 +44,6 @@ class ExpressenIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [
-            mobj.group('url') for mobj in re.finditer(
-                r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?(?:expressen|di)\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1',
-                webpage)]
-
      def _real_extract(self, url):
          display_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py

index 5b34f3bff3f55c2d423ffb5b59c4f21769336648..d434b359aec594b154724029ed2fcb7f866f65e6 100644 (file)
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -57,6 +57,13 @@ class FacebookIE(InfoExtractor):
                  )
                  (?P<id>[0-9]+)
                  '''
+    _EMBED_REGEX = [
+        r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1',
+        # Facebook API embed https://developers.facebook.com/docs/plugins/embedded-video-player
+        r'''(?x)<div[^>]+
+                class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
+                data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''',
+    ]
      _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
      _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
      _NETRC_MACHINE = 'facebook'
@@ -311,21 +318,6 @@ class FacebookIE(InfoExtractor):
          'graphURI': '/api/graphql/'
      }
  
-    @staticmethod
-    def _extract_urls(webpage):
-        urls = []
-        for mobj in re.finditer(
-                r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1',
-                webpage):
-            urls.append(mobj.group('url'))
-        # Facebook API embed
-        # see https://developers.facebook.com/docs/plugins/embedded-video-player
-        for mobj in re.finditer(r'''(?x)<div[^>]+
-                class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
-                data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage):
-            urls.append(mobj.group('url'))
-        return urls
-
      def _perform_login(self, username, password):
          login_page_req = sanitized_Request(self._LOGIN_URL)
          self._set_cookie('facebook.com', 'locale', 'en_US')
diff --git a/yt_dlp/extractor/foxnews.py b/yt_dlp/extractor/foxnews.py

index e8513f2c2875ab1a9a795f3bad24d01ee77ee6bf..2343dd20d232ac0f6ce09be93751550f81d21ff6 100644 (file)
--- a/yt_dlp/extractor/foxnews.py
+++ b/yt_dlp/extractor/foxnews.py
@@ -56,8 +56,8 @@ class FoxNewsIE(AMPIE):
          },
      ]
  
-    @staticmethod
-    def _extract_urls(webpage):
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
          return [
              f'https://video.foxnews.com/v/video-embed.html?video_id={mobj.group("video_id")}'
              for mobj in re.finditer(
@@ -125,4 +125,4 @@ def _real_extract(self, url):
                  'http://video.foxnews.com/v/' + video_id, FoxNewsIE.ie_key())
  
          return self.url_result(
-            FoxNewsIE._extract_urls(webpage)[0], FoxNewsIE.ie_key())
+            FoxNewsIE._extract_embed_urls(url, webpage)[0], FoxNewsIE.ie_key())
diff --git a/yt_dlp/extractor/francetv.py b/yt_dlp/extractor/francetv.py

index 5902eaca07b0291604f5e49e2111d28fc9b55fe3..ba9e691613421de55e0894a63cc885e2bee65ba5 100644 (file)
--- a/yt_dlp/extractor/francetv.py
+++ b/yt_dlp/extractor/francetv.py
@@ -32,6 +32,7 @@ class FranceTVIE(InfoExtractor):
                          (?P<id>[^@]+)(?:@(?P<catalog>.+))?
                      )
                      '''
+    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1']
  
      _TESTS = [{
          # without catalog
@@ -370,7 +371,7 @@ def _real_extract(self, url):
  
          webpage = self._download_webpage(url, display_id)
  
-        dailymotion_urls = DailymotionIE._extract_urls(webpage)
+        dailymotion_urls = DailymotionIE._extract_embed_urls(url, webpage)
          if dailymotion_urls:
              return self.playlist_result([
                  self.url_result(dailymotion_url, DailymotionIE.ie_key())
diff --git a/yt_dlp/extractor/gedidigital.py b/yt_dlp/extractor/gedidigital.py

index 4ae5362b4db0550c0ffafbf51f18000cfbd9acdd..4cc67802138e5765859781cb1fbf4ec92fc68744 100644 (file)
--- a/yt_dlp/extractor/gedidigital.py
+++ b/yt_dlp/extractor/gedidigital.py
@@ -11,7 +11,7 @@
  
  
  class GediDigitalIE(InfoExtractor):
-    _VALID_URL = r'''(?x:(?P<url>(?:https?:)//video\.
+    _VALID_URL = r'''(?x:(?P<base_url>(?:https?:)//video\.
          (?:
              (?:
                  (?:espresso\.)?repubblica
@@ -34,6 +34,12 @@ class GediDigitalIE(InfoExtractor):
                  |lasentinella
              )\.gelocal
          )\.it(?:/[^/]+){2,4}/(?P<id>\d+))(?:$|[?&].*))'''
+    _EMBED_REGEX = [rf'''(?x)
+            (?:
+                data-frame-src=|
+                <iframe[^\n]+src=
+            )
+            (["'])(?P<url>{_VALID_URL})\1''']
      _TESTS = [{
          'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683',
          'md5': '84658d7fb9e55a6e57ecc77b73137494',
@@ -109,22 +115,9 @@ def _sanitize_urls(urls):
              urls[i] = urljoin(base_url(e), url_basename(e))
          return urls
  
-    @staticmethod
-    def _extract_urls(webpage):
-        entries = [
-            mobj.group('eurl')
-            for mobj in re.finditer(r'''(?x)
-            (?:
-                data-frame-src=|
-                <iframe[^\n]+src=
-            )
-            (["'])(?P<eurl>%s)\1''' % GediDigitalIE._VALID_URL, webpage)]
-        return GediDigitalIE._sanitize_urls(entries)
-
-    @staticmethod
-    def _extract_url(webpage):
-        urls = GediDigitalIE._extract_urls(webpage)
-        return urls[0] if urls else None
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        return cls._sanitize_urls(tuple(super()._extract_embed_urls(url, webpage)))
  
      @staticmethod
      def _clean_formats(formats):
@@ -139,8 +132,7 @@ def _clean_formats(formats):
          formats[:] = clean_formats
  
      def _real_extract(self, url):
-        video_id = self._match_id(url)
-        url = self._match_valid_url(url).group('url')
+        video_id, url = self._match_valid_url(url).group('id', 'base_url')
          webpage = self._download_webpage(url, video_id)
          title = self._html_search_meta(
              ['twitter:title', 'og:title'], webpage, fatal=True)
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py

index ec1cbf005f088005545c5ebe18f9ee020564880f..d3ed7ce4610b59346392f0113514071af68b44fd 100644 (file)
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -5,109 +5,9 @@
  
  from . import gen_extractor_classes
  from .common import InfoExtractor  # isort: split
-from .ant1newsgr import Ant1NewsGrEmbedIE
-from .anvato import AnvatoIE
-from .apa import APAIE
-from .arcpublishing import ArcPublishingIE
-from .arkena import ArkenaIE
-from .arte import ArteTVEmbedIE
-from .bitchute import BitChuteIE
-from .blogger import BloggerIE
  from .brightcove import BrightcoveLegacyIE, BrightcoveNewIE
-from .channel9 import Channel9IE
-from .cloudflarestream import CloudflareStreamIE
  from .commonprotocols import RtmpIE
-from .condenast import CondeNastIE
-from .dailymail import DailyMailIE
-from .dailymotion import DailymotionIE
-from .dbtv import DBTVIE
-from .digiteka import DigitekaIE
-from .drtuber import DrTuberIE
-from .eagleplatform import EaglePlatformIE
-from .ertgr import ERTWebtvEmbedIE
-from .expressen import ExpressenIE
-from .facebook import FacebookIE
-from .foxnews import FoxNewsIE
-from .gedidigital import GediDigitalIE
-from .gfycat import GfycatIE
-from .glomex import GlomexEmbedIE
-from .googledrive import GoogleDriveIE
-from .indavideo import IndavideoEmbedIE
-from .instagram import InstagramIE
-from .joj import JojIE
-from .jwplatform import JWPlatformIE
-from .kaltura import KalturaIE
-from .kinja import KinjaEmbedIE
-from .limelight import LimelightBaseIE
-from .mainstreaming import MainStreamingIE
-from .medialaan import MedialaanIE
-from .mediaset import MediasetIE
-from .mediasite import MediasiteIE
-from .megaphone import MegaphoneIE
-from .megatvcom import MegaTVComEmbedIE
-from .mofosex import MofosexEmbedIE
-from .mtv import MTVServicesEmbeddedIE
-from .myvi import MyviIE
-from .nbc import NBCSportsVPlayerIE
-from .nexx import NexxEmbedIE, NexxIE
-from .odnoklassniki import OdnoklassnikiIE
-from .onionstudios import OnionStudiosIE
-from .ooyala import OoyalaIE
-from .panopto import PanoptoBaseIE
-from .peertube import PeerTubeIE
-from .piksel import PikselIE
-from .pladform import PladformIE
-from .pornhub import PornHubIE
-from .rcs import RCSEmbedsIE
-from .redtube import RedTubeIE
-from .rumble import RumbleEmbedIE
-from .rutube import RutubeIE
-from .rutv import RUTVIE
-from .ruutu import RuutuIE
-from .senategov import SenateISVPIE
-from .simplecast import SimplecastIE
-from .soundcloud import SoundcloudEmbedIE
-from .spankwire import SpankwireIE
-from .sportbox import SportBoxIE
-from .spotify import SpotifyBaseIE
-from .springboardplatform import SpringboardPlatformIE
-from .substack import SubstackIE
-from .svt import SVTIE
-from .teachable import TeachableIE
-from .ted import TedEmbedIE
-from .theplatform import ThePlatformIE
-from .threeqsdn import ThreeQSDNIE
-from .tiktok import TikTokIE
-from .tnaflix import TNAFlixNetworkEmbedIE
-from .tube8 import Tube8IE
-from .tunein import TuneInBaseIE
-from .tvc import TVCIE
-from .tvopengr import TVOpenGrEmbedIE
-from .tvp import TVPEmbedIE
-from .twentymin import TwentyMinutenIE
-from .udn import UDNEmbedIE
-from .ustream import UstreamIE
-from .vbox7 import Vbox7IE
-from .vice import ViceIE
-from .videa import VideaIE
-from .videomore import VideomoreIE
-from .videopress import VideoPressIE
-from .viewlift import ViewLiftEmbedIE
-from .vimeo import VHXEmbedIE, VimeoIE
-from .viqeo import ViqeoIE
-from .vk import VKIE
-from .vshare import VShareIE
-from .vzaar import VzaarIE
-from .washingtonpost import WashingtonPostIE
-from .webcaster import WebcasterFeedIE
-from .wimtv import WimTVIE
-from .wistia import WistiaIE
-from .xfileshare import XFileShareIE
-from .xhamster import XHamsterEmbedIE
-from .yapfiles import YapFilesIE
-from .youporn import YouPornIE
  from .youtube import YoutubeIE
-from .zype import ZypeIE
  from ..compat import compat_etree_fromstring
  from ..utils import (
      KNOWN_EXTENSIONS,
@@ -115,7 +15,6 @@
      UnsupportedError,
      determine_ext,
      dict_get,
-    float_or_none,
      format_field,
      int_or_none,
      is_html,
@@ -1197,7 +1096,7 @@ class GenericIE(InfoExtractor):
                  'timestamp': 468923808,
                  'title': '2016_05_18 Cover L&G Business Protection V1 FINAL.mp4',
              },
-            'add_ie': [JWPlatformIE.ie_key()],
+            'add_ie': ['JWPlatform'],
          },
          {
              # Video.js embed, multiple formats
@@ -1733,7 +1632,7 @@ class GenericIE(InfoExtractor):
              'params': {
                  'skip_download': True,
              },
-            'add_ie': [ArkenaIE.ie_key()],
+            'add_ie': ['Arkena'],
          },
          {
              'url': 'http://nova.bg/news/view/2016/08/16/156543/%D0%BD%D0%B0-%D0%BA%D0%BE%D1%81%D1%8A%D0%BC-%D0%BE%D1%82-%D0%B2%D0%B7%D1%80%D0%B8%D0%B2-%D0%BE%D1%82%D1%86%D0%B5%D0%BF%D0%B8%D1%85%D0%B0-%D1%86%D1%8F%D0%BB-%D0%BA%D0%B2%D0%B0%D1%80%D1%82%D0%B0%D0%BB-%D0%B7%D0%B0%D1%80%D0%B0%D0%B4%D0%B8-%D0%B8%D0%B7%D1%82%D0%B8%D1%87%D0%B0%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%B3%D0%B0%D0%B7-%D0%B2-%D0%BF%D0%BB%D0%BE%D0%B2%D0%B4%D0%B8%D0%B2/',
@@ -1745,7 +1644,7 @@ class GenericIE(InfoExtractor):
              'params': {
                  'skip_download': True,
              },
-            'add_ie': [Vbox7IE.ie_key()],
+            'add_ie': ['Vbox7'],
          },
          {
              # DBTV embeds
@@ -1777,7 +1676,7 @@ class GenericIE(InfoExtractor):
              'params': {
                  'skip_download': True,
              },
-            'add_ie': [TwentyMinutenIE.ie_key()],
+            'add_ie': ['TwentyMinuten'],
          },
          {
              # VideoPress embed
@@ -1792,7 +1691,7 @@ class GenericIE(InfoExtractor):
              'params': {
                  'skip_download': True,
              },
-            'add_ie': [VideoPressIE.ie_key()],
+            'add_ie': ['VideoPress'],
          },
          {
              # Rutube embed
@@ -1809,7 +1708,7 @@ class GenericIE(InfoExtractor):
              'params': {
                  'skip_download': True,
              },
-            'add_ie': [RutubeIE.ie_key()],
+            'add_ie': ['Rutube'],
          },
          {
              # glomex:embed
@@ -1881,7 +1780,7 @@ class GenericIE(InfoExtractor):
                  'ext': 'mp4',
                  'title': 'Integrated Senate Video Player',
              },
-            'add_ie': [SenateISVPIE.ie_key()],
+            'add_ie': ['SenateISVP'],
          },
          {
              # Limelight embeds (1 channel embed + 4 media embeds)
@@ -1928,7 +1827,7 @@ class GenericIE(InfoExtractor):
                  'uploader': 'The Washington Post',
                  'upload_date': '20160211',
              },
-            'add_ie': [WashingtonPostIE.ie_key()],
+            'add_ie': ['WashingtonPost'],
          },
          {
              # Mediaset embed
@@ -1941,7 +1840,7 @@ class GenericIE(InfoExtractor):
              'params': {
                  'skip_download': True,
              },
-            'add_ie': [MediasetIE.ie_key()],
+            'add_ie': ['Mediaset'],
          },
          {
              # JOJ.sk embeds
@@ -1951,7 +1850,7 @@ class GenericIE(InfoExtractor):
                  'title': 'Slovenskom sa prehnala vlna silných búrok',
              },
              'playlist_mincount': 5,
-            'add_ie': [JojIE.ie_key()],
+            'add_ie': ['Joj'],
          },
          {
              # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video)
@@ -2017,7 +1916,7 @@ class GenericIE(InfoExtractor):
              'params': {
                  'skip_download': True,
              },
-            'add_ie': [SpringboardPlatformIE.ie_key()],
+            'add_ie': ['SpringboardPlatform'],
          },
          {
              'url': 'https://www.yapfiles.ru/show/1872528/690b05d3054d2dbe1e69523aa21bb3b1.mp4.html',
@@ -2026,7 +1925,7 @@ class GenericIE(InfoExtractor):
                  'ext': 'mp4',
                  'title': 'Котята',
              },
-            'add_ie': [YapFilesIE.ie_key()],
+            'add_ie': ['YapFiles'],
              'params': {
                  'skip_download': True,
              },
@@ -2039,7 +1938,7 @@ class GenericIE(InfoExtractor):
                  'ext': 'mp4',
                  'title': '31c9291ab41fac05471db4e73aa11717',
              },
-            'add_ie': [CloudflareStreamIE.ie_key()],
+            'add_ie': ['CloudflareStream'],
              'params': {
                  'skip_download': True,
              },
@@ -2066,7 +1965,7 @@ class GenericIE(InfoExtractor):
                  'uploader': 'StreetKitchen',
                  'uploader_id': '546363',
              },
-            'add_ie': [IndavideoEmbedIE.ie_key()],
+            'add_ie': ['IndavideoEmbed'],
              'params': {
                  'skip_download': True,
              },
@@ -2441,10 +2340,10 @@ class GenericIE(InfoExtractor):
              # Panopto embeds
              'url': 'https://www.monash.edu/learning-teaching/teachhq/learning-technologies/panopto/how-to/insert-a-quiz-into-a-panopto-video',
              'info_dict': {
-                'title': 'Insert a quiz into a Panopto video',
-                'id': 'insert-a-quiz-into-a-panopto-video'
+                'ext': 'mp4',
+                'id': '0bd3f16c-824a-436a-8486-ac5900693aef',
+                'title': 'Quizzes in Panopto',
              },
-            'playlist_count': 1
          },
          {
              # Ruutu embed
@@ -2529,24 +2428,17 @@ class GenericIE(InfoExtractor):
          },
          {
              'url': 'https://www.skimag.com/video/ski-people-1980/',
+            'md5': '022a7e31c70620ebec18deeab376ee03',
              'info_dict': {
-                'id': 'ski-people-1980',
-                'title': 'Ski People (1980)',
-            },
-            'playlist_count': 1,
-            'playlist': [{
-                'md5': '022a7e31c70620ebec18deeab376ee03',
-                'info_dict': {
-                    'id': 'YTmgRiNU',
-                    'ext': 'mp4',
-                    'title': '1980 Ski People',
-                    'timestamp': 1610407738,
-                    'description': 'md5:cf9c3d101452c91e141f292b19fe4843',
-                    'thumbnail': 'https://cdn.jwplayer.com/v2/media/YTmgRiNU/poster.jpg?width=720',
-                    'duration': 5688.0,
-                    'upload_date': '20210111',
-                }
-            }]
+                'id': 'YTmgRiNU',
+                'ext': 'mp4',
+                'title': '1980 Ski People',
+                'timestamp': 1610407738,
+                'description': 'md5:cf9c3d101452c91e141f292b19fe4843',
+                'thumbnail': 'https://cdn.jwplayer.com/v2/media/YTmgRiNU/poster.jpg?width=720',
+                'duration': 5688.0,
+                'upload_date': '20210111',
+            }
          },
          {
              'note': 'Rumble embed',
@@ -2888,14 +2780,8 @@ def _real_extract(self, url):
              r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>',
              lambda x: unescapeHTML(x.group(0)), webpage)
  
-        # TODO: Remove
-        video_title, video_description, video_thumbnail, age_limit, video_uploader = \
-            info_dict['title'], info_dict['description'], info_dict['thumbnail'], info_dict['age_limit'], domain_name
-
-        # TODO: Move Embeds
-        self._downloader.write_debug('Looking for single embeds')
-
-        # Look for Brightcove Legacy Studio embeds
+        # TODO: Move to respective extractors
+        self._downloader.write_debug('Looking for Brightcove embeds')
          bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
          if bc_urls:
              entries = [{
@@ -2906,853 +2792,17 @@ def _real_extract(self, url):
  
              return {
                  '_type': 'playlist',
-                'title': video_title,
+                'title': info_dict['title'],
                  'id': video_id,
                  'entries': entries,
              }
-
-        # Look for Brightcove New Studio embeds
          bc_urls = BrightcoveNewIE._extract_brightcove_urls(self, webpage)
          if bc_urls:
              return self.playlist_from_matches(
-                bc_urls, video_id, video_title,
+                bc_urls, video_id, info_dict['title'],
                  getter=lambda x: smuggle_url(x, {'referrer': url}),
                  ie='BrightcoveNew')
  
-        # Look for Nexx embeds
-        nexx_urls = NexxIE._extract_urls(webpage)
-        if nexx_urls:
-            return self.playlist_from_matches(nexx_urls, video_id, video_title, ie=NexxIE.ie_key())
-
-        # Look for Nexx iFrame embeds
-        nexx_embed_urls = NexxEmbedIE._extract_urls(webpage)
-        if nexx_embed_urls:
-            return self.playlist_from_matches(nexx_embed_urls, video_id, video_title, ie=NexxEmbedIE.ie_key())
-
-        # Look for ThePlatform embeds
-        tp_urls = ThePlatformIE._extract_urls(webpage)
-        if tp_urls:
-            return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform')
-
-        arc_urls = ArcPublishingIE._extract_urls(webpage)
-        if arc_urls:
-            return self.playlist_from_matches(arc_urls, video_id, video_title, ie=ArcPublishingIE.ie_key())
-
-        mychannels_urls = MedialaanIE._extract_urls(webpage)
-        if mychannels_urls:
-            return self.playlist_from_matches(
-                mychannels_urls, video_id, video_title, ie=MedialaanIE.ie_key())
-
-        # Look for embedded rtl.nl player
-        matches = re.findall(
-            r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"',
-            webpage)
-        if matches:
-            return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl')
-
-        vimeo_urls = VimeoIE._extract_urls(url, webpage)
-        if vimeo_urls:
-            return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key())
-
-        vhx_url = VHXEmbedIE._extract_url(url, webpage)
-        if vhx_url:
-            return self.url_result(vhx_url, VHXEmbedIE.ie_key())
-
-        # Invidious Instances
-        # https://github.com/yt-dlp/yt-dlp/issues/195
-        # https://github.com/iv-org/invidious/pull/1730
-        youtube_url = self._search_regex(
-            r'<link rel="alternate" href="(https://www\.youtube\.com/watch\?v=[0-9A-Za-z_-]{11})"',
-            webpage, 'youtube link', default=None)
-        if youtube_url:
-            return self.url_result(youtube_url, YoutubeIE.ie_key())
-
-        # Look for YouTube embeds
-        youtube_urls = YoutubeIE._extract_urls(webpage)
-        if youtube_urls:
-            return self.playlist_from_matches(
-                youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key())
-
-        matches = DailymotionIE._extract_urls(webpage)
-        if matches:
-            return self.playlist_from_matches(matches, video_id, video_title)
-
-        # Look for embedded Dailymotion playlist player (#3822)
-        m = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
-        if m:
-            playlists = re.findall(
-                r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
-            if playlists:
-                return self.playlist_from_matches(
-                    playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
-
-        # Look for DailyMail embeds
-        dailymail_urls = DailyMailIE._extract_urls(webpage)
-        if dailymail_urls:
-            return self.playlist_from_matches(
-                dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key())
-
-        # Look for Teachable embeds, must be before Wistia
-        teachable_url = TeachableIE._extract_url(webpage, url)
-        if teachable_url:
-            return self.url_result(teachable_url)
-
-        # Look for embedded Wistia player
-        wistia_urls = WistiaIE._extract_urls(webpage)
-        if wistia_urls:
-            playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key())
-            playlist['entries'] = list(playlist['entries'])
-            for entry in playlist['entries']:
-                entry.update({
-                    '_type': 'url_transparent',
-                    'uploader': video_uploader,
-                })
-            return playlist
-
-        # Look for SVT player
-        svt_url = SVTIE._extract_url(webpage)
-        if svt_url:
-            return self.url_result(svt_url, 'SVT')
-
-        # Look for Bandcamp pages with custom domain
-        mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
-        if mobj is not None:
-            burl = unescapeHTML(mobj.group(1))
-            # Don't set the extractor because it can be a track url or an album
-            return self.url_result(burl)
-
-        # Check for Substack custom domains
-        substack_url = SubstackIE._extract_url(webpage, url)
-        if substack_url:
-            return self.url_result(substack_url, SubstackIE)
-
-        # Look for embedded Vevo player
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'))
-
-        # Look for embedded Viddler player
-        mobj = re.search(
-            r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
-            webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'))
-
-        # Look for NYTimes player
-        mobj = re.search(
-            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
-            webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'))
-
-        # Look for Libsyn player
-        mobj = re.search(
-            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'))
-
-        # Look for Ooyala videos
-        mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage)
-                or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage)
-                or re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage)
-                or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage)
-                or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
-        if mobj is not None:
-            embed_token = self._search_regex(
-                r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)',
-                webpage, 'ooyala embed token', default=None)
-            return OoyalaIE._build_url_result(smuggle_url(
-                mobj.group('ec'), {
-                    'domain': url,
-                    'embed_token': embed_token,
-                }))
-
-        # Look for multiple Ooyala embeds on SBN network websites
-        mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
-        if mobj is not None:
-            embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
-            if embeds:
-                return self.playlist_from_matches(
-                    embeds, video_id, video_title,
-                    getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala')
-
-        # Look for Aparat videos
-        mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group(1), 'Aparat')
-
-        # Look for MPORA videos
-        mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group(1), 'Mpora')
-
-        # Look for embedded Facebook player
-        facebook_urls = FacebookIE._extract_urls(webpage)
-        if facebook_urls:
-            return self.playlist_from_matches(facebook_urls, video_id, video_title)
-
-        # Look for embedded VK player
-        mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'), 'VK')
-
-        # Look for embedded Odnoklassniki player
-        odnoklassniki_url = OdnoklassnikiIE._extract_url(webpage)
-        if odnoklassniki_url:
-            return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key())
-
-        # Look for sibnet embedded player
-        sibnet_urls = VKIE._extract_sibnet_urls(webpage)
-        if sibnet_urls:
-            return self.playlist_from_matches(sibnet_urls, video_id, video_title)
-
-        # Look for embedded ivi player
-        mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'), 'Ivi')
-
-        # Look for embedded Huffington Post player
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'), 'HuffPost')
-
-        # Look for embed.ly
-        mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'))
-        mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
-        if mobj is not None:
-            return self.url_result(urllib.parse.unquote(mobj.group('url')))
-
-        # Look for funnyordie embed
-        matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
-        if matches:
-            return self.playlist_from_matches(
-                matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie')
-
-        # Look for Simplecast embeds
-        simplecast_urls = SimplecastIE._extract_urls(webpage)
-        if simplecast_urls:
-            return self.playlist_from_matches(
-                simplecast_urls, video_id, video_title)
-
-        # Look for BBC iPlayer embed
-        matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
-        if matches:
-            return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk')
-
-        # Look for embedded RUTV player
-        rutv_url = RUTVIE._extract_url(webpage)
-        if rutv_url:
-            return self.url_result(rutv_url, 'RUTV')
-
-        # Look for embedded TVC player
-        tvc_url = TVCIE._extract_url(webpage)
-        if tvc_url:
-            return self.url_result(tvc_url, 'TVC')
-
-        # Look for embedded SportBox player
-        sportbox_urls = SportBoxIE._extract_urls(webpage)
-        if sportbox_urls:
-            return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key())
-
-        # Look for embedded Spotify player
-        spotify_urls = SpotifyBaseIE._extract_urls(webpage)
-        if spotify_urls:
-            return self.playlist_from_matches(spotify_urls, video_id, video_title)
-
-        # Look for embedded XHamster player
-        xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
-        if xhamster_urls:
-            return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed')
-
-        # Look for embedded TNAFlixNetwork player
-        tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage)
-        if tnaflix_urls:
-            return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key())
-
-        # Look for embedded PornHub player
-        pornhub_urls = PornHubIE._extract_urls(webpage)
-        if pornhub_urls:
-            return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key())
-
-        # Look for embedded DrTuber player
-        drtuber_urls = DrTuberIE._extract_urls(webpage)
-        if drtuber_urls:
-            return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key())
-
-        # Look for embedded RedTube player
-        redtube_urls = RedTubeIE._extract_urls(webpage)
-        if redtube_urls:
-            return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key())
-
-        # Look for embedded Tube8 player
-        tube8_urls = Tube8IE._extract_urls(webpage)
-        if tube8_urls:
-            return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key())
-
-        # Look for embedded Mofosex player
-        mofosex_urls = MofosexEmbedIE._extract_urls(webpage)
-        if mofosex_urls:
-            return self.playlist_from_matches(mofosex_urls, video_id, video_title, ie=MofosexEmbedIE.ie_key())
-
-        # Look for embedded Spankwire player
-        spankwire_urls = SpankwireIE._extract_urls(webpage)
-        if spankwire_urls:
-            return self.playlist_from_matches(spankwire_urls, video_id, video_title, ie=SpankwireIE.ie_key())
-
-        # Look for embedded YouPorn player
-        youporn_urls = YouPornIE._extract_urls(webpage)
-        if youporn_urls:
-            return self.playlist_from_matches(youporn_urls, video_id, video_title, ie=YouPornIE.ie_key())
-
-        # Look for embedded Tvigle player
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'), 'Tvigle')
-
-        # Look for embedded TED player
-        ted_urls = TedEmbedIE._extract_urls(webpage)
-        if ted_urls:
-            return self.playlist_from_matches(ted_urls, video_id, video_title, ie=TedEmbedIE.ie_key())
-
-        # Look for embedded Ustream videos
-        ustream_url = UstreamIE._extract_url(webpage)
-        if ustream_url:
-            return self.url_result(ustream_url, UstreamIE.ie_key())
-
-        # Look for embedded arte.tv player
-        arte_urls = ArteTVEmbedIE._extract_urls(webpage)
-        if arte_urls:
-            return self.playlist_from_matches(arte_urls, video_id, video_title)
-
-        # Look for embedded francetv player
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1',
-            webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'))
-
-        # Look for embedded Myvi.ru player
-        myvi_url = MyviIE._extract_url(webpage)
-        if myvi_url:
-            return self.url_result(myvi_url)
-
-        # Look for embedded soundcloud player
-        soundcloud_urls = SoundcloudEmbedIE._extract_urls(webpage)
-        if soundcloud_urls:
-            return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML)
-
-        # Look for tunein player
-        tunein_urls = TuneInBaseIE._extract_urls(webpage)
-        if tunein_urls:
-            return self.playlist_from_matches(tunein_urls, video_id, video_title)
-
-        # Look for embedded mtvservices player
-        mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage)
-        if mtvservices_url:
-            return self.url_result(mtvservices_url, ie='MTVServicesEmbedded')
-
-        # Look for embedded yahoo player
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
-            webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'), 'Yahoo')
-
-        # Look for embedded sbs.com.au player
-        mobj = re.search(
-            r'''(?x)
-            (?:
-                <meta\s+property="og:video"\s+content=|
-                <iframe[^>]+?src=
-            )
-            (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
-            webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'), 'SBS')
-
-        # Look for embedded Cinchcast player
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
-            webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'), 'Cinchcast')
-
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
-            webpage)
-        if not mobj:
-            mobj = re.search(
-                r'data-video-link=["\'](?P<url>http://m\.mlb\.com/video/[^"\']+)',
-                webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'), 'MLB')
-
-        mobj = re.search(
-            r'<(?:iframe|script)[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
-            webpage)
-        if mobj is not None:
-            return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
-
-        mobj = re.search(
-            r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"',
-            webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'), 'Livestream')
-
-        # Look for Zapiks embed
-        mobj = re.search(
-            r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'), 'Zapiks')
-
-        # Look for Kaltura embeds
-        kaltura_urls = KalturaIE._extract_urls(webpage)
-        if kaltura_urls:
-            return self.playlist_from_matches(
-                kaltura_urls, video_id, video_title,
-                getter=lambda x: smuggle_url(x, {'source_url': url}),
-                ie=KalturaIE.ie_key())
-
-        # Look for EaglePlatform embeds
-        eagleplatform_url = EaglePlatformIE._extract_url(webpage)
-        if eagleplatform_url:
-            return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key())
-
-        # Look for ClipYou (uses EaglePlatform) embeds
-        mobj = re.search(
-            r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
-        if mobj is not None:
-            return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
-
-        # Look for Pladform embeds
-        pladform_url = PladformIE._extract_url(webpage)
-        if pladform_url:
-            return self.url_result(pladform_url)
-
-        # Look for Videomore embeds
-        videomore_url = VideomoreIE._extract_url(webpage)
-        if videomore_url:
-            return self.url_result(videomore_url)
-
-        # Look for Webcaster embeds
-        webcaster_url = WebcasterFeedIE._extract_url(self, webpage)
-        if webcaster_url:
-            return self.url_result(webcaster_url, ie=WebcasterFeedIE.ie_key())
-
-        # Look for Playwire embeds
-        mobj = re.search(
-            r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'))
-
-        # Look for Crooks and Liars embeds
-        mobj = re.search(
-            r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'))
-
-        # Look for NBC Sports VPlayer embeds
-        nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
-        if nbc_sports_url:
-            return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
-
-        # Look for NBC News embeds
-        nbc_news_embed_url = re.search(
-            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1', webpage)
-        if nbc_news_embed_url:
-            return self.url_result(nbc_news_embed_url.group('url'), 'NBCNews')
-
-        # Look for Google Drive embeds
-        google_drive_url = GoogleDriveIE._extract_url(webpage)
-        if google_drive_url:
-            return self.url_result(google_drive_url, 'GoogleDrive')
-
-        # Look for UDN embeds
-        mobj = re.search(
-            r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage)
-        if mobj is not None:
-            return self.url_result(
-                urllib.parse.urljoin(url, mobj.group('url')), 'UDNEmbed')
-
-        # Look for Senate ISVP iframe
-        senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
-        if senate_isvp_url:
-            return self.url_result(senate_isvp_url, 'SenateISVP')
-
-        # Look for Kinja embeds
-        kinja_embed_urls = KinjaEmbedIE._extract_urls(webpage, url)
-        if kinja_embed_urls:
-            return self.playlist_from_matches(
-                kinja_embed_urls, video_id, video_title)
-
-        # Look for OnionStudios embeds
-        onionstudios_url = OnionStudiosIE._extract_url(webpage)
-        if onionstudios_url:
-            return self.url_result(onionstudios_url)
-
-        # Look for Blogger embeds
-        blogger_urls = BloggerIE._extract_urls(webpage)
-        if blogger_urls:
-            return self.playlist_from_matches(blogger_urls, video_id, video_title, ie=BloggerIE.ie_key())
-
-        # Look for ViewLift embeds
-        viewlift_url = ViewLiftEmbedIE._extract_url(webpage)
-        if viewlift_url:
-            return self.url_result(viewlift_url)
-
-        # Look for JWPlatform embeds
-        jwplatform_urls = JWPlatformIE._extract_urls(webpage)
-        if jwplatform_urls:
-            return self.playlist_from_matches(jwplatform_urls, video_id, video_title, ie=JWPlatformIE.ie_key())
-
-        # Look for Digiteka embeds
-        digiteka_url = DigitekaIE._extract_url(webpage)
-        if digiteka_url:
-            return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key())
-
-        # Look for Arkena embeds
-        arkena_url = ArkenaIE._extract_url(webpage)
-        if arkena_url:
-            return self.url_result(arkena_url, ArkenaIE.ie_key())
-
-        # Look for Piksel embeds
-        piksel_url = PikselIE._extract_url(webpage)
-        if piksel_url:
-            return self.url_result(piksel_url, PikselIE.ie_key())
-
-        # Look for Limelight embeds
-        limelight_urls = LimelightBaseIE._extract_urls(webpage, url)
-        if limelight_urls:
-            return self.playlist_result(
-                limelight_urls, video_id, video_title, video_description)
-
-        # Look for Anvato embeds
-        anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id)
-        if anvato_urls:
-            return self.playlist_result(
-                anvato_urls, video_id, video_title, video_description)
-
-        # Look for AdobeTVVideo embeds
-        mobj = re.search(
-            r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
-            webpage)
-        if mobj is not None:
-            return self.url_result(
-                self._proto_relative_url(unescapeHTML(mobj.group(1))),
-                'AdobeTVVideo')
-
-        # Look for Vine embeds
-        mobj = re.search(
-            r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))',
-            webpage)
-        if mobj is not None:
-            return self.url_result(
-                self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine')
-
-        # Look for VODPlatform embeds
-        mobj = re.search(
-            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:(?:www\.)?vod-platform\.net|embed\.kwikmotion\.com)/[eE]mbed/.+?)\1',
-            webpage)
-        if mobj is not None:
-            return self.url_result(
-                self._proto_relative_url(unescapeHTML(mobj.group('url'))), 'VODPlatform')
-
-        # Look for Mangomolo embeds
-        mobj = re.search(
-            r'''(?x)<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//
-                (?:
-                    admin\.mangomolo\.com/analytics/index\.php/customers/embed|
-                    player\.mangomolo\.com/v1
-                )/
-                (?:
-                    video\?.*?\bid=(?P<video_id>\d+)|
-                    (?:index|live)\?.*?\bchannelid=(?P<channel_id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+)
-                ).+?)\1''', webpage)
-        if mobj is not None:
-            info = {
-                '_type': 'url_transparent',
-                'url': self._proto_relative_url(unescapeHTML(mobj.group('url'))),
-                'title': video_title,
-                'description': video_description,
-                'thumbnail': video_thumbnail,
-                'uploader': video_uploader,
-            }
-            video_id = mobj.group('video_id')
-            if video_id:
-                info.update({
-                    'ie_key': 'MangomoloVideo',
-                    'id': video_id,
-                })
-            else:
-                info.update({
-                    'ie_key': 'MangomoloLive',
-                    'id': mobj.group('channel_id'),
-                })
-            return info
-
-        # Look for Instagram embeds
-        instagram_embed_url = InstagramIE._extract_embed_url(webpage)
-        if instagram_embed_url is not None:
-            return self.url_result(
-                self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key())
-
-        # Look for 3Q SDN embeds
-        threeqsdn_url = ThreeQSDNIE._extract_url(webpage)
-        if threeqsdn_url:
-            return {
-                '_type': 'url_transparent',
-                'ie_key': ThreeQSDNIE.ie_key(),
-                'url': self._proto_relative_url(threeqsdn_url),
-                'title': video_title,
-                'description': video_description,
-                'thumbnail': video_thumbnail,
-                'uploader': video_uploader,
-            }
-
-        # Look for VBOX7 embeds
-        vbox7_url = Vbox7IE._extract_url(webpage)
-        if vbox7_url:
-            return self.url_result(vbox7_url, Vbox7IE.ie_key())
-
-        # Look for DBTV embeds
-        dbtv_urls = DBTVIE._extract_urls(webpage)
-        if dbtv_urls:
-            return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key())
-
-        # Look for Videa embeds
-        videa_urls = VideaIE._extract_urls(webpage)
-        if videa_urls:
-            return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key())
-
-        # Look for 20 minuten embeds
-        twentymin_urls = TwentyMinutenIE._extract_urls(webpage)
-        if twentymin_urls:
-            return self.playlist_from_matches(
-                twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key())
-
-        # Look for VideoPress embeds
-        videopress_urls = VideoPressIE._extract_urls(webpage)
-        if videopress_urls:
-            return self.playlist_from_matches(
-                videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key())
-
-        # Look for Rutube embeds
-        rutube_urls = RutubeIE._extract_urls(webpage)
-        if rutube_urls:
-            return self.playlist_from_matches(
-                rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())
-
-        # Look for Glomex embeds
-        glomex_urls = list(GlomexEmbedIE._extract_urls(webpage, url))
-        if glomex_urls:
-            return self.playlist_from_matches(
-                glomex_urls, video_id, video_title, ie=GlomexEmbedIE.ie_key())
-
-        # Look for megatv.com embeds
-        megatvcom_urls = list(MegaTVComEmbedIE._extract_urls(webpage))
-        if megatvcom_urls:
-            return self.playlist_from_matches(
-                megatvcom_urls, video_id, video_title, ie=MegaTVComEmbedIE.ie_key())
-
-        # Look for ant1news.gr embeds
-        ant1newsgr_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage))
-        if ant1newsgr_urls:
-            return self.playlist_from_matches(
-                ant1newsgr_urls, video_id, video_title, ie=Ant1NewsGrEmbedIE.ie_key())
-
-        # Look for WashingtonPost embeds
-        wapo_urls = WashingtonPostIE._extract_urls(webpage)
-        if wapo_urls:
-            return self.playlist_from_matches(
-                wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key())
-
-        # Look for Mediaset embeds
-        mediaset_urls = MediasetIE._extract_urls(self, webpage)
-        if mediaset_urls:
-            return self.playlist_from_matches(
-                mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key())
-
-        # Look for JOJ.sk embeds
-        joj_urls = JojIE._extract_urls(webpage)
-        if joj_urls:
-            return self.playlist_from_matches(
-                joj_urls, video_id, video_title, ie=JojIE.ie_key())
-
-        # Look for megaphone.fm embeds
-        mpfn_urls = MegaphoneIE._extract_urls(webpage)
-        if mpfn_urls:
-            return self.playlist_from_matches(
-                mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key())
-
-        # Look for vzaar embeds
-        vzaar_urls = VzaarIE._extract_urls(webpage)
-        if vzaar_urls:
-            return self.playlist_from_matches(
-                vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key())
-
-        channel9_urls = Channel9IE._extract_urls(webpage)
-        if channel9_urls:
-            return self.playlist_from_matches(
-                channel9_urls, video_id, video_title, ie=Channel9IE.ie_key())
-
-        vshare_urls = VShareIE._extract_urls(webpage)
-        if vshare_urls:
-            return self.playlist_from_matches(
-                vshare_urls, video_id, video_title, ie=VShareIE.ie_key())
-
-        # Look for Mediasite embeds
-        mediasite_urls = MediasiteIE._extract_urls(webpage)
-        if mediasite_urls:
-            entries = [
-                self.url_result(smuggle_url(
-                    urllib.parse.urljoin(url, mediasite_url),
-                    {'UrlReferrer': url}), ie=MediasiteIE.ie_key())
-                for mediasite_url in mediasite_urls]
-            return self.playlist_result(entries, video_id, video_title)
-
-        springboardplatform_urls = SpringboardPlatformIE._extract_urls(webpage)
-        if springboardplatform_urls:
-            return self.playlist_from_matches(
-                springboardplatform_urls, video_id, video_title,
-                ie=SpringboardPlatformIE.ie_key())
-
-        yapfiles_urls = YapFilesIE._extract_urls(webpage)
-        if yapfiles_urls:
-            return self.playlist_from_matches(
-                yapfiles_urls, video_id, video_title, ie=YapFilesIE.ie_key())
-
-        vice_urls = ViceIE._extract_urls(webpage)
-        if vice_urls:
-            return self.playlist_from_matches(
-                vice_urls, video_id, video_title, ie=ViceIE.ie_key())
-
-        xfileshare_urls = XFileShareIE._extract_urls(webpage)
-        if xfileshare_urls:
-            return self.playlist_from_matches(
-                xfileshare_urls, video_id, video_title, ie=XFileShareIE.ie_key())
-
-        cloudflarestream_urls = CloudflareStreamIE._extract_urls(webpage)
-        if cloudflarestream_urls:
-            return self.playlist_from_matches(
-                cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key())
-
-        peertube_urls = PeerTubeIE._extract_urls(webpage, url)
-        if peertube_urls:
-            return self.playlist_from_matches(
-                peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key())
-
-        indavideo_urls = IndavideoEmbedIE._extract_urls(webpage)
-        if indavideo_urls:
-            return self.playlist_from_matches(
-                indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key())
-
-        apa_urls = APAIE._extract_urls(webpage)
-        if apa_urls:
-            return self.playlist_from_matches(
-                apa_urls, video_id, video_title, ie=APAIE.ie_key())
-
-        foxnews_urls = FoxNewsIE._extract_urls(webpage)
-        if foxnews_urls:
-            return self.playlist_from_matches(
-                foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key())
-
-        sharevideos_urls = [sharevideos_mobj.group('url') for sharevideos_mobj in re.finditer(
-            r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1',
-            webpage)]
-        if sharevideos_urls:
-            return self.playlist_from_matches(
-                sharevideos_urls, video_id, video_title)
-
-        viqeo_urls = ViqeoIE._extract_urls(webpage)
-        if viqeo_urls:
-            return self.playlist_from_matches(
-                viqeo_urls, video_id, video_title, ie=ViqeoIE.ie_key())
-
-        expressen_urls = ExpressenIE._extract_urls(webpage)
-        if expressen_urls:
-            return self.playlist_from_matches(
-                expressen_urls, video_id, video_title, ie=ExpressenIE.ie_key())
-
-        zype_urls = ZypeIE._extract_urls(webpage)
-        if zype_urls:
-            return self.playlist_from_matches(
-                zype_urls, video_id, video_title, ie=ZypeIE.ie_key())
-
-        gedi_urls = GediDigitalIE._extract_urls(webpage)
-        if gedi_urls:
-            return self.playlist_from_matches(
-                gedi_urls, video_id, video_title, ie=GediDigitalIE.ie_key())
-
-        # Look for RCS media group embeds
-        rcs_urls = RCSEmbedsIE._extract_urls(webpage)
-        if rcs_urls:
-            return self.playlist_from_matches(
-                rcs_urls, video_id, video_title, ie=RCSEmbedsIE.ie_key())
-
-        wimtv_urls = WimTVIE._extract_urls(webpage)
-        if wimtv_urls:
-            return self.playlist_from_matches(
-                wimtv_urls, video_id, video_title, ie=WimTVIE.ie_key())
-
-        bitchute_urls = BitChuteIE._extract_urls(webpage)
-        if bitchute_urls:
-            return self.playlist_from_matches(
-                bitchute_urls, video_id, video_title, ie=BitChuteIE.ie_key())
-
-        rumble_urls = RumbleEmbedIE._extract_urls(webpage)
-        if len(rumble_urls) == 1:
-            return self.url_result(rumble_urls[0], RumbleEmbedIE.ie_key())
-        if rumble_urls:
-            return self.playlist_from_matches(
-                rumble_urls, video_id, video_title, ie=RumbleEmbedIE.ie_key())
-
-        # Look for (tvopen|ethnos).gr embeds
-        tvopengr_urls = list(TVOpenGrEmbedIE._extract_urls(webpage))
-        if tvopengr_urls:
-            return self.playlist_from_matches(tvopengr_urls, video_id, video_title, ie=TVOpenGrEmbedIE.ie_key())
-
-        # Look for ert.gr webtv embeds
-        ertwebtv_urls = list(ERTWebtvEmbedIE._extract_urls(webpage))
-        if len(ertwebtv_urls) == 1:
-            return self.url_result(self._proto_relative_url(ertwebtv_urls[0]), video_title=video_title, url_transparent=True)
-        elif ertwebtv_urls:
-            return self.playlist_from_matches(ertwebtv_urls, video_id, video_title, ie=ERTWebtvEmbedIE.ie_key())
-
-        tvp_urls = TVPEmbedIE._extract_urls(webpage)
-        if tvp_urls:
-            return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key())
-
-        # Look for MainStreaming embeds
-        mainstreaming_urls = MainStreamingIE._extract_urls(webpage)
-        if mainstreaming_urls:
-            return self.playlist_from_matches(mainstreaming_urls, video_id, video_title, ie=MainStreamingIE.ie_key())
-
-        # Look for Gfycat Embeds
-        gfycat_urls = GfycatIE._extract_urls(webpage)
-        if gfycat_urls:
-            return self.playlist_from_matches(gfycat_urls, video_id, video_title, ie=GfycatIE.ie_key())
-
-        panopto_urls = PanoptoBaseIE._extract_urls(webpage)
-        if panopto_urls:
-            return self.playlist_from_matches(panopto_urls, video_id, video_title)
-
-        # Look for Ruutu embeds
-        ruutu_urls = RuutuIE._extract_urls(webpage)
-        if ruutu_urls:
-            return self.playlist_from_matches(ruutu_urls, video_id, video_title)
-
-        # Look for Tiktok embeds
-        tiktok_urls = TikTokIE._extract_urls(webpage)
-        if tiktok_urls:
-            return self.playlist_from_matches(tiktok_urls, video_id, video_title)
-        # TODO: END: Move Embeds
-
          self._downloader.write_debug('Looking for embeds')
          embeds = []
          for ie in gen_extractor_classes():
@@ -3784,7 +2834,7 @@ def _real_extract(self, url):
                  return {
                      **info_dict,
                      '_type': 'url',
-                    'ie_key': JWPlatformIE.ie_key(),
+                    'ie_key': 'JWPlatform',
                      'url': jwplayer_data['playlist'],
                  }
              try:
@@ -4045,9 +3095,9 @@ def filter_video(urls):
  
              entry_info_dict = {
                  'id': video_id,
-                'uploader': video_uploader,
-                'title': video_title,
-                'age_limit': age_limit,
+                'uploader': domain_name,
+                'title': info_dict['title'],
+                'age_limit': info_dict['age_limit'],
                  'http_headers': headers,
              }
  
diff --git a/yt_dlp/extractor/gfycat.py b/yt_dlp/extractor/gfycat.py

index 60f06ccd78cedbe172fae424a013691ca4123ed6..9d091c113a3d1f314e7fc6176086dd7d14b8880e 100644 (file)
--- a/yt_dlp/extractor/gfycat.py
+++ b/yt_dlp/extractor/gfycat.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import (
      int_or_none,
@@ -11,6 +9,7 @@
  
  class GfycatIE(InfoExtractor):
      _VALID_URL = r'https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?i:ru/|ifr/|gifs/detail/)?(?P<id>[^-/?#\."\']+)'
+    _EMBED_REGEX = [rf'<(?:iframe|source)[^>]+\bsrc=["\'](?P<url>{_VALID_URL})']
      _TESTS = [{
          'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher',
          'info_dict': {
@@ -82,14 +81,6 @@ class GfycatIE(InfoExtractor):
          'only_matching': True
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [
-            mobj.group('url')
-            for mobj in re.finditer(
-                r'<(?:iframe|source)[^>]+\bsrc=["\'](?P<url>%s)' % GfycatIE._VALID_URL,
-                webpage)]
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/glomex.py b/yt_dlp/extractor/glomex.py

index 85ffa4c0524f7237903da1182ab5ee0d88bc4125..86fe1b0243e012b78eef9396714f5d4679dbcda9 100644 (file)
--- a/yt_dlp/extractor/glomex.py
+++ b/yt_dlp/extractor/glomex.py
@@ -174,7 +174,7 @@ def build_player_url(cls, video_id, integration, origin_url=None):
          return cls._smuggle_origin_url(f'https:{cls._BASE_PLAYER_URL}?{query_string}', origin_url)
  
      @classmethod
-    def _extract_urls(cls, webpage, origin_url):
+    def _extract_embed_urls(cls, url, webpage):
          # https://docs.glomex.com/publisher/video-player-integration/javascript-api/
          quot_re = r'["\']'
  
@@ -183,9 +183,9 @@ def _extract_urls(cls, webpage, origin_url):
                  (?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=q)).)+
              )(?P=q)'''
          for mobj in re.finditer(regex, webpage):
-            url = unescapeHTML(mobj.group('url'))
-            if cls.suitable(url):
-                yield cls._smuggle_origin_url(url, origin_url)
+            embed_url = unescapeHTML(mobj.group('url'))
+            if cls.suitable(embed_url):
+                yield cls._smuggle_origin_url(embed_url, url)
  
          regex = fr'''(?x)
              <glomex-player [^>]+?>|
@@ -193,7 +193,7 @@ def _extract_urls(cls, webpage, origin_url):
          for mobj in re.finditer(regex, webpage):
              attrs = extract_attributes(mobj.group(0))
              if attrs.get('data-integration-id') and attrs.get('data-playlist-id'):
-                yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], origin_url)
+                yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], url)
  
          # naive parsing of inline scripts for hard-coded integration parameters
          regex = fr'''(?x)
@@ -206,7 +206,7 @@ def _extract_urls(cls, webpage, origin_url):
                  continue
              playlist_id = re.search(regex % 'playlistId', script)
              if playlist_id:
-                yield cls.build_player_url(playlist_id, integration_id, origin_url)
+                yield cls.build_player_url(playlist_id, integration_id, url)
  
      def _real_extract(self, url):
          url, origin_url = self._unsmuggle_origin_url(url)
diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py

index d7475b6da76428161111d86cc75b7f3cd085f1de..cb123b874d28547dcab31cd926cd39e79da0c178 100644 (file)
--- a/yt_dlp/extractor/googledrive.py
+++ b/yt_dlp/extractor/googledrive.py
@@ -77,13 +77,13 @@ class GoogleDriveIE(InfoExtractor):
      _caption_formats_ext = []
      _captions_xml = None
  
-    @staticmethod
-    def _extract_url(webpage):
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
          mobj = re.search(
              r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
              webpage)
          if mobj:
-            return 'https://drive.google.com/file/d/%s' % mobj.group('id')
+            yield 'https://drive.google.com/file/d/%s' % mobj.group('id')
  
      def _download_subtitles_xml(self, video_id, subtitles_id, hl):
          if self._captions_xml:
diff --git a/yt_dlp/extractor/heise.py b/yt_dlp/extractor/heise.py

index 84e5d3023a504395d047d1836efb54537f852845..a80eaaf81d596ca9f6d6b32d25d3c97e33615e52 100644 (file)
--- a/yt_dlp/extractor/heise.py
+++ b/yt_dlp/extractor/heise.py
@@ -121,7 +121,7 @@ def _make_kaltura_result(kaltura_url):
          if kaltura_id:
              return _make_kaltura_result('kaltura:2238431:%s' % kaltura_id)
  
-        yt_urls = YoutubeIE._extract_urls(webpage)
+        yt_urls = tuple(YoutubeIE._extract_embed_urls(url, webpage))  # materialize: a lazy generator would always be truthy in the check below
          if yt_urls:
              return self.playlist_from_matches(
                  yt_urls, video_id, title, ie=YoutubeIE.ie_key())
diff --git a/yt_dlp/extractor/huffpost.py b/yt_dlp/extractor/huffpost.py

index 7286dbcd71719de7dda101a0d353c9c171a4d6c7..27ebc8b6c2ce022ade6569c332c7674d2daca2db 100644 (file)
--- a/yt_dlp/extractor/huffpost.py
+++ b/yt_dlp/extractor/huffpost.py
@@ -17,6 +17,7 @@ class HuffPostIE(InfoExtractor):
              HPLEmbedPlayer/\?segmentId=
          )
          (?P<id>[0-9a-f]+)'''
+    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1']
  
      _TEST = {
          'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677',
diff --git a/yt_dlp/extractor/indavideo.py b/yt_dlp/extractor/indavideo.py

index fb041a1821de3be8201bd27af80571c1eff1bf98..b397c168c4a2653b809d71a2d5f2a7b262331a76 100644 (file)
--- a/yt_dlp/extractor/indavideo.py
+++ b/yt_dlp/extractor/indavideo.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..compat import compat_str
  from ..utils import (
@@ -12,6 +10,14 @@
  
  class IndavideoEmbedIE(InfoExtractor):
      _VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P<id>[\da-f]+)'
+    # Some example URLs covered by generic extractor:
+    #   http://indavideo.hu/video/Vicces_cica_1
+    #   http://index.indavideo.hu/video/2015_0728_beregszasz
+    #   http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko
+    #   http://erotika.indavideo.hu/video/Amator_tini_punci
+    #   http://film.indavideo.hu/video/f_hrom_nagymamm_volt
+    #   http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes
+    _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)']
      _TESTS = [{
          'url': 'http://indavideo.hu/player/video/1bdc3c6d80/',
          'md5': 'c8a507a1c7410685f83a06eaeeaafeab',
@@ -37,20 +43,6 @@ class IndavideoEmbedIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    # Some example URLs covered by generic extractor:
-    #   http://indavideo.hu/video/Vicces_cica_1
-    #   http://index.indavideo.hu/video/2015_0728_beregszasz
-    #   http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko
-    #   http://erotika.indavideo.hu/video/Amator_tini_punci
-    #   http://film.indavideo.hu/video/f_hrom_nagymamm_volt
-    #   http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes
-
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(
-            r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)',
-            webpage)
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py

index 04afacb9049cc9e4dd2f957b3895a2bad091406f..94db756403add923bcad616caf5bfb06795491be 100644 (file)
--- a/yt_dlp/extractor/instagram.py
+++ b/yt_dlp/extractor/instagram.py
@@ -243,6 +243,7 @@ def _real_extract(self, url):
  
  class InstagramIE(InstagramBaseIE):
      _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com(?:/[^/]+)?/(?:p|tv|reel)/(?P<id>[^/?#&]+))'
+    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1']
      _TESTS = [{
          'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
          'md5': '0d2da106a9d2631273e192b372806516',
@@ -346,23 +347,16 @@ class InstagramIE(InstagramBaseIE):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_embed_url(webpage):
-        mobj = re.search(
-            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1',
-            webpage)
-        if mobj:
-            return mobj.group('url')
-
-        blockquote_el = get_element_by_attribute(
-            'class', 'instagram-media', webpage)
-        if blockquote_el is None:
-            return
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        res = tuple(super()._extract_embed_urls(url, webpage))
+        if res:
+            return res
  
-        mobj = re.search(
-            r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el)
+        mobj = re.search(r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1',
+                         get_element_by_attribute('class', 'instagram-media', webpage) or '')
          if mobj:
-            return mobj.group('link')
+            return [mobj.group('link')]
  
      def _real_extract(self, url):
          video_id, url = self._match_valid_url(url).group('id', 'url')
diff --git a/yt_dlp/extractor/ivi.py b/yt_dlp/extractor/ivi.py

index 69974694383efa3309f3a50076c963dab7f569d4..6772fcbb9670363b060f14b5fd731196a06f9c9d 100644 (file)
--- a/yt_dlp/extractor/ivi.py
+++ b/yt_dlp/extractor/ivi.py
@@ -13,6 +13,7 @@ class IviIE(InfoExtractor):
      IE_DESC = 'ivi.ru'
      IE_NAME = 'ivi'
      _VALID_URL = r'https?://(?:www\.)?ivi\.(?:ru|tv)/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P<id>\d+)'
+    _EMBED_REGEX = [r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1']
      _GEO_BYPASS = False
      _GEO_COUNTRIES = ['RU']
      _LIGHT_KEY = b'\xf1\x02\x32\xb7\xbc\x5c\x7a\xe8\xf7\x96\xc1\x33\x2b\x27\xa1\x8c'
diff --git a/yt_dlp/extractor/joj.py b/yt_dlp/extractor/joj.py

index 1c4676e95a1de5936f6b8382b6bdae4774211922..298b3782321455ad9005a963d447a90bd454c032 100644 (file)
--- a/yt_dlp/extractor/joj.py
+++ b/yt_dlp/extractor/joj.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..compat import compat_str
  from ..utils import (
@@ -18,6 +16,7 @@ class JojIE(InfoExtractor):
                      )
                      (?P<id>[^/?#^]+)
                  '''
+    _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1']
      _TESTS = [{
          'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932',
          'info_dict': {
@@ -38,14 +37,6 @@ class JojIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [
-            mobj.group('url')
-            for mobj in re.finditer(
-                r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1',
-                webpage)]
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/jwplatform.py b/yt_dlp/extractor/jwplatform.py

index 2cb7ca3d7a6013b98027be6b345e19da566fe30a..d6b8420a87f4fd3a3e359c75c89ee03f8ce21f04 100644 (file)
--- a/yt_dlp/extractor/jwplatform.py
+++ b/yt_dlp/extractor/jwplatform.py
@@ -22,13 +22,8 @@ class JWPlatformIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_url(webpage):
-        urls = JWPlatformIE._extract_urls(webpage)
-        return urls[0] if urls else None
-
-    @staticmethod
-    def _extract_urls(webpage):
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
          for tag, key in ((r'(?:script|iframe)', 'src'), ('input', 'value')):
              # <input value=URL> is used by hyland.com
              # if we find <iframe>, dont look for <input>
diff --git a/yt_dlp/extractor/kaltura.py b/yt_dlp/extractor/kaltura.py

index f4092aa7178fd13243a95484583b236ca2418fe1..f62c9791c55730cddc4bdae6c477139ca7c70919 100644 (file)
--- a/yt_dlp/extractor/kaltura.py
+++ b/yt_dlp/extractor/kaltura.py
@@ -111,13 +111,8 @@ class KalturaIE(InfoExtractor):
          }
      ]
  
-    @staticmethod
-    def _extract_url(webpage):
-        urls = KalturaIE._extract_urls(webpage)
-        return urls[0] if urls else None
-
-    @staticmethod
-    def _extract_urls(webpage):
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
          # Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site
          finditer = (
              list(re.finditer(
@@ -159,14 +154,14 @@ def _extract_urls(webpage):
              for k, v in embed_info.items():
                  if v:
                      embed_info[k] = v.strip()
-            url = 'kaltura:%(partner_id)s:%(id)s' % embed_info
+            embed_url = 'kaltura:%(partner_id)s:%(id)s' % embed_info
              escaped_pid = re.escape(embed_info['partner_id'])
              service_mobj = re.search(
                  r'<script[^>]+src=(["\'])(?P<id>(?:https?:)?//(?:(?!\1).)+)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid),
                  webpage)
              if service_mobj:
-                url = smuggle_url(url, {'service_url': service_mobj.group('id')})
-            urls.append(url)
+                embed_url = smuggle_url(embed_url, {'service_url': service_mobj.group('id')})
+            urls.append(embed_url)
          return urls
  
      def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs):
diff --git a/yt_dlp/extractor/kinja.py b/yt_dlp/extractor/kinja.py

index c00abfbc11ca2da4adcafc7da5a8892fd1da414e..3747d8eeaa6a0236095a32cef3a749899c45bde8 100644 (file)
--- a/yt_dlp/extractor/kinja.py
+++ b/yt_dlp/extractor/kinja.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..compat import (
      compat_str,
@@ -10,8 +8,6 @@
      parse_iso8601,
      strip_or_none,
      try_get,
-    unescapeHTML,
-    urljoin,
  )
  
  
@@ -55,6 +51,7 @@ class KinjaEmbedIE(InfoExtractor):
              vine|
              youtube-(?:list|video)
          )-(?P<id>[^&]+)''' % (_DOMAIN_REGEX, _COMMON_REGEX)
+    _EMBED_REGEX = [rf'(?x)<iframe[^>]+?src=(?P<q>["\'])(?P<url>(?:(?:https?:)?//{_DOMAIN_REGEX})?{_COMMON_REGEX}(?:(?!\1).)+)\1']
      _TESTS = [{
          'url': 'https://kinja.com/ajax/inset/iframe?id=fb-10103303356633621',
          'only_matching': True,
@@ -119,12 +116,6 @@ class KinjaEmbedIE(InfoExtractor):
          'youtube-video': ('youtube.com/embed/', 'Youtube'),
      }
  
-    @staticmethod
-    def _extract_urls(webpage, url):
-        return [urljoin(url, unescapeHTML(mobj.group('url'))) for mobj in re.finditer(
-            r'(?x)<iframe[^>]+?src=(?P<q>["\'])(?P<url>(?:(?:https?:)?//%s)?%s(?:(?!\1).)+)\1' % (KinjaEmbedIE._DOMAIN_REGEX, KinjaEmbedIE._COMMON_REGEX),
-            webpage)]
-
      def _real_extract(self, url):
          video_type, video_id = self._match_valid_url(url).groups()
  
diff --git a/yt_dlp/extractor/libsyn.py b/yt_dlp/extractor/libsyn.py

index 8245a34817ec436c7186cbbabe812bb0efa1ee64..29bbb03defdd67fee2bef63702e2cb52dd555e20 100644 (file)
--- a/yt_dlp/extractor/libsyn.py
+++ b/yt_dlp/extractor/libsyn.py
@@ -10,6 +10,7 @@
  
  class LibsynIE(InfoExtractor):
      _VALID_URL = r'(?P<mainurl>https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+))'
+    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1']
  
      _TESTS = [{
          'url': 'http://html5-player.libsyn.com/embed/episode/id/6385796/',
diff --git a/yt_dlp/extractor/limelight.py b/yt_dlp/extractor/limelight.py

index 25667fc0766b29a0cb348c8f54c145864f5f68cd..90065094b60006641d72bffdc28f6ffea8d005b3 100644 (file)
--- a/yt_dlp/extractor/limelight.py
+++ b/yt_dlp/extractor/limelight.py
@@ -17,7 +17,7 @@ class LimelightBaseIE(InfoExtractor):
      _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s'
  
      @classmethod
-    def _extract_urls(cls, webpage, source_url):
+    def _extract_embed_urls(cls, url, webpage):
          lm = {
              'Media': 'media',
              'Channel': 'channel',
@@ -25,7 +25,7 @@ def _extract_urls(cls, webpage, source_url):
          }
  
          def smuggle(url):
-            return smuggle_url(url, {'source_url': source_url})
+            return smuggle_url(url, {'source_url': url})
  
          entries = []
          for kind, video_id in re.findall(
diff --git a/yt_dlp/extractor/livestream.py b/yt_dlp/extractor/livestream.py

index 4b90c22c56646588c8cbe8ed12bdc5d382b22a46..70449dce59607ca8b0ee430003c58733e7be0204 100644 (file)
--- a/yt_dlp/extractor/livestream.py
+++ b/yt_dlp/extractor/livestream.py
@@ -23,6 +23,8 @@
  class LivestreamIE(InfoExtractor):
      IE_NAME = 'livestream'
      _VALID_URL = r'https?://(?:new\.)?livestream\.com/(?:accounts/(?P<account_id>\d+)|(?P<account_name>[^/]+))/(?:events/(?P<event_id>\d+)|(?P<event_name>[^/]+))(?:/videos/(?P<id>\d+))?'
+    _EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"']
+
      _TESTS = [{
          'url': 'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370',
          'md5': '53274c76ba7754fb0e8d072716f2292b',
diff --git a/yt_dlp/extractor/mainstreaming.py b/yt_dlp/extractor/mainstreaming.py

index c144c75928b118ab1420c2b67aa97c7faa682dc3..213a1df5723e9655e05186fddc2213a92d1e0796 100644 (file)
--- a/yt_dlp/extractor/mainstreaming.py
+++ b/yt_dlp/extractor/mainstreaming.py
@@ -14,6 +14,7 @@
  
  class MainStreamingIE(InfoExtractor):
      _VALID_URL = r'https?://(?:webtools-?)?(?P<host>[A-Za-z0-9-]*\.msvdn.net)/(?:embed|amp_embed|content)/(?P<id>\w+)'
+    _EMBED_REGEX = [rf'<iframe[^>]+?src=["\']?(?P<url>{_VALID_URL})["\']?']
      IE_DESC = 'MainStreaming Player'
  
      _TESTS = [
@@ -102,13 +103,6 @@ class MainStreamingIE(InfoExtractor):
          }
      ]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        mobj = re.findall(
-            r'<iframe[^>]+?src=["\']?(?P<url>%s)["\']?' % MainStreamingIE._VALID_URL, webpage)
-        if mobj:
-            return [group[0] for group in mobj]
-
      def _playlist_entries(self, host, playlist_content):
          for entry in playlist_content:
              content_id = entry.get('contentID')
diff --git a/yt_dlp/extractor/mangomolo.py b/yt_dlp/extractor/mangomolo.py

index a392e9b54290dc2855849bd331974beb9a397ae0..568831aa8a20d12a47dbd649ee735f3bf258c379 100644 (file)
--- a/yt_dlp/extractor/mangomolo.py
+++ b/yt_dlp/extractor/mangomolo.py
@@ -3,11 +3,29 @@
      compat_b64decode,
      compat_urllib_parse_unquote,
  )
-from ..utils import int_or_none
+from ..utils import classproperty, int_or_none
  
  
  class MangomoloBaseIE(InfoExtractor):
-    _BASE_REGEX = r'https?://(?:admin\.mangomolo\.com/analytics/index\.php/customers/embed/|player\.mangomolo\.com/v1/)'
+    _BASE_REGEX = r'(?:https?:)?//(?:admin\.mangomolo\.com/analytics/index\.php/customers/embed/|player\.mangomolo\.com/v1/)'
+    _SLUG = None
+
+    @classproperty
+    def _VALID_URL(cls):
+        return f'{cls._BASE_REGEX}{cls._SLUG}'
+
+    @classproperty
+    def _EMBED_REGEX(cls):
+        return [rf'<iframe[^>]+src=(["\'])(?P<url>{cls._VALID_URL}.+?)\1']
+
+    def _extract_from_webpage(self, url, webpage):
+        for res in super()._extract_from_webpage(url, webpage):
+            yield {
+                **res,
+                '_type': 'url_transparent',
+                'id': self._search_regex(self._SLUG, res['url'], 'id', group='id'),
+                'uploader': self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader'),
+            }
  
      def _get_real_id(self, page_id):
          return page_id
@@ -41,14 +59,15 @@ def _real_extract(self, url):
  class MangomoloVideoIE(MangomoloBaseIE):
      _TYPE = 'video'
      IE_NAME = 'mangomolo:' + _TYPE
-    _VALID_URL = MangomoloBaseIE._BASE_REGEX + r'video\?.*?\bid=(?P<id>\d+)'
+    _SLUG = r'video\?.*?\bid=(?P<id>\d+)'
+
      _IS_LIVE = False
  
  
  class MangomoloLiveIE(MangomoloBaseIE):
      _TYPE = 'live'
      IE_NAME = 'mangomolo:' + _TYPE
-    _VALID_URL = MangomoloBaseIE._BASE_REGEX + r'(live|index)\?.*?\bchannelid=(?P<id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+)'
+    _SLUG = r'(?:live|index)\?.*?\bchannelid=(?P<id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+)'
      _IS_LIVE = True
  
      def _get_real_id(self, page_id):
diff --git a/yt_dlp/extractor/medialaan.py b/yt_dlp/extractor/medialaan.py

index 297f8c4b25ab6583e4978d35e97baec355586161..6daa50846644657977c572a335d829ade7321dcc 100644 (file)
--- a/yt_dlp/extractor/medialaan.py
+++ b/yt_dlp/extractor/medialaan.py
@@ -69,8 +69,8 @@ class MedialaanIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
          entries = []
          for element in re.findall(r'(<div[^>]+data-mychannels-type="video"[^>]*>)', webpage):
              mychannels_id = extract_attributes(element).get('data-mychannels-id')
diff --git a/yt_dlp/extractor/mediaset.py b/yt_dlp/extractor/mediaset.py

index f396c1bd339d138d05d79b8af124ba877799cbc8..4e549fe5e2277a4930ea5eacdb87a57e7cdcc952 100644 (file)
--- a/yt_dlp/extractor/mediaset.py
+++ b/yt_dlp/extractor/mediaset.py
@@ -167,8 +167,7 @@ class MediasetIE(ThePlatformBaseIE):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(ie, webpage):
+    def _extract_from_webpage(self, url, webpage):
          def _qs(url):
              return parse_qs(url)
  
@@ -188,8 +187,7 @@ def _program_guid(qs):
              video_id = embed_qs.get('id', [None])[0]
              if not video_id:
                  continue
-            urlh = ie._request_webpage(
-                embed_url, video_id, note='Following embed URL redirect')
+            urlh = self._request_webpage(embed_url, video_id, note='Following embed URL redirect')
              embed_url = urlh.geturl()
              program_guid = _program_guid(_qs(embed_url))
              if program_guid:
diff --git a/yt_dlp/extractor/mediasite.py b/yt_dlp/extractor/mediasite.py

index 30464bad0e745880ac2176fdc38704eb20308c70..0ffd01cd294ef7b70634d106564b743fdeb5a9f7 100644 (file)
--- a/yt_dlp/extractor/mediasite.py
+++ b/yt_dlp/extractor/mediasite.py
@@ -13,7 +13,7 @@
      str_or_none,
      try_call,
      try_get,
-    unescapeHTML,
+    smuggle_url,
      unsmuggle_url,
      url_or_none,
      urljoin,
@@ -25,6 +25,7 @@
  
  class MediasiteIE(InfoExtractor):
      _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/[^/#?]+/Presentation)/(?P<id>%s)(?P<query>\?[^#]+|)' % _ID_RE
+    _EMBED_REGEX = [r'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/%s(?:\?.*?)?)\1' % _ID_RE]
      _TESTS = [
          {
              'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d',
@@ -112,13 +113,10 @@ class MediasiteIE(InfoExtractor):
          5: 'video3',
      }
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [
-            unescapeHTML(mobj.group('url'))
-            for mobj in re.finditer(
-                r'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/%s(?:\?.*?)?)\1' % _ID_RE,
-                webpage)]
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        for embed_url in super()._extract_embed_urls(url, webpage):
+            yield smuggle_url(embed_url, {'UrlReferrer': url})
  
      def __extract_slides(self, *, stream_id, snum, Stream, duration, images):
          slide_base_url = Stream['SlideBaseUrl']
diff --git a/yt_dlp/extractor/megaphone.py b/yt_dlp/extractor/megaphone.py

index 0c150ef45c377dab2c68413c6861565649c5bef1..af80523e3275a06aa4f750a12ce960e0ce27171a 100644 (file)
--- a/yt_dlp/extractor/megaphone.py
+++ b/yt_dlp/extractor/megaphone.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import js_to_json
  
@@ -8,6 +6,7 @@ class MegaphoneIE(InfoExtractor):
      IE_NAME = 'megaphone.fm'
      IE_DESC = 'megaphone.fm embedded players'
      _VALID_URL = r'https://player\.megaphone\.fm/(?P<id>[A-Z0-9]+)'
+    _EMBED_REGEX = [rf'<iframe[^>]*?\ssrc=["\'](?P<url>{_VALID_URL})']
      _TEST = {
          'url': 'https://player.megaphone.fm/GLT9749789991?"',
          'md5': '4816a0de523eb3e972dc0dda2c191f96',
@@ -45,8 +44,3 @@ def _real_extract(self, url):
              'duration': episode_data['duration'],
              'formats': formats,
          }
-
-    @classmethod
-    def _extract_urls(cls, webpage):
-        return [m[0] for m in re.findall(
-            r'<iframe[^>]*?\ssrc=["\'](%s)' % cls._VALID_URL, webpage)]
diff --git a/yt_dlp/extractor/megatvcom.py b/yt_dlp/extractor/megatvcom.py

index ec481d016caa6cf1abc04bcb1c08aefb55403306..54c7b7f9fe2f3df7e1f8e18ff6ee799e9d0995c7 100644 (file)
--- a/yt_dlp/extractor/megatvcom.py
+++ b/yt_dlp/extractor/megatvcom.py
@@ -104,7 +104,7 @@ class MegaTVComEmbedIE(MegaTVComBaseIE):
      IE_NAME = 'megatvcom:embed'
      IE_DESC = 'megatv.com embedded videos'
      _VALID_URL = r'(?:https?:)?//(?:www\.)?megatv\.com/embed/?\?p=(?P<id>\d+)'
-    _EMBED_RE = re.compile(rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''')
+    _EMBED_REGEX = [rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''']
  
      _TESTS = [{
          'url': 'https://www.megatv.com/embed/?p=2020520979',
@@ -134,11 +134,6 @@ class MegaTVComEmbedIE(MegaTVComBaseIE):
          },
      }]
  
-    @classmethod
-    def _extract_urls(cls, webpage):
-        for mobj in cls._EMBED_RE.finditer(webpage):
-            yield unescapeHTML(mobj.group('url'))
-
      def _match_canonical_url(self, webpage):
          LINK_RE = r'''(?x)
          <link(?:
diff --git a/yt_dlp/extractor/mlb.py b/yt_dlp/extractor/mlb.py

index 5fb97083a092815504b7f6573d93ca8d92206011..dd1f54f871bc1f164ac2dfc57ddcdb2b0e515e2d 100644 (file)
--- a/yt_dlp/extractor/mlb.py
+++ b/yt_dlp/extractor/mlb.py
@@ -92,6 +92,10 @@ class MLBIE(MLBBaseIE):
                              (?P<id>\d+)
                          )
                      '''
+    _EMBED_REGEX = [
+        r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
+        r'data-video-link=["\'](?P<url>http://m\.mlb\.com/video/[^"\']+)',
+    ]
      _TESTS = [
          {
              'url': 'https://www.mlb.com/mariners/video/ackleys-spectacular-catch/c-34698933',
diff --git a/yt_dlp/extractor/mofosex.py b/yt_dlp/extractor/mofosex.py

index 66a098c9766441e9853fa5202e914f97e91ad93f..4221ef3e36eb5eb5b82d6ee39614600d84f5a364 100644 (file)
--- a/yt_dlp/extractor/mofosex.py
+++ b/yt_dlp/extractor/mofosex.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import (
      int_or_none,
@@ -59,17 +57,12 @@ def _real_extract(self, url):
  
  class MofosexEmbedIE(InfoExtractor):
      _VALID_URL = r'https?://(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=(?P<id>\d+)'
+    _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=\d+)']
      _TESTS = [{
          'url': 'https://www.mofosex.com/embed/?videoid=318131&referrer=KM',
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(
-            r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=\d+)',
-            webpage)
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
          return self.url_result(
diff --git a/yt_dlp/extractor/mtv.py b/yt_dlp/extractor/mtv.py

index d161c33c19c97484b0ca5bfe86dbb9a5aa304a6a..10cd304eb0b011ab02376e2fdd424e48441e9ed5 100644 (file)
--- a/yt_dlp/extractor/mtv.py
+++ b/yt_dlp/extractor/mtv.py
@@ -331,6 +331,7 @@ def _real_extract(self, url):
  class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
      IE_NAME = 'mtvservices:embedded'
      _VALID_URL = r'https?://media\.mtvnservices\.com/embed/(?P<mgid>.+?)(\?|/|$)'
+    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1']
  
      _TEST = {
          # From http://www.thewrap.com/peter-dinklage-sums-up-game-of-thrones-in-45-seconds-video/
@@ -346,13 +347,6 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
          },
      }
  
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1', webpage)
-        if mobj:
-            return mobj.group('url')
-
      def _get_feed_url(self, uri, url=None):
          video_id = self._id_from_uri(uri)
          config = self._download_json(
diff --git a/yt_dlp/extractor/myvi.py b/yt_dlp/extractor/myvi.py

index b31cf4493ace8b9754b2dc0fa39f5f4c9a6cb324..df7200be20b065b074ddb8f314350e363bbee926 100644 (file)
--- a/yt_dlp/extractor/myvi.py
+++ b/yt_dlp/extractor/myvi.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from .vimple import SprutoBaseIE
  
@@ -26,6 +24,7 @@ class MyviIE(SprutoBaseIE):
                          )
                          (?P<id>[\da-zA-Z_-]+)
                      '''
+    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//myvi\.(?:ru/player|tv)/(?:embed/html|flash)/[^"]+)\1']
      _TESTS = [{
          'url': 'http://myvi.ru/player/embed/html/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0',
          'md5': '571bbdfba9f9ed229dc6d34cc0f335bf',
@@ -56,13 +55,6 @@ class MyviIE(SprutoBaseIE):
          'only_matching': True,
      }]
  
-    @classmethod
-    def _extract_url(cls, webpage):
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//myvi\.(?:ru/player|tv)/(?:embed/html|flash)/[^"]+)\1', webpage)
-        if mobj:
-            return mobj.group('url')
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py

index 365c2e60d58fe2b583b8922071e2751f4d2a5427..910cbedf67dd34a6675ba5c0bc34826a9133c837 100644 (file)
--- a/yt_dlp/extractor/nbc.py
+++ b/yt_dlp/extractor/nbc.py
@@ -184,6 +184,7 @@ def _real_extract(self, url):
  class NBCSportsVPlayerIE(InfoExtractor):
      _VALID_URL_BASE = r'https?://(?:vplayer\.nbcsports\.com|(?:www\.)?nbcsports\.com/vplayer)/'
      _VALID_URL = _VALID_URL_BASE + r'(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)'
+    _EMBED_REGEX = [r'(?:iframe[^>]+|var video|div[^>]+data-(?:mpx-)?)[sS]rc\s?=\s?"(?P<url>%s[^\"]+)' % _VALID_URL_BASE]
  
      _TESTS = [{
          'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI',
@@ -207,13 +208,6 @@ class NBCSportsVPlayerIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_url(webpage):
-        video_urls = re.search(
-            r'(?:iframe[^>]+|var video|div[^>]+data-(?:mpx-)?)[sS]rc\s?=\s?"(?P<url>%s[^\"]+)' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage)
-        if video_urls:
-            return video_urls.group('url')
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
          webpage = self._download_webpage(url, video_id)
@@ -317,6 +311,7 @@ def _real_extract(self, url):
  
  class NBCNewsIE(ThePlatformIE):
      _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P<id>[^/?]+)'
+    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1']
  
      _TESTS = [
          {
diff --git a/yt_dlp/extractor/nexx.py b/yt_dlp/extractor/nexx.py

index 01376be3da4325c25bcb92a703e6050455560ea6..69c48652cad038b27cf5ecb875c464b1c83c668d 100644 (file)
--- a/yt_dlp/extractor/nexx.py
+++ b/yt_dlp/extractor/nexx.py
@@ -114,8 +114,8 @@ def _extract_domain_id(webpage):
              webpage)
          return mobj.group('id') if mobj else None
  
-    @staticmethod
-    def _extract_urls(webpage):
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
          # Reference:
          # 1. https://nx-s.akamaized.net/files/201510/44.pdf
  
@@ -135,10 +135,6 @@ def _extract_urls(webpage):
  
          return entries
  
-    @staticmethod
-    def _extract_url(webpage):
-        return NexxIE._extract_urls(webpage)[0]
-
      def _handle_error(self, response):
          if traverse_obj(response, ('metadata', 'notice'), expected_type=str):
              self.report_warning('%s said: %s' % (self.IE_NAME, response['metadata']['notice']))
@@ -498,6 +494,8 @@ def find_video(result):
  
  class NexxEmbedIE(InfoExtractor):
      _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:video/)?(?P<id>[^/?#&]+)'
+    # Reference. https://nx-s.akamaized.net/files/201510/44.pdf
+    _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:(?!\1).)+)\1']
      _TESTS = [{
          'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1',
          'md5': '16746bfc28c42049492385c989b26c4a',
@@ -521,16 +519,6 @@ class NexxEmbedIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        # Reference:
-        # 1. https://nx-s.akamaized.net/files/201510/44.pdf
-
-        # iFrame Embed Integration
-        return [mobj.group('url') for mobj in re.finditer(
-            r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:(?!\1).)+)\1',
-            webpage)]
-
      def _real_extract(self, url):
          embed_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/nytimes.py b/yt_dlp/extractor/nytimes.py

index f388688c4c4f916775bff69da56c19138006d6c8..fe6986a8277257c8a9e2a8b22ea863e30c16313e 100644 (file)
--- a/yt_dlp/extractor/nytimes.py
+++ b/yt_dlp/extractor/nytimes.py
@@ -103,6 +103,7 @@ def get_file_size(file_size):
  
  class NYTimesIE(NYTimesBaseIE):
      _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'
+    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>']
  
      _TESTS = [{
          'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
diff --git a/yt_dlp/extractor/odnoklassniki.py b/yt_dlp/extractor/odnoklassniki.py

index 36a7f5f4e7a70d5780cb5654299e87760b8c5f2b..4faec914edb7d34c93012d1662deac954a4d3e96 100644 (file)
--- a/yt_dlp/extractor/odnoklassniki.py
+++ b/yt_dlp/extractor/odnoklassniki.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..compat import (
      compat_etree_fromstring,
@@ -31,6 +29,7 @@ class OdnoklassnikiIE(InfoExtractor):
                      )
                      (?P<id>[\d-]+)
                  '''
+    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1']
      _TESTS = [{
          'note': 'Coub embedded',
          'url': 'http://ok.ru/video/1484130554189',
@@ -161,13 +160,6 @@ class OdnoklassnikiIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage)
-        if mobj:
-            return mobj.group('url')
-
      def _real_extract(self, url):
          try:
              return self._extract_desktop(url)
diff --git a/yt_dlp/extractor/onionstudios.py b/yt_dlp/extractor/onionstudios.py

index 9776b4d975fd451857af5df51e46cc36bda8d19e..5fa49e142300063e17dbfb8b82a42aea6e63b62e 100644 (file)
--- a/yt_dlp/extractor/onionstudios.py
+++ b/yt_dlp/extractor/onionstudios.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..compat import compat_str
  from ..utils import js_to_json
@@ -7,6 +5,7 @@
  
  class OnionStudiosIE(InfoExtractor):
      _VALID_URL = r'https?://(?:www\.)?onionstudios\.com/(?:video(?:s/[^/]+-|/)|embed\?.*\bid=)(?P<id>\d+)(?!-)'
+    _EMBED_REGEX = [r'(?s)<(?:iframe|bulbs-video)[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?onionstudios\.com/(?:embed.+?|video/\d+\.json))\1']
  
      _TESTS = [{
          'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937',
@@ -29,13 +28,6 @@ class OnionStudiosIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'(?s)<(?:iframe|bulbs-video)[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?onionstudios\.com/(?:embed.+?|video/\d+\.json))\1', webpage)
-        if mobj:
-            return mobj.group('url')
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/ooyala.py b/yt_dlp/extractor/ooyala.py

index 77017f08b9fa5fb980265063dfc72f78528a7270..146c1f981a46d3f2acb7a58efe77e12d604cfa9c 100644 (file)
--- a/yt_dlp/extractor/ooyala.py
+++ b/yt_dlp/extractor/ooyala.py
@@ -10,6 +10,7 @@
      determine_ext,
      float_or_none,
      int_or_none,
+    smuggle_url,
      try_get,
      unsmuggle_url,
  )
@@ -151,6 +152,29 @@ class OoyalaIE(OoyalaBaseIE):
          }
      ]
  
+    def _extract_from_webpage(self, url, webpage):  # generator: yields results for Ooyala embed codes found in `webpage`
+        mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage)  # player URL carrying embedCode=/ec=
+                or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage)  # OO.Player.create('<container>', '<32-char code>')
+                or re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage)  # OO.Player.create.apply(OO.Player, op([...
+                or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage)  # SBN.VideoLinkset.ooyala('<code>')
+                or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))  # data-ooyala-video-id attribute
+        if mobj is not None:  # first pattern that matched wins; only a single embed is reported in this branch
+            embed_token = self._search_regex(
+                r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)',
+                webpage, 'ooyala embed token', default=None)  # optional: None when the page has no embedToken
+            yield self._build_url_result(smuggle_url(
+                mobj.group('ec'), {
+                    'domain': url,  # the embedding page URL is smuggled along with the embed code
+                    'embed_token': embed_token,
+                }))
+            return  # skip the multi-embed lookup below once a single embed was found
+
+        # Look for multiple Ooyala embeds on SBN network websites
+        mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
+        if mobj is not None:
+            for v in self._parse_json(mobj.group(1), self._generic_id(url), fatal=False) or []:  # non-fatal parse; `or []` yields nothing when it returns a falsy value
+                yield self._build_url_result(smuggle_url(v['provider_video_id'], {'domain': url}))  # NOTE(review): assumes every entry is a dict with 'provider_video_id' - confirm
+
      @staticmethod
      def _url_for_embed_code(embed_code):
          return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code
diff --git a/yt_dlp/extractor/panopto.py b/yt_dlp/extractor/panopto.py

index 3388f7f393f90d0a8d43a51465204b46cff6607d..5f5edb26b2cb3b4e9759c7274af6ee26208bbc4b 100644 (file)
--- a/yt_dlp/extractor/panopto.py
+++ b/yt_dlp/extractor/panopto.py
@@ -1,4 +1,3 @@
-import re
  import calendar
  import json
  import functools
@@ -73,15 +72,10 @@ def _call_api(self, base_url, path, video_id, data=None, fatal=True, **kwargs):
      def _parse_fragment(url):
          return {k: json.loads(v[0]) for k, v in compat_urlparse.parse_qs(compat_urllib_parse_urlparse(url).fragment).items()}
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [m.group('url') for m in re.finditer(
-            r'<iframe[^>]+src=["\'](?P<url>%s/Pages/(Viewer|Embed|Sessions/List)\.aspx[^"\']+)' % PanoptoIE.BASE_URL_RE,
-            webpage)]
-
  
  class PanoptoIE(PanoptoBaseIE):
      _VALID_URL = PanoptoBaseIE.BASE_URL_RE + r'/Pages/(Viewer|Embed)\.aspx.*(?:\?|&)id=(?P<id>[a-f0-9-]+)'
+    _EMBED_REGEX = [rf'<iframe[^>]+src=["\'](?P<url>{PanoptoBaseIE.BASE_URL_RE}/Pages/(Viewer|Embed|Sessions/List)\.aspx[^"\']+)']
      _TESTS = [
          {
              'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=26b3ae9e-4a48-4dcc-96ba-0befba08a0fb',
diff --git a/yt_dlp/extractor/peertube.py b/yt_dlp/extractor/peertube.py

index 0d3bc18a8469e840fa0d61ec857108241eec3893..6d280e41c0e56648852589cc63fc78e77f8b78b1 100644 (file)
--- a/yt_dlp/extractor/peertube.py
+++ b/yt_dlp/extractor/peertube.py
@@ -1057,6 +1057,7 @@ class PeerTubeIE(InfoExtractor):
                      )
                      (?P<id>%s)
                      ''' % (_INSTANCES_RE, _UUID_RE)
+    _EMBED_REGEX = [rf'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//{_INSTANCES_RE}/videos/embed/{_UUID_RE})''']  # rf-string: interpolate the class-body patterns, same names _VALID_URL uses
      _TESTS = [{
          'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d',
          'md5': '8563064d245a4be5705bddb22bb00a28',
@@ -1158,16 +1159,15 @@ def _extract_peertube_url(webpage, source_url):
                  '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')):
              return 'peertube:%s:%s' % mobj.group('host', 'id')
  
-    @staticmethod
-    def _extract_urls(webpage, source_url):
-        entries = re.findall(
-            r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//%s/videos/embed/%s)'''
-            % (PeerTubeIE._INSTANCES_RE, PeerTubeIE._UUID_RE), webpage)
-        if not entries:
-            peertube_url = PeerTubeIE._extract_peertube_url(webpage, source_url)
-            if peertube_url:
-                entries = [peertube_url]
-        return entries
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        embeds = tuple(super()._extract_embed_urls(url, webpage))
+        if embeds:
+            return embeds
+
+        peertube_url = cls._extract_peertube_url(webpage, url)
+        if peertube_url:
+            return [peertube_url]
  
      def _call_api(self, host, video_id, path, note=None, errnote=None, fatal=True):
          return self._download_json(
diff --git a/yt_dlp/extractor/periscope.py b/yt_dlp/extractor/periscope.py

index fc8591a2ce61c55a102cc841aae47b6e7f50f6dc..2ff6589d5f03b00f2f3f1b908fcd67370b2ff1ca 100644 (file)
--- a/yt_dlp/extractor/periscope.py
+++ b/yt_dlp/extractor/periscope.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import (
      int_or_none,
@@ -67,6 +65,7 @@ class PeriscopeIE(PeriscopeBaseIE):
      IE_DESC = 'Periscope'
      IE_NAME = 'periscope'
      _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P<id>[^/?#]+)'
+    _EMBED_REGEX = [r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?(?:periscope|pscp)\.tv/(?:(?!\1).)+)\1']
      # Alive example URLs can be found here https://www.periscope.tv/
      _TESTS = [{
          'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==',
@@ -92,13 +91,6 @@ class PeriscopeIE(PeriscopeBaseIE):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?(?:periscope|pscp)\.tv/(?:(?!\1).)+)\1', webpage)
-        if mobj:
-            return mobj.group('url')
-
      def _real_extract(self, url):
          token = self._match_id(url)
  
diff --git a/yt_dlp/extractor/piksel.py b/yt_dlp/extractor/piksel.py

index 14a5408598cc0afe881d389fdb12dbcb7eaf5116..fba7242f5dc46ebda0be8aff71f02229a2d222bc 100644 (file)
--- a/yt_dlp/extractor/piksel.py
+++ b/yt_dlp/extractor/piksel.py
@@ -30,6 +30,7 @@ class PikselIE(InfoExtractor):
              )\.jp|
              vidego\.baltimorecity\.gov
          )/v/(?:refid/(?P<refid>[^/]+)/prefid/)?(?P<id>[\w-]+)'''
+    _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)']
      _TESTS = [
          {
              'url': 'http://player.piksel.com/v/ums2867l',
@@ -62,14 +63,6 @@ class PikselIE(InfoExtractor):
          }
      ]
  
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)',
-            webpage)
-        if mobj:
-            return mobj.group('url')
-
      def _call_api(self, app_token, resource, display_id, query, fatal=True):
          response = (self._download_json(
              'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token),
diff --git a/yt_dlp/extractor/pladform.py b/yt_dlp/extractor/pladform.py

index 301f5c83875f1947d34535cb0c86d7259d05db94..8be08a5bc2ad9c9842c0f21b03066907e63e4f3e 100644 (file)
--- a/yt_dlp/extractor/pladform.py
+++ b/yt_dlp/extractor/pladform.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import (
      determine_ext,
@@ -24,6 +22,7 @@ class PladformIE(InfoExtractor):
                          )
                          (?P<id>\d+)
                      '''
+    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1']
      _TESTS = [{
          'url': 'http://out.pladform.ru/player?pl=18079&type=html5&videoid=100231282',
          'info_dict': {
@@ -61,13 +60,6 @@ class PladformIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1', webpage)
-        if mobj:
-            return mobj.group('url')
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/playwire.py b/yt_dlp/extractor/playwire.py

index ab7f71493fd01550f447701e568240d10e893e26..683dbf4a584a21166bcff4283c34f52467f21ba9 100644 (file)
--- a/yt_dlp/extractor/playwire.py
+++ b/yt_dlp/extractor/playwire.py
@@ -7,6 +7,8 @@
  
  class PlaywireIE(InfoExtractor):
      _VALID_URL = r'https?://(?:config|cdn)\.playwire\.com(?:/v2)?/(?P<publisher_id>\d+)/(?:videos/v2|embed|config)/(?P<id>\d+)'
+    _EMBED_REGEX = [r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1']
+
      _TESTS = [{
          'url': 'http://config.playwire.com/14907/videos/v2/3353705/player.json',
          'md5': 'e6398701e3595888125729eaa2329ed9',
diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py

index 35468b4fc801f7577fae6565b2663bd471365f1a..6afaf5e6e7f5fd3c451f9b4549caf243629c2cb8 100644 (file)
--- a/yt_dlp/extractor/pornhub.py
+++ b/yt_dlp/extractor/pornhub.py
@@ -128,6 +128,7 @@ class PornHubIE(PornHubBaseIE):
                          )
                          (?P<id>[\da-z]+)
                      ''' % PornHubBaseIE._PORNHUB_HOST_RE
+    _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)']
      _TESTS = [{
          'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
          'md5': 'a6391306d050e4547f62b3f485dd9ba9',
@@ -257,12 +258,6 @@ class PornHubIE(PornHubBaseIE):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(
-            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)',
-            webpage)
-
      def _extract_count(self, pattern, webpage, name):
          return str_to_int(self._search_regex(pattern, webpage, '%s count' % name, default=None))
  
diff --git a/yt_dlp/extractor/rcs.py b/yt_dlp/extractor/rcs.py

index abbc167c0fc9f2b7fb0e6e7045ce13181843f8cf..28ba42eedcbde13b262ebf409911af21dfa18688 100644 (file)
--- a/yt_dlp/extractor/rcs.py
+++ b/yt_dlp/extractor/rcs.py
@@ -281,6 +281,20 @@ class RCSEmbedsIE(RCSBaseIE):
                          (?:gazzanet\.)?gazzetta
                      )\.it)
                      /video-embed/(?P<id>[^/=&\?]+?)(?:$|\?)'''
+    _EMBED_REGEX = [r'''(?x)
+            (?:
+                data-frame-src=|
+                <iframe[^\n]+src=
+            )
+            (["'])
+                (?P<url>(?:https?:)?//video\.
+                    (?:
+                        rcs|
+                        (?:corriere\w+\.)?corriere|
+                        (?:gazzanet\.)?gazzetta
+                    )
+                \.it/video-embed/.+?)
+            \1''']
      _TESTS = [{
          'url': 'https://video.rcs.it/video-embed/iodonna-0001585037',
          'md5': '623ecc8ffe7299b2d0c1046d8331a9df',
@@ -321,30 +335,9 @@ def _sanitize_urls(urls):
              urls[i] = urljoin(base_url(e), url_basename(e))
          return urls
  
-    @staticmethod
-    def _extract_urls(webpage):
-        entries = [
-            mobj.group('url')
-            for mobj in re.finditer(r'''(?x)
-            (?:
-                data-frame-src=|
-                <iframe[^\n]+src=
-            )
-            (["'])
-                (?P<url>(?:https?:)?//video\.
-                    (?:
-                        rcs|
-                        (?:corriere\w+\.)?corriere|
-                        (?:gazzanet\.)?gazzetta
-                    )
-                \.it/video-embed/.+?)
-            \1''', webpage)]
-        return RCSEmbedsIE._sanitize_urls(entries)
-
-    @staticmethod
-    def _extract_url(webpage):
-        urls = RCSEmbedsIE._extract_urls(webpage)
-        return urls[0] if urls else None
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):  # embed URLs matched by _EMBED_REGEX, passed through _sanitize_urls
+        return cls._sanitize_urls(list(super()._extract_embed_urls(url, webpage)))  # list, not tuple: _sanitize_urls assigns urls[i] in place
  
  
  class RCSIE(RCSBaseIE):
diff --git a/yt_dlp/extractor/redtube.py b/yt_dlp/extractor/redtube.py

index ab7c505dab28fd351675bcf503c92f972d2c121c..8e767b6e439e4a590689b3c6d4d32c8729931578 100644 (file)
--- a/yt_dlp/extractor/redtube.py
+++ b/yt_dlp/extractor/redtube.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import (
      determine_ext,
@@ -14,6 +12,7 @@
  
  class RedTubeIE(InfoExtractor):
      _VALID_URL = r'https?://(?:(?:\w+\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)'
+    _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)']
      _TESTS = [{
          'url': 'https://www.redtube.com/38864951',
          'md5': '4fba70cbca3aefd25767ab4b523c9878',
@@ -37,12 +36,6 @@ class RedTubeIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(
-            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)',
-            webpage)
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
          webpage = self._download_webpage(
diff --git a/yt_dlp/extractor/rtlnl.py b/yt_dlp/extractor/rtlnl.py

index e6b450a239e986c63c8dd0c367ca50ea6ae37b82..3852a3a1353f624d016ad81dd67697b6c0ba72cc 100644 (file)
--- a/yt_dlp/extractor/rtlnl.py
+++ b/yt_dlp/extractor/rtlnl.py
@@ -8,6 +8,7 @@
  class RtlNlIE(InfoExtractor):
      IE_NAME = 'rtl.nl'
      IE_DESC = 'rtl.nl and rtlxl.nl'
+    _EMBED_REGEX = [r'<iframe[^>]+?\bsrc=(?P<q1>[\'"])(?P<url>(?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)(?P=q1)']
      _VALID_URL = r'''(?x)
          https?://(?:(?:www|static)\.)?
          (?:
diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py

index 924f9829fa5bf7d9da939fa428d30a7b5f1ca61a..c94ba68ee04c47869e6b760182477f7ff0b51d5e 100644 (file)
--- a/yt_dlp/extractor/rumble.py
+++ b/yt_dlp/extractor/rumble.py
@@ -15,6 +15,7 @@
  
  class RumbleEmbedIE(InfoExtractor):
      _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'
+    _EMBED_REGEX = [fr'(?:<(?:script|iframe)[^>]+\bsrc=|["\']embedUrl["\']\s*:\s*)["\'](?P<url>{_VALID_URL})']
      _TESTS = [{
          'url': 'https://rumble.com/embed/v5pv5f',
          'md5': '36a18a049856720189f30977ccbb2c34',
@@ -51,11 +52,10 @@ class RumbleEmbedIE(InfoExtractor):
      }]
  
      @classmethod
-    def _extract_urls(cls, webpage):
-        embeds = tuple(re.finditer(
-            fr'(?:<(?:script|iframe)[^>]+\bsrc=|["\']embedUrl["\']\s*:\s*)["\'](?P<url>{cls._VALID_URL})', webpage))
+    def _extract_embed_urls(cls, url, webpage):
+        embeds = tuple(super()._extract_embed_urls(url, webpage))
          if embeds:
-            return [mobj.group('url') for mobj in embeds]
+            return embeds
          return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer(
              r'<script>\s*Rumble\(\s*"play"\s*,\s*{\s*[\'"]video[\'"]\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)]
  
diff --git a/yt_dlp/extractor/rutube.py b/yt_dlp/extractor/rutube.py

index ecfcea9393761f96bb6b3db8fc101d89284e232b..380c5e14e85e5fd4d9e58adc03bd57269a0d788c 100644 (file)
--- a/yt_dlp/extractor/rutube.py
+++ b/yt_dlp/extractor/rutube.py
@@ -1,4 +1,3 @@
-import re
  import itertools
  
  from .common import InfoExtractor
@@ -94,6 +93,7 @@ class RutubeIE(RutubeBaseIE):
      IE_NAME = 'rutube'
      IE_DESC = 'Rutube videos'
      _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})'
+    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/embed/[\da-z]{32}.*?)\1']
  
      _TESTS = [{
          'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
@@ -128,12 +128,6 @@ class RutubeIE(RutubeBaseIE):
      def suitable(cls, url):
          return False if RutubePlaylistIE.suitable(url) else super(RutubeIE, cls).suitable(url)
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [mobj.group('url') for mobj in re.finditer(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/embed/[\da-z]{32}.*?)\1',
-            webpage)]
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
          info = self._download_and_extract_info(video_id)
diff --git a/yt_dlp/extractor/rutv.py b/yt_dlp/extractor/rutv.py

index adf78ddb095142b30fa900b40063d42fbca73334..0b07dc5ad875c4051ccbe3bee868b5336af526cc 100644 (file)
--- a/yt_dlp/extractor/rutv.py
+++ b/yt_dlp/extractor/rutv.py
@@ -20,6 +20,10 @@ class RUTVIE(InfoExtractor):
                          )
                          (?P<id>\d+)
                      '''
+    _EMBED_REGEX = [  # iframe player embeds first, then the og:video flash container; named like every other extractor's embed patterns
+        r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1',
+        r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)',
+    ]
  
      _TESTS = [
          {
@@ -107,19 +111,6 @@ class RUTVIE(InfoExtractor):
          },
      ]
  
-    @classmethod
-    def _extract_url(cls, webpage):
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
-        if mobj:
-            return mobj.group('url')
-
-        mobj = re.search(
-            r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)',
-            webpage)
-        if mobj:
-            return mobj.group('url')
-
      def _real_extract(self, url):
          mobj = self._match_valid_url(url)
          video_id = mobj.group('id')
diff --git a/yt_dlp/extractor/ruutu.py b/yt_dlp/extractor/ruutu.py

index c6d94c10020081b1f3d928add126bfad6419d4c7..3f6d30d3c42f13eadd56465be770de14cde458c1 100644 (file)
--- a/yt_dlp/extractor/ruutu.py
+++ b/yt_dlp/extractor/ruutu.py
@@ -135,7 +135,7 @@ class RuutuIE(InfoExtractor):
      _API_BASE = 'https://gatling.nelonenmedia.fi'
  
      @classmethod
-    def _extract_urls(cls, webpage):
+    def _extract_embed_urls(cls, url, webpage):
          # nelonen.fi
          settings = try_call(
              lambda: json.loads(re.search(
diff --git a/yt_dlp/extractor/sbs.py b/yt_dlp/extractor/sbs.py

index 711524406b6396caae41debe105727284dcf85f1..6bb499930c4555a302a730c85594fb53f9b387c6 100644 (file)
--- a/yt_dlp/extractor/sbs.py
+++ b/yt_dlp/extractor/sbs.py
@@ -15,6 +15,12 @@ class SBSIE(InfoExtractor):
                  .*?\bplay=|/watch/
              )|news/(?:embeds/)?video/
          )(?P<id>[0-9]+)'''
+    _EMBED_REGEX = [r'''(?x)
+            (?:
+                <meta\s+property="og:video"\s+content=|
+                <iframe[^>]+?src=
+            )
+            (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''']  # og:video meta tag or iframe pointing at an SBS On Demand video
  
      _TESTS = [{
          # Original URL is handled by the generic IE which finds the iframe:
diff --git a/yt_dlp/extractor/senategov.py b/yt_dlp/extractor/senategov.py

index bced143288ed48c44b20e2c507d18e676024afb7..6fec7c0bb3cbfb667d6fdf19525d6842c102bd3b 100644 (file)
--- a/yt_dlp/extractor/senategov.py
+++ b/yt_dlp/extractor/senategov.py
@@ -49,6 +49,7 @@
  class SenateISVPIE(InfoExtractor):
      _IE_NAME = 'senate.gov:isvp'
      _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)'
+    _EMBED_REGEX = [r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]"]
  
      _TESTS = [{
          'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
@@ -87,14 +88,6 @@ class SenateISVPIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _search_iframe_url(webpage):
-        mobj = re.search(
-            r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
-            webpage)
-        if mobj:
-            return mobj.group('url')
-
      def _real_extract(self, url):
          url, smuggled_data = unsmuggle_url(url, {})
  
diff --git a/yt_dlp/extractor/sendtonews.py b/yt_dlp/extractor/sendtonews.py

index cf4b93d4543f0023ba5a48a52fd575c548cd7b1e..5ff06f19d58b2c61f976fc5bfc348984a1c6b97c 100644 (file)
--- a/yt_dlp/extractor/sendtonews.py
+++ b/yt_dlp/extractor/sendtonews.py
@@ -43,14 +43,14 @@ class SendtoNewsIE(InfoExtractor):
      _URL_TEMPLATE = '//embed.sendtonews.com/player2/embedplayer.php?SC=%s'
  
      @classmethod
-    def _extract_url(cls, webpage):
+    def _extract_embed_urls(cls, url, webpage):
          mobj = re.search(r'''(?x)<script[^>]+src=([\'"])
              (?:https?:)?//embed\.sendtonews\.com/player/responsiveembed\.php\?
                  .*\bSC=(?P<SC>[0-9a-zA-Z-]+).*
              \1>''', webpage)
          if mobj:
              sc = mobj.group('SC')
-            return cls._URL_TEMPLATE % sc
+            yield cls._URL_TEMPLATE % sc
  
      def _real_extract(self, url):
          playlist_id = self._match_id(url)
diff --git a/yt_dlp/extractor/seznamzpravy.py b/yt_dlp/extractor/seznamzpravy.py

index 891bfcfee61a89f337194e0d4b5fa51d5c9abc20..05642a1168df7972b7315dd43799750244d72969 100644 (file)
--- a/yt_dlp/extractor/seznamzpravy.py
+++ b/yt_dlp/extractor/seznamzpravy.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..compat import (
      compat_str,
@@ -20,6 +18,7 @@ def _raw_id(src_url):
  
  class SeznamZpravyIE(InfoExtractor):
      _VALID_URL = r'https?://(?:www\.)?seznamzpravy\.cz/iframe/player\?.*\bsrc='
+    _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?seznamzpravy\.cz/iframe/player\?.*?)\1']
      _TESTS = [{
          'url': 'https://www.seznamzpravy.cz/iframe/player?duration=241&serviceSlug=zpravy&src=https%3A%2F%2Fv39-a.sdn.szn.cz%2Fv_39%2Fvmd%2F5999c902ea707c67d8e267a9%3Ffl%3Dmdk%2C432f65a0%7C&itemType=video&autoPlay=false&title=Sv%C4%9Bt%20bez%20obalu%3A%20%C4%8Ce%C5%A1t%C3%AD%20voj%C3%A1ci%20na%20mis%C3%ADch%20(kr%C3%A1tk%C3%A1%20verze)&series=Sv%C4%9Bt%20bez%20obalu&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_F_I%2FR5puJ.jpeg%3Ffl%3Dcro%2C0%2C0%2C1920%2C1080%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=1920&height=1080&cutFrom=0&cutTo=0&splVersion=VOD&contentId=170889&contextId=35990&showAdvert=true&collocation=&autoplayPossible=true&embed=&isVideoTooShortForPreroll=false&isVideoTooLongForPostroll=true&videoCommentOpKey=&videoCommentId=&version=4.0.76&dotService=zpravy&gemiusPrismIdentifier=bVc1ZIb_Qax4W2v5xOPGpMeCP31kFfrTzj0SqPTLh_b.Z7&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5&sectionPrefixPreroll=%2Fzpravy',
          'info_dict': {
@@ -48,13 +47,6 @@ class SeznamZpravyIE(InfoExtractor):
          },
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [
-            mobj.group('url') for mobj in re.finditer(
-                r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?seznamzpravy\.cz/iframe/player\?.*?)\1',
-                webpage)]
-
      def _extract_sdn_formats(self, sdn_url, video_id):
          sdn_data = self._download_json(sdn_url, video_id)
  
@@ -162,5 +154,5 @@ def _real_extract(self, url):
  
          return self.playlist_result([
              self.url_result(entry_url, ie=SeznamZpravyIE.ie_key())
-            for entry_url in SeznamZpravyIE._extract_urls(webpage)],
+            for entry_url in SeznamZpravyIE._extract_embed_urls(url, webpage)],
              article_id, title, description)
diff --git a/yt_dlp/extractor/sharevideos.py b/yt_dlp/extractor/sharevideos.py

new file mode 100644 (file)

index 0000000..3132c7a
--- /dev/null
+++ b/yt_dlp/extractor/sharevideos.py
@@ -0,0 +1,6 @@
+from .common import InfoExtractor
+
+
+class ShareVideosEmbedIE(InfoExtractor):
+    _VALID_URL = False
+    _EMBED_REGEX = [r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1']
diff --git a/yt_dlp/extractor/simplecast.py b/yt_dlp/extractor/simplecast.py

index ecbb6123b80018e29c9cb99f393550afd5a49bee..ec349ddf900d6ba5e5750bd32be9cda4589c397c 100644 (file)
--- a/yt_dlp/extractor/simplecast.py
+++ b/yt_dlp/extractor/simplecast.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import (
      clean_podcast_url,
@@ -68,6 +66,11 @@ def _parse_episode(self, episode):
  class SimplecastIE(SimplecastBaseIE):
      IE_NAME = 'simplecast'
      _VALID_URL = r'https?://(?:api\.simplecast\.com/episodes|player\.simplecast\.com)/(?P<id>%s)' % SimplecastBaseIE._UUID_REGEX
+    _EMBED_REGEX = [rf'''(?x)<iframe[^>]+src=["\']
+        (?P<url>https?://(?:
+            embed\.simplecast\.com/[0-9a-f]{{8}}|
+            player\.simplecast\.com/{SimplecastBaseIE._UUID_REGEX}
+        ))''']  # NB: this is an rf-string, so literal regex braces ({{8}}) must be doubled
      _COMMON_TEST_INFO = {
          'display_id': 'errant-signal-chris-franklin-new-wave-video-essays',
          'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
@@ -94,15 +97,6 @@ class SimplecastIE(SimplecastBaseIE):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(
-            r'''(?x)<iframe[^>]+src=["\']
-                (
-                    https?://(?:embed\.simplecast\.com/[0-9a-f]{8}|
-                    player\.simplecast\.com/%s
-                ))''' % SimplecastBaseIE._UUID_REGEX, webpage)
-
      def _real_extract(self, url):
          episode_id = self._match_id(url)
          episode = self._call_api('episodes/%s', episode_id)
diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py

index 9e4c8cf256f73140a7a914cf18711e33376da14c..f7e125d3735a741297c42e482404130833603ae5 100644 (file)
--- a/yt_dlp/extractor/soundcloud.py
+++ b/yt_dlp/extractor/soundcloud.py
@@ -33,18 +33,13 @@
  
  class SoundcloudEmbedIE(InfoExtractor):
      _VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?\burl=(?P<id>.+)'
+    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1']
      _TEST = {
          # from https://www.soundi.fi/uutiset/ennakkokuuntelussa-timo-kaukolammen-station-to-station-to-station-julkaisua-juhlitaan-tanaan-g-livelabissa/
          'url': 'https://w.soundcloud.com/player/?visual=true&url=https%3A%2F%2Fapi.soundcloud.com%2Fplaylists%2F922213810&show_artwork=true&maxwidth=640&maxheight=960&dnt=1&secret_token=s-ziYey',
          'only_matching': True,
      }
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [m.group('url') for m in re.finditer(
-            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1',
-            webpage)]
-
      def _real_extract(self, url):
          query = parse_qs(url)
          api_url = query['url'][0]
diff --git a/yt_dlp/extractor/spankwire.py b/yt_dlp/extractor/spankwire.py

index 603f17e9daf2afdf63cdde862afc82d4d4fd9115..d1990e4de7dbeb6519f7ab7c357619a91f7effc3 100644 (file)
--- a/yt_dlp/extractor/spankwire.py
+++ b/yt_dlp/extractor/spankwire.py
@@ -21,6 +21,7 @@ class SpankwireIE(InfoExtractor):
                          )
                          (?P<id>\d+)
                      '''
+    _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)']
      _TESTS = [{
          # download URL pattern: */<height>P_<tbr>K_<video_id>.mp4
          'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/',
@@ -65,12 +66,6 @@ class SpankwireIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(
-            r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)',
-            webpage)
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/sportbox.py b/yt_dlp/extractor/sportbox.py

index 1041cc7d18e6e755ac3bada18a12170b014d414a..622a81b47c128d4b144254843020c65f9da5152f 100644 (file)
--- a/yt_dlp/extractor/sportbox.py
+++ b/yt_dlp/extractor/sportbox.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import (
      determine_ext,
@@ -11,6 +9,7 @@
  
  class SportBoxIE(InfoExtractor):
      _VALID_URL = r'https?://(?:news\.sportbox|matchtv)\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)'
+    _EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://(?:news\.sportbox|matchtv)\.ru/vdl/player[^"]+)"']
      _TESTS = [{
          'url': 'http://news.sportbox.ru/vdl/player/ci/211355',
          'info_dict': {
@@ -42,12 +41,6 @@ class SportBoxIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(
-            r'<iframe[^>]+src="(https?://(?:news\.sportbox|matchtv)\.ru/vdl/player[^"]+)"',
-            webpage)
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/spotify.py b/yt_dlp/extractor/spotify.py

index f476b7022c7e787724a8f27502f6039af86fe688..4da24db9e9a0b50bd48937f8815454c560ced867 100644 (file)
--- a/yt_dlp/extractor/spotify.py
+++ b/yt_dlp/extractor/spotify.py
@@ -23,6 +23,7 @@ class SpotifyBaseIE(InfoExtractor):
          'ShowEpisodes': 'e0e5ce27bd7748d2c59b4d44ba245a8992a05be75d6fabc3b20753fc8857444d',
      }
      _VALID_URL_TEMPL = r'https?://open\.spotify\.com/(?:embed-podcast/|embed/|)%s/(?P<id>[^/?&#]+)'
+    _EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://open\.spotify.com/embed/[^"]+)"']
  
      def _real_initialize(self):
          self._ACCESS_TOKEN = self._download_json(
@@ -97,12 +98,6 @@ def _extract_episode(self, episode, series):
              'series': series,
          }
  
-    @classmethod
-    def _extract_urls(cls, webpage):
-        return re.findall(
-            r'<iframe[^>]+src="(https?://open\.spotify.com/embed/[^"]+)"',
-            webpage)
-
  
  class SpotifyIE(SpotifyBaseIE):
      IE_NAME = 'spotify'
diff --git a/yt_dlp/extractor/springboardplatform.py b/yt_dlp/extractor/springboardplatform.py

index 8e156bf1a189ce5515bc5489140eab04fb0e39ec..539a6420943e04a987480f797ca2d3f3bc7fe80b 100644 (file)
--- a/yt_dlp/extractor/springboardplatform.py
+++ b/yt_dlp/extractor/springboardplatform.py
@@ -21,6 +21,7 @@ class SpringboardPlatformIE(InfoExtractor):
                              xml_feeds_advanced/index/(?P<index_2>\d+)/rss3/(?P<id_2>\d+)
                          )
                      '''
+    _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cms\.springboardplatform\.com/embed_iframe/\d+/video/\d+.*?)\1']
      _TESTS = [{
          'url': 'http://cms.springboardplatform.com/previews/159/video/981017/0/0/1',
          'md5': '5c3cb7b5c55740d482561099e920f192',
@@ -45,14 +46,6 @@ class SpringboardPlatformIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [
-            mobj.group('url')
-            for mobj in re.finditer(
-                r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cms\.springboardplatform\.com/embed_iframe/\d+/video/\d+.*?)\1',
-                webpage)]
-
      def _real_extract(self, url):
          mobj = self._match_valid_url(url)
          video_id = mobj.group('id') or mobj.group('id_2')
diff --git a/yt_dlp/extractor/streamable.py b/yt_dlp/extractor/streamable.py

index a2935b04bbea3f5e771bec76be8b803422c6f335..3e60479ad04eb0ba76e237406122218c71288411 100644 (file)
--- a/yt_dlp/extractor/streamable.py
+++ b/yt_dlp/extractor/streamable.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import (
      ExtractorError,
@@ -12,6 +10,7 @@
  
  class StreamableIE(InfoExtractor):
      _VALID_URL = r'https?://streamable\.com/(?:[es]/)?(?P<id>\w+)'
+    _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(?P<q1>[\'"])(?P<url>(?:https?:)?//streamable\.com/.+?)(?P=q1)']
      _TESTS = [
          {
              'url': 'https://streamable.com/dnd1',
@@ -53,14 +52,6 @@ class StreamableIE(InfoExtractor):
          }
      ]
  
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<iframe[^>]+src=(?P<q1>[\'"])(?P<src>(?:https?:)?//streamable\.com/(?:(?!\1).+))(?P=q1)',
-            webpage)
-        if mobj:
-            return mobj.group('src')
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/substack.py b/yt_dlp/extractor/substack.py

index 70cf10515c8fd0d57252bf34298daef162032f3e..787b9f70d68d51964caa9f1343e24a8168e5f238 100644 (file)
--- a/yt_dlp/extractor/substack.py
+++ b/yt_dlp/extractor/substack.py
@@ -46,14 +46,15 @@ class SubstackIE(InfoExtractor):
      }]
  
      @classmethod
-    def _extract_url(cls, webpage, url):
+    def _extract_embed_urls(cls, url, webpage):
          if not re.search(r'<script[^>]+src=["\']https://substackcdn.com/[^"\']+\.js', webpage):
              return
  
          mobj = re.search(r'{[^}]*["\']subdomain["\']\s*:\s*["\'](?P<subdomain>[^"]+)', webpage)
          if mobj:
              parsed = urllib.parse.urlparse(url)
-            return parsed._replace(netloc=f'{mobj.group("subdomain")}.substack.com').geturl()
+            yield parsed._replace(netloc=f'{mobj.group("subdomain")}.substack.com').geturl()
+            raise cls.StopExtraction()
  
      def _extract_video_formats(self, video_id, username):
          formats, subtitles = [], {}
diff --git a/yt_dlp/extractor/svt.py b/yt_dlp/extractor/svt.py

index e0c436b67ae3368815c6b3a195319aaccf8e95ff..b422b6d93467d75e52ff1386ab672999105c6b7e 100644 (file)
--- a/yt_dlp/extractor/svt.py
+++ b/yt_dlp/extractor/svt.py
@@ -101,6 +101,7 @@ def _extract_video(self, video_info, video_id):
  
  class SVTIE(SVTBaseIE):
      _VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)'
+    _EMBED_REGEX = [r'(?:<iframe src|href)="(?P<url>%s[^"]*)"' % _VALID_URL]
      _TEST = {
          'url': 'http://www.svt.se/wd?widgetId=23991&sectionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false',
          'md5': '33e9a5d8f646523ce0868ecfb0eed77d',
@@ -113,13 +114,6 @@ class SVTIE(SVTBaseIE):
          },
      }
  
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'(?:<iframe src|href)="(?P<url>%s[^"]*)"' % SVTIE._VALID_URL, webpage)
-        if mobj:
-            return mobj.group('url')
-
      def _real_extract(self, url):
          mobj = self._match_valid_url(url)
          widget_id = mobj.group('widget_id')
diff --git a/yt_dlp/extractor/teachable.py b/yt_dlp/extractor/teachable.py

index e480d7610145f912f48cb1cb260825257fd2837b..c212a4926e8df10bc47a0d05edc559baa9dde67b 100644 (file)
--- a/yt_dlp/extractor/teachable.py
+++ b/yt_dlp/extractor/teachable.py
@@ -140,12 +140,12 @@ def _is_teachable(webpage):
              r'<link[^>]+href=["\']https?://(?:process\.fs|assets)\.teachablecdn\.com',
              webpage)
  
-    @staticmethod
-    def _extract_url(webpage, source_url):
-        if not TeachableIE._is_teachable(webpage):
-            return
-        if re.match(r'https?://[^/]+/(?:courses|p)', source_url):
-            return '%s%s' % (TeachableBaseIE._URL_PREFIX, source_url)
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        if cls._is_teachable(webpage):
+            if re.match(r'https?://[^/]+/(?:courses|p)', url):
+                yield f'{cls._URL_PREFIX}{url}'
+                raise cls.StopExtraction()
  
      def _real_extract(self, url):
          mobj = self._match_valid_url(url)
@@ -160,7 +160,7 @@ def _real_extract(self, url):
  
          webpage = self._download_webpage(url, video_id)
  
-        wistia_urls = WistiaIE._extract_urls(webpage)
+        wistia_urls = WistiaIE._extract_embed_urls(url, webpage)
          if not wistia_urls:
              if any(re.search(p, webpage) for p in (
                      r'class=["\']lecture-contents-locked',
diff --git a/yt_dlp/extractor/ted.py b/yt_dlp/extractor/ted.py

index b5c7e35ac28a36b66c50ed050325dc5801e45011..0e09ec75758f3560872189ef47d5714a1b0506da 100644 (file)
--- a/yt_dlp/extractor/ted.py
+++ b/yt_dlp/extractor/ted.py
@@ -215,6 +215,7 @@ def _real_extract(self, url):
  
  class TedEmbedIE(InfoExtractor):
      _VALID_URL = r'https?://embed(?:-ssl)?\.ted\.com/'
+    _EMBED_REGEX = [rf'<iframe[^>]+?src=(["\'])(?P<url>{_VALID_URL}.+?)\1']
  
      _TESTS = [{
          'url': 'https://embed.ted.com/talks/janet_stovall_how_to_get_serious_about_diversity_and_inclusion_in_the_workplace',
@@ -233,10 +234,5 @@ class TedEmbedIE(InfoExtractor):
          },
      }]
  
-    @classmethod
-    def _extract_urls(cls, webpage):
-        return [mobj.group('url') for mobj in re.finditer(
-            fr'<iframe[^>]+?src=(["\'])(?P<url>{cls._VALID_URL}.+?)\1', webpage)]
-
      def _real_extract(self, url):
          return self.url_result(re.sub(r'://embed(-ssl)?', '://www', url), TedTalkIE.ie_key())
diff --git a/yt_dlp/extractor/theplatform.py b/yt_dlp/extractor/theplatform.py

index bf7efc0134da41f8fad6a999b68741dc98d5f89e..c8026d2941340b67c94ea3c2393dc99c2aa87594 100644 (file)
--- a/yt_dlp/extractor/theplatform.py
+++ b/yt_dlp/extractor/theplatform.py
@@ -123,6 +123,13 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
          (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
             (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)?|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
           |theplatform:)(?P<id>[^/\?&]+)'''
+    _EMBED_REGEX = [
+        r'''(?x)
+            <meta\s+
+                property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+
+                content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2''',
+        r'(?s)<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//player\.theplatform\.com/p/.+?)\1'
+    ]
  
      _TESTS = [{
          # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/
@@ -192,22 +199,11 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
      }]
  
      @classmethod
-    def _extract_urls(cls, webpage):
-        m = re.search(
-            r'''(?x)
-                    <meta\s+
-                        property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+
-                        content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2
-            ''', webpage)
-        if m:
-            return [m.group('url')]
-
+    def _extract_embed_urls(cls, url, webpage):
          # Are whitespaces ignored in URLs?
          # https://github.com/ytdl-org/youtube-dl/issues/12044
-        matches = re.findall(
-            r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage)
-        if matches:
-            return [re.sub(r'\s', '', list(zip(*matches))[1][0])]
+        for embed_url in super()._extract_embed_urls(url, webpage):
+            yield re.sub(r'\s', '', embed_url)
  
      @staticmethod
      def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False):
diff --git a/yt_dlp/extractor/threeqsdn.py b/yt_dlp/extractor/threeqsdn.py

index 1c0baf5ed49cf59d8270be8dae5bfe3d99627a0e..a313a8dfbe6332a0b9c94d6cdf8fdd940777b0a7 100644 (file)
--- a/yt_dlp/extractor/threeqsdn.py
+++ b/yt_dlp/extractor/threeqsdn.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..compat import compat_HTTPError
  from ..utils import (
@@ -16,6 +14,7 @@ class ThreeQSDNIE(InfoExtractor):
      IE_NAME = '3qsdn'
      IE_DESC = '3Q SDN'
      _VALID_URL = r'https?://playout\.3qsdn\.com/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+    _EMBED_REGEX = [r'<iframe[^>]+\b(?:data-)?src=(["\'])(?P<url>%s.*?)\1' % _VALID_URL]
      _TESTS = [{
          # https://player.3qsdn.com/demo.html
          'url': 'https://playout.3qsdn.com/7201c779-6b3c-11e7-a40e-002590c750be',
@@ -76,12 +75,13 @@ class ThreeQSDNIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<iframe[^>]+\b(?:data-)?src=(["\'])(?P<url>%s.*?)\1' % ThreeQSDNIE._VALID_URL, webpage)
-        if mobj:
-            return mobj.group('url')
+    def _extract_from_webpage(self, url, webpage):
+        for res in super()._extract_from_webpage(url, webpage):
+            yield {
+                **res,
+                '_type': 'url_transparent',
+                'uploader': self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader'),
+            }
  
      def _real_extract(self, url):
          video_id = self._match_id(url)
diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py

index 680358d5ece1149b99d310000a113e4197a8fed2..3ac76527085826a8c7b4d89b75c52ad872ecfad7 100644 (file)
--- a/yt_dlp/extractor/tiktok.py
+++ b/yt_dlp/extractor/tiktok.py
@@ -1,7 +1,6 @@
  import itertools
  import json
  import random
-import re
  import string
  import time
  
@@ -379,6 +378,7 @@ def _parse_aweme_video_web(self, aweme_detail, webpage_url):
  
  class TikTokIE(TikTokBaseIE):
      _VALID_URL = r'https?://www\.tiktok\.com/(?:embed|@(?P<user_id>[\w\.-]+)/video)/(?P<id>\d+)'
+    _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
  
      _TESTS = [{
          'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610',
@@ -529,11 +529,6 @@ class TikTokIE(TikTokBaseIE):
          'only_matching': True
      }]
  
-    @classmethod
-    def _extract_urls(cls, webpage):
-        return [mobj.group('url') for mobj in re.finditer(
-            rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{cls._VALID_URL})', webpage)]
-
      def _extract_aweme_app(self, aweme_id):
          try:
              aweme_detail = self._call_api('aweme/detail', {'aweme_id': aweme_id}, aweme_id,
diff --git a/yt_dlp/extractor/tnaflix.py b/yt_dlp/extractor/tnaflix.py

index 6b766f3cc6be0aeb0417b09ae42345d06fd58850..34361e515aac8a04fe612d8fa04de3611bffb0d1 100644 (file)
--- a/yt_dlp/extractor/tnaflix.py
+++ b/yt_dlp/extractor/tnaflix.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..compat import compat_str
  from ..utils import (
@@ -173,6 +171,7 @@ def extract_field(pattern, name):
  
  class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE):
      _VALID_URL = r'https?://player\.(?:tna|emp)flix\.com/video/(?P<id>\d+)'
+    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.(?:tna|emp)flix\.com/video/\d+)\1']
  
      _TITLE_REGEX = r'<title>([^<]+)</title>'
  
@@ -194,12 +193,6 @@ class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [url for _, url in re.findall(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.(?:tna|emp)flix\.com/video/\d+)\1',
-            webpage)]
-
  
  class TNAEMPFlixBaseIE(TNAFlixNetworkBaseIE):
      _DESCRIPTION_REGEX = r'(?s)>Description:</[^>]+>(.+?)<'
diff --git a/yt_dlp/extractor/tube8.py b/yt_dlp/extractor/tube8.py

index 32e80d9d2a9479ce03576843a8401dfe1e12df9f..b092ecad5a8779703de2aa5af159b0011dfffbd0 100644 (file)
--- a/yt_dlp/extractor/tube8.py
+++ b/yt_dlp/extractor/tube8.py
@@ -9,6 +9,7 @@
  
  class Tube8IE(KeezMoviesIE):
      _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/(?P<id>\d+)'
+    _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?tube8\.com/embed/(?:[^/]+/)+\d+)']
      _TESTS = [{
          'url': 'http://www.tube8.com/teen/kasia-music-video/229795/',
          'md5': '65e20c48e6abff62ed0c3965fff13a39',
@@ -29,12 +30,6 @@ class Tube8IE(KeezMoviesIE):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(
-            r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?tube8\.com/embed/(?:[^/]+/)+\d+)',
-            webpage)
-
      def _real_extract(self, url):
          webpage, info = self._extract_info(url)
  
diff --git a/yt_dlp/extractor/tunein.py b/yt_dlp/extractor/tunein.py

index e3d3f2a96bfd4ee5ca196b1ad90e755b0ff6b274..f163eaf0957c338d1311300a9295e291cee02b24 100644 (file)
--- a/yt_dlp/extractor/tunein.py
+++ b/yt_dlp/extractor/tunein.py
@@ -8,12 +8,6 @@
  class TuneInBaseIE(InfoExtractor):
      _API_BASE_URL = 'http://tunein.com/tuner/tune/'
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(
-            r'<iframe[^>]+src=["\'](?P<url>(?:https?://)?tunein\.com/embed/player/[pst]\d+)',
-            webpage)
-
      def _real_extract(self, url):
          content_id = self._match_id(url)
  
@@ -86,6 +80,7 @@ class TuneInClipIE(TuneInBaseIE):
  class TuneInStationIE(TuneInBaseIE):
      IE_NAME = 'tunein:station'
      _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-s|station/.*?StationId=|embed/player/s)(?P<id>\d+)'
+    _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?://)?tunein\.com/embed/player/[pst]\d+)']
      _API_URL_QUERY = '?tuneType=Station&stationId=%s'
  
      @classmethod
diff --git a/yt_dlp/extractor/tvc.py b/yt_dlp/extractor/tvc.py

index 4ccc8f5227bada759c22fcbf404393dd07cc46b2..1ef64caf9a9fc263a6e1f37a83a8c95b36118c1d 100644 (file)
--- a/yt_dlp/extractor/tvc.py
+++ b/yt_dlp/extractor/tvc.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import (
      clean_html,
@@ -9,6 +7,7 @@
  
  class TVCIE(InfoExtractor):
      _VALID_URL = r'https?://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)'
+    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:http:)?//(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1']
      _TEST = {
          'url': 'http://www.tvc.ru/video/iframe/id/74622/isPlay/false/id_stat/channel/?acc_video_id=/channel/brand/id/17/show/episodes/episode_id/39702',
          'md5': 'bbc5ff531d1e90e856f60fc4b3afd708',
@@ -21,13 +20,6 @@ class TVCIE(InfoExtractor):
          },
      }
  
-    @classmethod
-    def _extract_url(cls, webpage):
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:http:)?//(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1', webpage)
-        if mobj:
-            return mobj.group('url')
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/tvigle.py b/yt_dlp/extractor/tvigle.py

index cc1d35dc239b29872e941addd5cdbc2e9c749d02..9a7cb7214ceafdeaf2f1259832a0ad2cae278e56 100644 (file)
--- a/yt_dlp/extractor/tvigle.py
+++ b/yt_dlp/extractor/tvigle.py
@@ -13,6 +13,7 @@ class TvigleIE(InfoExtractor):
      IE_NAME = 'tvigle'
      IE_DESC = 'Интернет-телевидение Tvigle.ru'
      _VALID_URL = r'https?://(?:www\.)?(?:tvigle\.ru/(?:[^/]+/)+(?P<display_id>[^/]+)/$|cloud\.tvigle\.ru/video/(?P<id>\d+))'
+    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1']
  
      _GEO_BYPASS = False
      _GEO_COUNTRIES = ['RU']
diff --git a/yt_dlp/extractor/tvopengr.py b/yt_dlp/extractor/tvopengr.py

index aded261f36615e1161181bf98026341782a23633..d8be12c964c122de308f4e67f962978a4e39c645 100644 (file)
--- a/yt_dlp/extractor/tvopengr.py
+++ b/yt_dlp/extractor/tvopengr.py
@@ -1,11 +1,8 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import (
      determine_ext,
      get_elements_text_and_html_by_attribute,
      scale_thumbnails_to_max_format_width,
-    unescapeHTML,
  )
  
  
@@ -98,7 +95,7 @@ class TVOpenGrEmbedIE(TVOpenGrBaseIE):
      IE_NAME = 'tvopengr:embed'
      IE_DESC = 'tvopen.gr embedded videos'
      _VALID_URL = r'(?:https?:)?//(?:www\.|cdn\.|)(?:tvopen|ethnos).gr/embed/(?P<id>\d+)'
-    _EMBED_RE = re.compile(rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''')
+    _EMBED_REGEX = [rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''']
  
      _TESTS = [{
          'url': 'https://cdn.ethnos.gr/embed/100963',
@@ -115,11 +112,6 @@ class TVOpenGrEmbedIE(TVOpenGrBaseIE):
          },
      }]
  
-    @classmethod
-    def _extract_urls(cls, webpage):
-        for mobj in cls._EMBED_RE.finditer(webpage):
-            yield unescapeHTML(mobj.group('url'))
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
          return self._return_canonical_url(url, video_id)
diff --git a/yt_dlp/extractor/tvp.py b/yt_dlp/extractor/tvp.py

index 69168f6558bde624db53d63439307da3f3c4b0f8..f1bc0fbba43bf9f3bf461a2e60aeb0a7dbb32831 100644 (file)
--- a/yt_dlp/extractor/tvp.py
+++ b/yt_dlp/extractor/tvp.py
@@ -310,6 +310,7 @@ class TVPEmbedIE(InfoExtractor):
                  =)
          (?P<id>\d+)
      '''
+    _EMBED_REGEX = [rf'(?x)<iframe[^>]+?src=(["\'])(?P<url>{_VALID_URL[4:]})']
  
      _TESTS = [{
          'url': 'tvp:194536',
@@ -340,12 +341,6 @@ class TVPEmbedIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage, **kw):
-        return [m.group('embed') for m in re.finditer(
-            r'(?x)<iframe[^>]+?src=(["\'])(?P<embed>%s)' % TVPEmbedIE._VALID_URL[4:],
-            webpage)]
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/twentymin.py b/yt_dlp/extractor/twentymin.py

index 616c3c36ed9cb417338fe89af204d6d6332d464c..f33f15914a12708364c67e012d3902f71409f186 100644 (file)
--- a/yt_dlp/extractor/twentymin.py
+++ b/yt_dlp/extractor/twentymin.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import (
      int_or_none,
@@ -18,6 +16,7 @@ class TwentyMinutenIE(InfoExtractor):
                          )
                          (?P<id>\d+)
                      '''
+    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:(?:https?:)?//)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1']
      _TESTS = [{
          'url': 'http://www.20min.ch/videotv/?vid=469148&cid=2',
          'md5': 'e7264320db31eed8c38364150c12496e',
@@ -44,12 +43,6 @@ class TwentyMinutenIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [m.group('url') for m in re.finditer(
-            r'<iframe[^>]+src=(["\'])(?P<url>(?:(?:https?:)?//)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1',
-            webpage)]
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/udn.py b/yt_dlp/extractor/udn.py

index 4fa74b9e84c87f7f9cf603f197a19c8d0e8b314f..9fdb46faf6c9b4bb88896cb8896cbae254141df1 100644 (file)
--- a/yt_dlp/extractor/udn.py
+++ b/yt_dlp/extractor/udn.py
@@ -13,6 +13,7 @@ class UDNEmbedIE(InfoExtractor):
      IE_DESC = '聯合影音'
      _PROTOCOL_RELATIVE_VALID_URL = r'//video\.udn\.com/(?:embed|play)/news/(?P<id>\d+)'
      _VALID_URL = r'https?:' + _PROTOCOL_RELATIVE_VALID_URL
+    _EMBED_REGEX = [r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % _PROTOCOL_RELATIVE_VALID_URL]
      _TESTS = [{
          'url': 'http://video.udn.com/embed/news/300040',
          'info_dict': {
diff --git a/yt_dlp/extractor/ustream.py b/yt_dlp/extractor/ustream.py

index fff21667aa2c08ad6d670f5f2e83a2521eda994d..cb920bf13b9dd3555cbe716f6a2d44e429e184c5 100644 (file)
--- a/yt_dlp/extractor/ustream.py
+++ b/yt_dlp/extractor/ustream.py
@@ -20,6 +20,7 @@
  class UstreamIE(InfoExtractor):
      _VALID_URL = r'https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)'
      IE_NAME = 'ustream'
+    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1']
      _TESTS = [{
          'url': 'http://www.ustream.tv/recorded/20274954',
          'md5': '088f151799e8f572f84eb62f17d73e5c',
@@ -71,13 +72,6 @@ class UstreamIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1', webpage)
-        if mobj is not None:
-            return mobj.group('url')
-
      def _get_stream_info(self, url, video_id, app_id_ver, extra_note=None):
          def num_to_hex(n):
              return hex(n)[2:]
diff --git a/yt_dlp/extractor/vbox7.py b/yt_dlp/extractor/vbox7.py

index 76c844cb8748f11a4d61981fb9b4d34f289053b5..be35dad1c3a14327c5dec004f9c07ae8f810b103 100644 (file)
--- a/yt_dlp/extractor/vbox7.py
+++ b/yt_dlp/extractor/vbox7.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import ExtractorError
  
@@ -17,6 +15,7 @@ class Vbox7IE(InfoExtractor):
                          )
                          (?P<id>[\da-fA-F]+)
                      '''
+    _EMBED_REGEX = [r'<iframe[^>]+src=(?P<q>["\'])(?P<url>(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)']
      _GEO_COUNTRIES = ['BG']
      _TESTS = [{
          'url': 'http://vbox7.com/play:0946fff23c',
@@ -51,14 +50,6 @@ class Vbox7IE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<iframe[^>]+src=(?P<q>["\'])(?P<url>(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)',
-            webpage)
-        if mobj:
-            return mobj.group('url')
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/vevo.py b/yt_dlp/extractor/vevo.py

index 825089f47145ec87a3e6cb1b369bd3e1e02f9e9b..a146be04862f9b4e71faf65b9f3b266bc62d90c4 100644 (file)
--- a/yt_dlp/extractor/vevo.py
+++ b/yt_dlp/extractor/vevo.py
@@ -36,6 +36,7 @@ class VevoIE(VevoBaseIE):
             https?://tv\.vevo\.com/watch/artist/(?:[^/]+)/|
             vevo:)
          (?P<id>[^&?#]+)'''
+    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1']
  
      _TESTS = [{
          'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
diff --git a/yt_dlp/extractor/vice.py b/yt_dlp/extractor/vice.py

index abb4a6fa0bd683edf7368b616132838a6db72660..f3ad56bf1151d0f08f9dfa0d528ceb4e99523891 100644 (file)
--- a/yt_dlp/extractor/vice.py
+++ b/yt_dlp/extractor/vice.py
@@ -2,7 +2,6 @@
  import hashlib
  import json
  import random
-import re
  import time
  
  from .adobepass import AdobePassIE
@@ -38,6 +37,7 @@ def _call_api(self, resource, resource_key, resource_id, locale, fields, args=''
  class ViceIE(ViceBaseIE, AdobePassIE):
      IE_NAME = 'vice'
      _VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P<locale>[^/]+)/(?:video/[^/]+|embed)/(?P<id>[\da-f]{24})'
+    _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]{24})']
      _TESTS = [{
          'url': 'https://video.vice.com/en_us/video/pet-cremator/58c69e38a55424f1227dc3f7',
          'info_dict': {
@@ -103,17 +103,6 @@ class ViceIE(ViceBaseIE, AdobePassIE):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(
-            r'<iframe\b[^>]+\bsrc=["\']((?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]{24})',
-            webpage)
-
-    @staticmethod
-    def _extract_url(webpage):
-        urls = ViceIE._extract_urls(webpage)
-        return urls[0] if urls else None
-
      def _real_extract(self, url):
          locale, video_id = self._match_valid_url(url).groups()
  
diff --git a/yt_dlp/extractor/viddler.py b/yt_dlp/extractor/viddler.py

index f491b67efd615a1c2338a5de9c479fb8c81d3dff..d81a31375d6520279d5e7e0201cd78b23f2faa1e 100644 (file)
--- a/yt_dlp/extractor/viddler.py
+++ b/yt_dlp/extractor/viddler.py
@@ -7,6 +7,8 @@
  
  class ViddlerIE(InfoExtractor):
      _VALID_URL = r'https?://(?:www\.)?viddler\.com/(?:v|embed|player)/(?P<id>[a-z0-9]+)(?:.+?\bsecret=(\d+))?'
+    _EMBED_REGEX = [r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1']
+
      _TESTS = [{
          'url': 'http://www.viddler.com/v/43903784',
          'md5': '9eee21161d2c7f5b39690c3e325fab2f',
diff --git a/yt_dlp/extractor/videa.py b/yt_dlp/extractor/videa.py

index 9b05c86a5e10eee342a1dfc39741d68248c7862b..fa16da28b49eb6f194d9a9fcd9e9ffc4f3c37f9d 100644 (file)
--- a/yt_dlp/extractor/videa.py
+++ b/yt_dlp/extractor/videa.py
@@ -1,5 +1,4 @@
  import random
-import re
  import string
  import struct
  
@@ -29,6 +28,7 @@ class VideaIE(InfoExtractor):
                          )
                          (?P<id>[^?#&]+)
                      '''
+    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1']
      _TESTS = [{
          'url': 'http://videa.hu/videok/allatok/az-orult-kigyasz-285-kigyot-kigyo-8YfIAjxwWGwT8HVQ',
          'md5': '97a7af41faeaffd9f1fc864a7c7e7603',
@@ -74,12 +74,6 @@ class VideaIE(InfoExtractor):
      }]
      _STATIC_SECRET = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p'
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [url for _, url in re.findall(
-            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1',
-            webpage)]
-
      @staticmethod
      def rc4(cipher_text, key):
          res = b''
diff --git a/yt_dlp/extractor/videomore.py b/yt_dlp/extractor/videomore.py

index 09d12d192e3734c66be1d1113d34fec8ec88dedf..2f81860bb7dc81c70077bf8416f8cc5c7c02e27b 100644 (file)
--- a/yt_dlp/extractor/videomore.py
+++ b/yt_dlp/extractor/videomore.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..compat import (
      compat_str,
@@ -47,6 +45,12 @@ class VideomoreIE(InfoExtractor):
                          (?P<id>\d+)
                          (?:[/?#&]|\.(?:xml|json)|$)
                      '''
+    _EMBED_REGEX = [r'''(?x)
+        (?:
+            <iframe[^>]+src=([\'"])|
+            <object[^>]+data=(["\'])https?://videomore\.ru/player\.swf\?.*config=
+        )(?P<url>https?://videomore\.ru/[^?#"']+/\d+(?:\.xml)?)
+    ''']
      _TESTS = [{
          'url': 'http://videomore.ru/kino_v_detalayah/5_sezon/367617',
          'md5': '44455a346edc0d509ac5b5a5b531dc35',
@@ -126,19 +130,6 @@ class VideomoreIE(InfoExtractor):
      }]
      _GEO_BYPASS = False
  
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<object[^>]+data=(["\'])https?://videomore\.ru/player\.swf\?.*config=(?P<url>https?://videomore\.ru/(?:[^/]+/)+\d+\.xml).*\1',
-            webpage)
-        if not mobj:
-            mobj = re.search(
-                r'<iframe[^>]+src=([\'"])(?P<url>https?://videomore\.ru/embed/\d+)',
-                webpage)
-
-        if mobj:
-            return mobj.group('url')
-
      def _real_extract(self, url):
          mobj = self._match_valid_url(url)
          video_id = mobj.group('sid') or mobj.group('id')
diff --git a/yt_dlp/extractor/videopress.py b/yt_dlp/extractor/videopress.py

index 3c5e27a9dcfed9a71844c490180e91293492fa8f..16965dfb0e710c883a6a8f6dcf0e8d83cdb7ac4e 100644 (file)
--- a/yt_dlp/extractor/videopress.py
+++ b/yt_dlp/extractor/videopress.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import (
      determine_ext,
@@ -17,6 +15,7 @@ class VideoPressIE(InfoExtractor):
      _ID_REGEX = r'[\da-zA-Z]{8}'
      _PATH_REGEX = r'video(?:\.word)?press\.com/embed/'
      _VALID_URL = r'https?://%s(?P<id>%s)' % (_PATH_REGEX, _ID_REGEX)
+    _EMBED_REGEX = [rf'<iframe[^>]+src=["\'](?P<url>(?:https?://)?{_PATH_REGEX}{_ID_REGEX})']
      _TESTS = [{
          'url': 'https://videopress.com/embed/kUJmAcSf',
          'md5': '706956a6c875873d51010921310e4bc6',
@@ -39,12 +38,6 @@ class VideoPressIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(
-            r'<iframe[^>]+src=["\']((?:https?://)?%s%s)' % (VideoPressIE._PATH_REGEX, VideoPressIE._ID_REGEX),
-            webpage)
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/viewlift.py b/yt_dlp/extractor/viewlift.py

index d081a2f125d61cb97bfac8d6cee7080a92baa62f..b630f9a6d74ec92885e08950d88242aff1f5a592 100644 (file)
--- a/yt_dlp/extractor/viewlift.py
+++ b/yt_dlp/extractor/viewlift.py
@@ -1,5 +1,4 @@
  import json
-import re
  
  from .common import InfoExtractor
  from ..compat import compat_HTTPError
@@ -63,6 +62,7 @@ def _call_api(self, site, path, video_id, url, query):
  class ViewLiftEmbedIE(ViewLiftBaseIE):
      IE_NAME = 'viewlift:embed'
      _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?P<domain>%s)/embed/player\?.*\bfilmId=(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX
+    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?(?:%s)/embed/player.+?)\1' % ViewLiftBaseIE._DOMAINS_REGEX]
      _TESTS = [{
          'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500',
          'md5': '2924e9215c6eff7a55ed35b72276bd93',
@@ -89,14 +89,6 @@ class ViewLiftEmbedIE(ViewLiftBaseIE):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?(?:%s)/embed/player.+?)\1' % ViewLiftBaseIE._DOMAINS_REGEX,
-            webpage)
-        if mobj:
-            return mobj.group('url')
-
      def _real_extract(self, url):
          domain, film_id = self._match_valid_url(url).groups()
          site = domain.split('.')[-2]
diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py

index 9617343457d0d34743e1e0c5d03c69012c65e584..1c9e2453add114b1c363e58e882f70f6bc2425c7 100644 (file)
--- a/yt_dlp/extractor/vimeo.py
+++ b/yt_dlp/extractor/vimeo.py
@@ -30,7 +30,6 @@
      unsmuggle_url,
      urlencode_postdata,
      urljoin,
-    unescapeHTML,
      urlhandle_detect_ext,
  )
  
@@ -328,6 +327,14 @@ class VimeoIE(VimeoBaseInfoExtractor):
                          /?(?:[?&].*)?(?:[#].*)?$
                      '''
      IE_NAME = 'vimeo'
+    _EMBED_REGEX = [
+        # iframe
+        r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/\d+.*?)\1',
+        # Embedded (swf embed) Vimeo player
+        r'<embed[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)\1',
+        # Non-standard embedded Vimeo player
+        r'<video[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)\1',
+    ]
      _TESTS = [
          {
              'url': 'http://vimeo.com/56015672#at=0',
@@ -729,29 +736,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
          # vimeo embed with check-password page protected by Referer header
      ]
  
-    @staticmethod
-    def _extract_urls(url, webpage):
-        urls = []
-        # Look for embedded (iframe) Vimeo player
-        for mobj in re.finditer(
-                r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/\d+.*?)\1',
-                webpage):
-            urls.append(VimeoIE._smuggle_referrer(unescapeHTML(mobj.group('url')), url))
-        PLAIN_EMBED_RE = (
-            # Look for embedded (swf embed) Vimeo player
-            r'<embed[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)\1',
-            # Look more for non-standard embedded Vimeo player
-            r'<video[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)\1',
-        )
-        for embed_re in PLAIN_EMBED_RE:
-            for mobj in re.finditer(embed_re, webpage):
-                urls.append(mobj.group('url'))
-        return urls
-
-    @staticmethod
-    def _extract_url(url, webpage):
-        urls = VimeoIE._extract_urls(url, webpage)
-        return urls[0] if urls else None
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        for embed_url in super()._extract_embed_urls(url, webpage):
+            yield cls._smuggle_referrer(embed_url, url)
  
      def _verify_player_video_password(self, url, video_id, headers):
          password = self._get_video_password()
@@ -1386,12 +1374,12 @@ def _real_extract(self, url):
  class VHXEmbedIE(VimeoBaseInfoExtractor):
      IE_NAME = 'vhx:embed'
      _VALID_URL = r'https?://embed\.vhx\.tv/videos/(?P<id>\d+)'
+    _EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://embed\.vhx\.tv/videos/\d+[^"]*)"']
  
-    @staticmethod
-    def _extract_url(url, webpage):
-        mobj = re.search(
-            r'<iframe[^>]+src="(https?://embed\.vhx\.tv/videos/\d+[^"]*)"', webpage)
-        return VimeoIE._smuggle_referrer(unescapeHTML(mobj.group(1)), url) if mobj else None
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        for embed_url in super()._extract_embed_urls(url, webpage):
+            yield cls._smuggle_referrer(embed_url, url)
  
      def _real_extract(self, url):
          video_id = self._match_id(url)
diff --git a/yt_dlp/extractor/vine.py b/yt_dlp/extractor/vine.py

index 947f5cdb698337692caeab70c5df58438d8ef2bd..8e57201f68fa852ae7b32499aa6ca7408b052675 100644 (file)
--- a/yt_dlp/extractor/vine.py
+++ b/yt_dlp/extractor/vine.py
@@ -10,6 +10,7 @@
  
  class VineIE(InfoExtractor):
      _VALID_URL = r'https?://(?:www\.)?vine\.co/(?:v|oembed)/(?P<id>\w+)'
+    _EMBED_REGEX = [r'<iframe[^>]+src=[\'"](?P<url>(?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))']
      _TESTS = [{
          'url': 'https://vine.co/v/b9KOOWX7HUx',
          'md5': '2f36fed6235b16da96ce9b4dc890940d',
diff --git a/yt_dlp/extractor/viqeo.py b/yt_dlp/extractor/viqeo.py

index d214223e9afac27e252d3404bd7f6e043f533737..574622fa92dc537faac100f994e15b6677fec3ed 100644 (file)
--- a/yt_dlp/extractor/viqeo.py
+++ b/yt_dlp/extractor/viqeo.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import (
      int_or_none,
@@ -17,6 +15,7 @@ class ViqeoIE(InfoExtractor):
                          )
                          (?P<id>[\da-f]+)
                      '''
+    _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cdn\.viqeo\.tv/embed/*\?.*?\bvid=[\da-f]+.*?)\1']
      _TESTS = [{
          'url': 'https://cdn.viqeo.tv/embed/?vid=cde96f09d25f39bee837',
          'md5': 'a169dd1a6426b350dca4296226f21e76',
@@ -35,14 +34,6 @@ class ViqeoIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [
-            mobj.group('url')
-            for mobj in re.finditer(
-                r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cdn\.viqeo\.tv/embed/*\?.*?\bvid=[\da-f]+.*?)\1',
-                webpage)]
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py

index bad0b4ff4207312c8a3f87830babc15a3b99d2d3..95ea63ffa13fd71c6110ce63b892d124ce9fb76b 100644 (file)
--- a/yt_dlp/extractor/vk.py
+++ b/yt_dlp/extractor/vk.py
@@ -85,6 +85,7 @@ def _download_payload(self, path, video_id, data, fatal=True):
  class VKIE(VKBaseIE):
      IE_NAME = 'vk'
      IE_DESC = 'VK'
+    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1']
      _VALID_URL = r'''(?x)
                      https?://
                          (?:
@@ -100,6 +101,8 @@ class VKIE(VKBaseIE):
                              (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>([\da-f]+)|(ln-[\da-zA-Z]+)))?
                          )
                      '''
+    # https://help.sibnet.ru/?sibnet_video_embed
+    _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.sibnet\.ru/shell\.php\?.*?\bvideoid=\d+.*?)\1']
      _TESTS = [
          {
              'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521',
@@ -344,13 +347,6 @@ class VKIE(VKBaseIE):
              'only_matching': True,
          }]
  
-    @staticmethod
-    def _extract_sibnet_urls(webpage):
-        # https://help.sibnet.ru/?sibnet_video_embed
-        return [unescapeHTML(mobj.group('url')) for mobj in re.finditer(
-            r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.sibnet\.ru/shell\.php\?.*?\bvideoid=\d+.*?)\1',
-            webpage)]
-
      def _real_extract(self, url):
          mobj = self._match_valid_url(url)
          video_id = mobj.group('videoid')
@@ -451,7 +447,7 @@ def _real_extract(self, url):
                  m_rutube.group(1).replace('\\', ''))
              return self.url_result(rutube_url)
  
-        dailymotion_urls = DailymotionIE._extract_urls(info_page)
+        dailymotion_urls = DailymotionIE._extract_embed_urls(url, info_page)
          if dailymotion_urls:
              return self.url_result(dailymotion_urls[0], DailymotionIE.ie_key())
  
@@ -459,7 +455,7 @@ def _real_extract(self, url):
          if odnoklassniki_url:
              return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key())
  
-        sibnet_urls = self._extract_sibnet_urls(info_page)
+        sibnet_urls = self._extract_embed_urls(url, info_page)
          if sibnet_urls:
              return self.url_result(sibnet_urls[0])
  
diff --git a/yt_dlp/extractor/vodplatform.py b/yt_dlp/extractor/vodplatform.py

index 2b45dcd866181902b864d20cadd820095ea7114f..0d3e7eec2108257f3f512562e4a289b5a2d3ba5c 100644 (file)
--- a/yt_dlp/extractor/vodplatform.py
+++ b/yt_dlp/extractor/vodplatform.py
@@ -4,6 +4,7 @@
  
  class VODPlatformIE(InfoExtractor):
      _VALID_URL = r'https?://(?:(?:www\.)?vod-platform\.net|embed\.kwikmotion\.com)/[eE]mbed/(?P<id>[^/?#]+)'
+    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:(?:www\.)?vod-platform\.net|embed\.kwikmotion\.com)/[eE]mbed/.+?)\1']
      _TESTS = [{
          # from http://www.lbcgroup.tv/watch/chapter/29143/52844/%D8%A7%D9%84%D9%86%D8%B5%D8%B1%D8%A9-%D9%81%D9%8A-%D8%B6%D9%8A%D8%A7%D9%81%D8%A9-%D8%A7%D9%84%D9%80-cnn/ar
          'url': 'http://vod-platform.net/embed/RufMcytHDolTH1MuKHY9Fw',
diff --git a/yt_dlp/extractor/voxmedia.py b/yt_dlp/extractor/voxmedia.py

index a7bf298aaf753cae853b4658572896fa79b4de00..96c782d8b3e45a10c425a42bed14070c3f1d99be 100644 (file)
--- a/yt_dlp/extractor/voxmedia.py
+++ b/yt_dlp/extractor/voxmedia.py
@@ -71,6 +71,7 @@ def _real_extract(self, url):
  
  class VoxMediaIE(InfoExtractor):
      _VALID_URL = r'https?://(?:www\.)?(?:(?:theverge|vox|sbnation|eater|polygon|curbed|racked|funnyordie)\.com|recode\.net)/(?:[^/]+/)*(?P<id>[^/?]+)'
+    _EMBED_REGEX = [r'<iframe[^>]+?src="(?P<url>https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"']
      _TESTS = [{
          # Volume embed, Youtube
          'url': 'http://www.theverge.com/2014/6/27/5849272/material-world-how-google-discovered-what-software-is-made-of',
diff --git a/yt_dlp/extractor/vshare.py b/yt_dlp/extractor/vshare.py

index fd5226bbc650e34fd13815ed66f6404d5ad4dea0..93842db794f4b2115165bdacf353ea78c71a56d1 100644 (file)
--- a/yt_dlp/extractor/vshare.py
+++ b/yt_dlp/extractor/vshare.py
@@ -1,11 +1,10 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import ExtractorError, decode_packed_codes
  
  
  class VShareIE(InfoExtractor):
      _VALID_URL = r'https?://(?:www\.)?vshare\.io/[dv]/(?P<id>[^/?#&]+)'
+    _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?vshare\.io/v/[^/?#&]+)']
      _TESTS = [{
          'url': 'https://vshare.io/d/0f64ce6',
          'md5': '17b39f55b5497ae8b59f5fbce8e35886',
@@ -19,12 +18,6 @@ class VShareIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(
-            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?vshare\.io/v/[^/?#&]+)',
-            webpage)
-
      def _extract_packed(self, webpage):
          packed = self._search_regex(
              r'(eval\(function.+)', webpage, 'packed code')
diff --git a/yt_dlp/extractor/vzaar.py b/yt_dlp/extractor/vzaar.py

index 7ce0ba9f56e41189ea0766603c84d0c5e9c5873d..df43caf38064d18e68b7da68456edc6bdc265a35 100644 (file)
--- a/yt_dlp/extractor/vzaar.py
+++ b/yt_dlp/extractor/vzaar.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..compat import compat_str
  from ..utils import (
@@ -12,6 +10,7 @@
  
  class VzaarIE(InfoExtractor):
      _VALID_URL = r'https?://(?:(?:www|view)\.)?vzaar\.com/(?:videos/)?(?P<id>\d+)'
+    _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//(?:view\.vzaar\.com)/[0-9]+)']
      _TESTS = [{
          # HTTP and HLS
          'url': 'https://vzaar.com/videos/1152805',
@@ -47,12 +46,6 @@ class VzaarIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(
-            r'<iframe[^>]+src=["\']((?:https?:)?//(?:view\.vzaar\.com)/[0-9]+)',
-            webpage)
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
          video_data = self._download_json(
diff --git a/yt_dlp/extractor/washingtonpost.py b/yt_dlp/extractor/washingtonpost.py

index 7274eaa396401d10913374f41540866e4070987d..74501b1d2913b1c3ed814e4407911772c283bf5f 100644 (file)
--- a/yt_dlp/extractor/washingtonpost.py
+++ b/yt_dlp/extractor/washingtonpost.py
@@ -8,7 +8,7 @@
  class WashingtonPostIE(InfoExtractor):
      IE_NAME = 'washingtonpost'
      _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/(?:video|posttv)/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
-    _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
+    _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})']
      _TESTS = [{
          'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
          'md5': '6f537e1334b714eb15f9563bd4b9cdfa',
@@ -28,11 +28,6 @@ class WashingtonPostIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @classmethod
-    def _extract_urls(cls, webpage):
-        return re.findall(
-            r'<iframe[^>]+\bsrc=["\'](%s)' % cls._EMBED_URL, webpage)
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
          return self.url_result(
diff --git a/yt_dlp/extractor/webcaster.py b/yt_dlp/extractor/webcaster.py

index 374fe35cd33d25f1e441a505f9d599ae5e4aeb79..a66a5f8c5e639ed60237122a4f9989d32da3a9a3 100644 (file)
--- a/yt_dlp/extractor/webcaster.py
+++ b/yt_dlp/extractor/webcaster.py
@@ -64,27 +64,23 @@ def _real_extract(self, url):
  
  class WebcasterFeedIE(InfoExtractor):
      _VALID_URL = r'https?://bl\.webcaster\.pro/feed/start/free_(?P<id>[^/]+)'
+    _EMBED_REGEX = [r'<(?:object|a[^>]+class=["\']webcaster-player["\'])[^>]+data(?:-config)?=(["\']).*?config=(?P<url>https?://bl\.webcaster\.pro/feed/start/free_.*?)(?:[?&]|\1)']
      _TEST = {
          'url': 'http://bl.webcaster.pro/feed/start/free_c8cefd240aa593681c8d068cff59f407_hd/q393859/eb173f99dd5f558674dae55f4ba6806d/1480289104',
          'only_matching': True,
      }
  
-    @staticmethod
-    def _extract_url(ie, webpage):
-        mobj = re.search(
-            r'<(?:object|a[^>]+class=["\']webcaster-player["\'])[^>]+data(?:-config)?=(["\']).*?config=(?P<url>https?://bl\.webcaster\.pro/feed/start/free_.*?)(?:[?&]|\1)',
-            webpage)
-        if mobj:
-            return mobj.group('url')
+    def _extract_from_webpage(self, url, webpage):
+        yield from super()._extract_from_webpage(url, webpage)
+
          for secure in (True, False):
-            video_url = ie._og_search_video_url(
-                webpage, secure=secure, default=None)
+            video_url = self._og_search_video_url(webpage, secure=secure, default=None)
              if video_url:
                  mobj = re.search(
                      r'config=(?P<url>https?://bl\.webcaster\.pro/feed/start/free_[^?&=]+)',
                      video_url)
                  if mobj:
-                    return mobj.group('url')
+                    yield self.url_result(mobj.group('url'), self)
  
      def _real_extract(self, url):
          video_id = self._match_id(url)
diff --git a/yt_dlp/extractor/wimtv.py b/yt_dlp/extractor/wimtv.py

index 263844d72b743de5744cde44e72a690da504f00a..d27a348d99bc24bf29b3d0517187a85857df2349 100644 (file)
--- a/yt_dlp/extractor/wimtv.py
+++ b/yt_dlp/extractor/wimtv.py
@@ -1,5 +1,3 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import (
      determine_ext,
@@ -20,6 +18,7 @@ class WimTVIE(InfoExtractor):
          )
          (?P<type>vod|live|cast)[=/]
          (?P<id>%s).*?)''' % _UUID_RE
+    _EMBED_REGEX = [rf'<iframe[^>]+src=["\'](?P<url>{_VALID_URL})']
      _TESTS = [{
          # vod stream
          'url': 'https://platform.wim.tv/embed/?vod=db29fb32-bade-47b6-a3a6-cb69fe80267a',
@@ -54,14 +53,6 @@ class WimTVIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [
-            mobj.group('url')
-            for mobj in re.finditer(
-                r'<iframe[^>]+src=["\'](?P<url>%s)' % WimTVIE._VALID_URL,
-                webpage)]
-
      def _real_initialize(self):
          if not self._player:
              self._get_player_data()
diff --git a/yt_dlp/extractor/wistia.py b/yt_dlp/extractor/wistia.py

index 3cbcb4aa0f0fae36ee6040db37fff547dc7dc2e7..438828624198c9247fb65c817af4d36e6ec7d0a7 100644 (file)
--- a/yt_dlp/extractor/wistia.py
+++ b/yt_dlp/extractor/wistia.py
@@ -5,8 +5,8 @@
      ExtractorError,
      float_or_none,
      int_or_none,
+    try_call,
      try_get,
-    unescapeHTML,
  )
  
  
@@ -117,7 +117,7 @@ def _extract_media(self, embed_config):
  
  class WistiaIE(WistiaBaseIE):
      _VALID_URL = r'(?:wistia:|%s(?:iframe|medias)/)%s' % (WistiaBaseIE._VALID_URL_BASE, WistiaBaseIE._VALID_ID_REGEX)
-
+    _EMBED_REGEX = [r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})']
      _TESTS = [{
          # with hls video
          'url': 'wistia:807fafadvk',
@@ -146,17 +146,10 @@ class WistiaIE(WistiaBaseIE):
      }]
  
      # https://wistia.com/support/embed-and-share/video-on-your-website
-    @staticmethod
-    def _extract_url(webpage):
-        urls = WistiaIE._extract_urls(webpage)
-        return urls[0] if urls else None
-
-    @staticmethod
-    def _extract_urls(webpage):
-        urls = []
-        for match in re.finditer(
-                r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage):
-            urls.append(unescapeHTML(match.group('url')))
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        urls = list(super()._extract_embed_urls(url, webpage))
+
          for match in re.finditer(
                  r'''(?sx)
                      <div[^>]+class=(["'])(?:(?!\1).)*?\bwistia_async_(?P<id>[a-z0-9]{10})\b(?:(?!\1).)*?\1
@@ -166,6 +159,20 @@ def _extract_urls(webpage):
              urls.append('wistia:%s' % match.group('id'))
          return urls
  
+    @classmethod
+    def _extract_from_webpage(cls, url, webpage):
+        from .teachable import TeachableIE
+
+        if list(TeachableIE._extract_embed_urls(url, webpage)):
+            return
+
+        for entry in super()._extract_from_webpage(url, webpage):
+            yield {
+                **entry,
+                '_type': 'url_transparent',
+                'uploader': try_call(lambda: re.match(r'(?:https?://)?([^/]+)/', url).group(1)),
+            }
+
      def _real_extract(self, url):
          video_id = self._match_id(url)
          embed_config = self._download_embed_config('media', video_id, url)
diff --git a/yt_dlp/extractor/xfileshare.py b/yt_dlp/extractor/xfileshare.py

index 63abe4a1f4e8e6387bf83519ad04b2f4802a708b..5ecd7f00f0d69ef7f4bc7c86c425d109e22ce497 100644 (file)
--- a/yt_dlp/extractor/xfileshare.py
+++ b/yt_dlp/extractor/xfileshare.py
@@ -61,6 +61,7 @@ class XFileShareIE(InfoExtractor):
      IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1])
      _VALID_URL = (r'https?://(?:www\.)?(?P<host>%s)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
                    % '|'.join(site for site in list(zip(*_SITES))[0]))
+    _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1' % '|'.join(site for site in list(zip(*_SITES))[0])]
  
      _FILE_NOT_FOUND_REGEXES = (
          r'>(?:404 - )?File Not Found<',
@@ -84,15 +85,6 @@ class XFileShareIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [
-            mobj.group('url')
-            for mobj in re.finditer(
-                r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1'
-                % '|'.join(site for site in list(zip(*XFileShareIE._SITES))[0]),
-                webpage)]
-
      def _real_extract(self, url):
          host, video_id = self._match_valid_url(url).groups()
  
diff --git a/yt_dlp/extractor/xhamster.py b/yt_dlp/extractor/xhamster.py

index e42eed7d8e28bb3a4878d07c13df0c850b353358..688c6b952272954359e530ff2eecd2ce2e0a168b 100644 (file)
--- a/yt_dlp/extractor/xhamster.py
+++ b/yt_dlp/extractor/xhamster.py
@@ -373,6 +373,7 @@ def get_height(s):
  
  class XHamsterEmbedIE(InfoExtractor):
      _VALID_URL = r'https?://(?:.+?\.)?%s/xembed\.php\?video=(?P<id>\d+)' % XHamsterIE._DOMAINS
+    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1']
      _TEST = {
          'url': 'http://xhamster.com/xembed.php?video=3328539',
          'info_dict': {
@@ -387,12 +388,6 @@ class XHamsterEmbedIE(InfoExtractor):
          }
      }
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [url for _, url in re.findall(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1',
-            webpage)]
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/yahoo.py b/yt_dlp/extractor/yahoo.py

index f85990e0a586e121268ee0af69201b0a6cf8c3dc..01a859556a49282672c146a9bbcae9e164fb850c 100644 (file)
--- a/yt_dlp/extractor/yahoo.py
+++ b/yt_dlp/extractor/yahoo.py
@@ -21,6 +21,8 @@
  class YahooIE(InfoExtractor):
      IE_DESC = 'Yahoo screen and movies'
      _VALID_URL = r'(?P<url>https?://(?:(?P<country>[a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\.)?(?:[\da-zA-Z_-]+\.)?yahoo\.com/(?:[^/]+/)*(?P<id>[^?&#]*-[0-9]+(?:-[a-z]+)?)\.html)'
+    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1']
+
      _TESTS = [{
          'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
          'info_dict': {
@@ -310,7 +312,7 @@ def _real_extract(self, url):
  
              if items.get('markup'):
                  entries.extend(
-                    self.url_result(yt_url) for yt_url in YoutubeIE._extract_urls(items['markup']))
+                    self.url_result(yt_url) for yt_url in YoutubeIE._extract_embed_urls(url, items['markup']))
  
              return self.playlist_result(
                  entries, item.get('uuid'),
diff --git a/yt_dlp/extractor/yapfiles.py b/yt_dlp/extractor/yapfiles.py

index 8fabdf81c95fba6e39bd039b4aa348a61414ece3..221df842cb084d219959cbc99af874dec6902cc6 100644 (file)
--- a/yt_dlp/extractor/yapfiles.py
+++ b/yt_dlp/extractor/yapfiles.py
@@ -1,11 +1,8 @@
-import re
-
  from .common import InfoExtractor
  from ..utils import (
      ExtractorError,
      int_or_none,
      qualities,
-    unescapeHTML,
      url_or_none,
  )
  
@@ -13,6 +10,7 @@
  class YapFilesIE(InfoExtractor):
      _YAPFILES_URL = r'//(?:(?:www|api)\.)?yapfiles\.ru/get_player/*\?.*?\bv=(?P<id>\w+)'
      _VALID_URL = r'https?:%s' % _YAPFILES_URL
+    _EMBED_REGEX = [rf'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?{_YAPFILES_URL}.*?)\1']
      _TESTS = [{
          # with hd
          'url': 'http://www.yapfiles.ru/get_player/?v=vMDE1NjcyNDUt0413',
@@ -30,12 +28,6 @@ class YapFilesIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [unescapeHTML(mobj.group('url')) for mobj in re.finditer(
-            r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?%s.*?)\1'
-            % YapFilesIE._YAPFILES_URL, webpage)]
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
  
diff --git a/yt_dlp/extractor/youporn.py b/yt_dlp/extractor/youporn.py

index b484e08ece573b1bff1dbcd703881f1e5af2b08b..7fdb865f7828a342c49eb63fd151c318d1225a28 100644 (file)
--- a/yt_dlp/extractor/youporn.py
+++ b/yt_dlp/extractor/youporn.py
@@ -12,6 +12,7 @@
  
  class YouPornIE(InfoExtractor):
      _VALID_URL = r'https?://(?:www\.)?youporn\.com/(?:watch|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?'
+    _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)']
      _TESTS = [{
          'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
          'md5': '3744d24c50438cf5b6f6d59feb5055c2',
@@ -65,12 +66,6 @@ class YouPornIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(
-            r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)',
-            webpage)
-
      def _real_extract(self, url):
          mobj = self._match_valid_url(url)
          video_id = mobj.group('id')
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py

index 4dc8e79ac1ba22576cb46293052ba88dad835778..f20b7321ad7c7cec7350fe39443a0ad058a6365d 100644 (file)
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -929,6 +929,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                       (?:\#|$)""" % {
          'invidious': '|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES),
      }
+    _EMBED_REGEX = [r'''(?x)
+        (?:
+            <iframe[^>]+?src=|
+            data-video-url=|
+            <embed[^>]+?src=|
+            embedSWF\(?:\s*|
+            <object[^>]+data=|
+            new\s+SWFObject\(
+        )
+        (["\'])
+            (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
+            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
+        \1''']
      _PLAYER_INFO_RE = (
          r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
          r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
@@ -2721,42 +2734,29 @@ def _mark_watched(self, video_id, player_responses):
                  url, video_id, f'Marking {label}watched',
                  'Unable to mark watched', fatal=False)
  
-    @staticmethod
-    def _extract_urls(webpage):
-        # Embedded YouTube player
-        entries = [
-            unescapeHTML(mobj.group('url'))
-            for mobj in re.finditer(r'''(?x)
-            (?:
-                <iframe[^>]+?src=|
-                data-video-url=|
-                <embed[^>]+?src=|
-                embedSWF\(?:\s*|
-                <object[^>]+data=|
-                new\s+SWFObject\(
-            )
-            (["\'])
-                (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
-                (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
-            \1''', webpage)]
+    @classmethod
+    def _extract_from_webpage(cls, url, webpage):
+        # Invidious Instances
+        # https://github.com/yt-dlp/yt-dlp/issues/195
+        # https://github.com/iv-org/invidious/pull/1730
+        mobj = re.search(
+            r'<link rel="alternate" href="(?P<url>https://www\.youtube\.com/watch\?v=[0-9A-Za-z_-]{11})"',
+            webpage)
+        if mobj:
+            yield cls.url_result(mobj.group('url'), cls)
+            raise cls.StopExtraction()
+
+        yield from super()._extract_from_webpage(url, webpage)
  
          # lazyYT YouTube embed
-        entries.extend(list(map(
-            unescapeHTML,
-            re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
+        for id_ in re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage):
+            yield cls.url_result(unescapeHTML(id_), cls, id_)
  
          # Wordpress "YouTube Video Importer" plugin
-        matches = re.findall(r'''(?x)<div[^>]+
-            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
-            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
-        entries.extend(m[-1] for m in matches)
-
-        return entries
-
-    @staticmethod
-    def _extract_url(webpage):
-        urls = YoutubeIE._extract_urls(webpage)
-        return urls[0] if urls else None
+        for m in re.findall(r'''(?x)<div[^>]+
+                class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
+                data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage):
+            yield cls.url_result(m[-1], cls, m[-1])
  
      @classmethod
      def extract_id(cls, url):
diff --git a/yt_dlp/extractor/zapiks.py b/yt_dlp/extractor/zapiks.py

index a1546fd88f6207bafcd9a8b42a4c3719642f5699..4b18cb86c8f9a98a2920c03a0a78cd3c29e61a67 100644 (file)
--- a/yt_dlp/extractor/zapiks.py
+++ b/yt_dlp/extractor/zapiks.py
@@ -12,6 +12,7 @@
  
  class ZapiksIE(InfoExtractor):
      _VALID_URL = r'https?://(?:www\.)?zapiks\.(?:fr|com)/(?:(?:[a-z]{2}/)?(?P<display_id>.+?)\.html|index\.php\?.*\bmedia_id=(?P<id>\d+))'
+    _EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"']
      _TESTS = [
          {
              'url': 'http://www.zapiks.fr/ep2s3-bon-appetit-eh-be-viva.html',
diff --git a/yt_dlp/extractor/zype.py b/yt_dlp/extractor/zype.py

index 6f2fbb9e9d6e59d648134fae87967f7073ca6d3f..a705149e62343940ee3372da936a79bf6bc553b7 100644 (file)
--- a/yt_dlp/extractor/zype.py
+++ b/yt_dlp/extractor/zype.py
@@ -15,6 +15,7 @@ class ZypeIE(InfoExtractor):
      _ID_RE = r'[\da-fA-F]+'
      _COMMON_RE = r'//player\.zype\.com/embed/%s\.(?:js|json|html)\?.*?(?:access_token|(?:ap[ip]|player)_key)='
      _VALID_URL = r'https?:%s[^&]+' % (_COMMON_RE % ('(?P<id>%s)' % _ID_RE))
+    _EMBED_REGEX = [fr'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?{_COMMON_RE % _ID_RE}.+?)\1']
      _TEST = {
          'url': 'https://player.zype.com/embed/5b400b834b32992a310622b9.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ&autoplay=false&controls=true&da=false',
          'md5': 'eaee31d474c76a955bdaba02a505c595',
@@ -29,14 +30,6 @@ class ZypeIE(InfoExtractor):
          },
      }
  
-    @staticmethod
-    def _extract_urls(webpage):
-        return [
-            mobj.group('url')
-            for mobj in re.finditer(
-                r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?%s.+?)\1' % (ZypeIE._COMMON_RE % ZypeIE._ID_RE),
-                webpage)]
-
      def _real_extract(self, url):
          video_id = self._match_id(url)
author	pukkandan <redacted>
	Mon, 1 Aug 2022 01:23:25 +0000 (06:53 +0530)
committer	pukkandan <redacted>
	Mon, 1 Aug 2022 19:38:16 +0000 (01:08 +0530)
yt_dlp/extractor/_extractors.py		patch \| blob \| blame \| history
yt_dlp/extractor/adobetv.py		patch \| blob \| blame \| history
yt_dlp/extractor/ant1newsgr.py		patch \| blob \| blame \| history
yt_dlp/extractor/anvato.py		patch \| blob \| blame \| history
yt_dlp/extractor/apa.py		patch \| blob \| blame \| history
yt_dlp/extractor/aparat.py		patch \| blob \| blame \| history
yt_dlp/extractor/arcpublishing.py		patch \| blob \| blame \| history
yt_dlp/extractor/arkena.py		patch \| blob \| blame \| history
yt_dlp/extractor/arte.py		patch \| blob \| blame \| history
yt_dlp/extractor/bandcamp.py		patch \| blob \| blame \| history
yt_dlp/extractor/bbc.py		patch \| blob \| blame \| history
yt_dlp/extractor/bitchute.py		patch \| blob \| blame \| history
yt_dlp/extractor/blogger.py		patch \| blob \| blame \| history
yt_dlp/extractor/buzzfeed.py		patch \| blob \| blame \| history
yt_dlp/extractor/channel9.py		patch \| blob \| blame \| history
yt_dlp/extractor/cinchcast.py		patch \| blob \| blame \| history
yt_dlp/extractor/cloudflarestream.py		patch \| blob \| blame \| history
yt_dlp/extractor/common.py		patch \| blob \| blame \| history
yt_dlp/extractor/condenast.py		patch \| blob \| blame \| history
yt_dlp/extractor/crooksandliars.py		patch \| blob \| blame \| history
yt_dlp/extractor/cspan.py		patch \| blob \| blame \| history
yt_dlp/extractor/dailymail.py		patch \| blob \| blame \| history
yt_dlp/extractor/dailymotion.py		patch \| blob \| blame \| history
yt_dlp/extractor/dbtv.py		patch \| blob \| blame \| history
yt_dlp/extractor/digiteka.py		patch \| blob \| blame \| history
yt_dlp/extractor/drtuber.py		patch \| blob \| blame \| history
yt_dlp/extractor/eagleplatform.py		patch \| blob \| blame \| history
yt_dlp/extractor/embedly.py		patch \| blob \| blame \| history
yt_dlp/extractor/ertgr.py		patch \| blob \| blame \| history
yt_dlp/extractor/expressen.py		patch \| blob \| blame \| history
yt_dlp/extractor/facebook.py		patch \| blob \| blame \| history
yt_dlp/extractor/foxnews.py		patch \| blob \| blame \| history
yt_dlp/extractor/francetv.py		patch \| blob \| blame \| history
yt_dlp/extractor/gedidigital.py		patch \| blob \| blame \| history
yt_dlp/extractor/generic.py		patch \| blob \| blame \| history
yt_dlp/extractor/gfycat.py		patch \| blob \| blame \| history
yt_dlp/extractor/glomex.py		patch \| blob \| blame \| history
yt_dlp/extractor/googledrive.py		patch \| blob \| blame \| history
yt_dlp/extractor/heise.py		patch \| blob \| blame \| history
yt_dlp/extractor/huffpost.py		patch \| blob \| blame \| history
yt_dlp/extractor/indavideo.py		patch \| blob \| blame \| history
yt_dlp/extractor/instagram.py		patch \| blob \| blame \| history
yt_dlp/extractor/ivi.py		patch \| blob \| blame \| history
yt_dlp/extractor/joj.py		patch \| blob \| blame \| history
yt_dlp/extractor/jwplatform.py		patch \| blob \| blame \| history
yt_dlp/extractor/kaltura.py		patch \| blob \| blame \| history
yt_dlp/extractor/kinja.py		patch \| blob \| blame \| history
yt_dlp/extractor/libsyn.py		patch \| blob \| blame \| history
yt_dlp/extractor/limelight.py		patch \| blob \| blame \| history
yt_dlp/extractor/livestream.py		patch \| blob \| blame \| history
yt_dlp/extractor/mainstreaming.py		patch \| blob \| blame \| history
yt_dlp/extractor/mangomolo.py		patch \| blob \| blame \| history
yt_dlp/extractor/medialaan.py		patch \| blob \| blame \| history
yt_dlp/extractor/mediaset.py		patch \| blob \| blame \| history
yt_dlp/extractor/mediasite.py		patch \| blob \| blame \| history
yt_dlp/extractor/megaphone.py		patch \| blob \| blame \| history
yt_dlp/extractor/megatvcom.py		patch \| blob \| blame \| history
yt_dlp/extractor/mlb.py		patch \| blob \| blame \| history
yt_dlp/extractor/mofosex.py		patch \| blob \| blame \| history
yt_dlp/extractor/mtv.py		patch \| blob \| blame \| history
yt_dlp/extractor/myvi.py		patch \| blob \| blame \| history
yt_dlp/extractor/nbc.py		patch \| blob \| blame \| history
yt_dlp/extractor/nexx.py		patch \| blob \| blame \| history
yt_dlp/extractor/nytimes.py		patch \| blob \| blame \| history
yt_dlp/extractor/odnoklassniki.py		patch \| blob \| blame \| history
yt_dlp/extractor/onionstudios.py		patch \| blob \| blame \| history
yt_dlp/extractor/ooyala.py		patch \| blob \| blame \| history
yt_dlp/extractor/panopto.py		patch \| blob \| blame \| history
yt_dlp/extractor/peertube.py		patch \| blob \| blame \| history
yt_dlp/extractor/periscope.py		patch \| blob \| blame \| history
yt_dlp/extractor/piksel.py		patch \| blob \| blame \| history
yt_dlp/extractor/pladform.py		patch \| blob \| blame \| history
yt_dlp/extractor/playwire.py		patch \| blob \| blame \| history
yt_dlp/extractor/pornhub.py		patch \| blob \| blame \| history
yt_dlp/extractor/rcs.py		patch \| blob \| blame \| history
yt_dlp/extractor/redtube.py		patch \| blob \| blame \| history
yt_dlp/extractor/rtlnl.py		patch \| blob \| blame \| history
yt_dlp/extractor/rumble.py		patch \| blob \| blame \| history
yt_dlp/extractor/rutube.py		patch \| blob \| blame \| history
yt_dlp/extractor/rutv.py		patch \| blob \| blame \| history
yt_dlp/extractor/ruutu.py		patch \| blob \| blame \| history
yt_dlp/extractor/sbs.py		patch \| blob \| blame \| history
yt_dlp/extractor/senategov.py		patch \| blob \| blame \| history
yt_dlp/extractor/sendtonews.py		patch \| blob \| blame \| history
yt_dlp/extractor/seznamzpravy.py		patch \| blob \| blame \| history
yt_dlp/extractor/sharevideos.py	[new file with mode: 0644]	patch \| blob
yt_dlp/extractor/simplecast.py		patch \| blob \| blame \| history
yt_dlp/extractor/soundcloud.py		patch \| blob \| blame \| history
yt_dlp/extractor/spankwire.py		patch \| blob \| blame \| history
yt_dlp/extractor/sportbox.py		patch \| blob \| blame \| history
yt_dlp/extractor/spotify.py		patch \| blob \| blame \| history
yt_dlp/extractor/springboardplatform.py		patch \| blob \| blame \| history
yt_dlp/extractor/streamable.py		patch \| blob \| blame \| history
yt_dlp/extractor/substack.py		patch \| blob \| blame \| history
yt_dlp/extractor/svt.py		patch \| blob \| blame \| history
yt_dlp/extractor/teachable.py		patch \| blob \| blame \| history
yt_dlp/extractor/ted.py		patch \| blob \| blame \| history
yt_dlp/extractor/theplatform.py		patch \| blob \| blame \| history
yt_dlp/extractor/threeqsdn.py		patch \| blob \| blame \| history
yt_dlp/extractor/tiktok.py		patch \| blob \| blame \| history
yt_dlp/extractor/tnaflix.py		patch \| blob \| blame \| history
yt_dlp/extractor/tube8.py		patch \| blob \| blame \| history
yt_dlp/extractor/tunein.py		patch \| blob \| blame \| history
yt_dlp/extractor/tvc.py		patch \| blob \| blame \| history
yt_dlp/extractor/tvigle.py		patch \| blob \| blame \| history
yt_dlp/extractor/tvopengr.py		patch \| blob \| blame \| history
yt_dlp/extractor/tvp.py		patch \| blob \| blame \| history
yt_dlp/extractor/twentymin.py		patch \| blob \| blame \| history
yt_dlp/extractor/udn.py		patch \| blob \| blame \| history
yt_dlp/extractor/ustream.py		patch \| blob \| blame \| history
yt_dlp/extractor/vbox7.py		patch \| blob \| blame \| history
yt_dlp/extractor/vevo.py		patch \| blob \| blame \| history
yt_dlp/extractor/vice.py		patch \| blob \| blame \| history
yt_dlp/extractor/viddler.py		patch \| blob \| blame \| history
yt_dlp/extractor/videa.py		patch \| blob \| blame \| history
yt_dlp/extractor/videomore.py		patch \| blob \| blame \| history
yt_dlp/extractor/videopress.py		patch \| blob \| blame \| history
yt_dlp/extractor/viewlift.py		patch \| blob \| blame \| history
yt_dlp/extractor/vimeo.py		patch \| blob \| blame \| history
yt_dlp/extractor/vine.py		patch \| blob \| blame \| history
yt_dlp/extractor/viqeo.py		patch \| blob \| blame \| history
yt_dlp/extractor/vk.py		patch \| blob \| blame \| history
yt_dlp/extractor/vodplatform.py		patch \| blob \| blame \| history
yt_dlp/extractor/voxmedia.py		patch \| blob \| blame \| history
yt_dlp/extractor/vshare.py		patch \| blob \| blame \| history
yt_dlp/extractor/vzaar.py		patch \| blob \| blame \| history
yt_dlp/extractor/washingtonpost.py		patch \| blob \| blame \| history
yt_dlp/extractor/webcaster.py		patch \| blob \| blame \| history
yt_dlp/extractor/wimtv.py		patch \| blob \| blame \| history
yt_dlp/extractor/wistia.py		patch \| blob \| blame \| history
yt_dlp/extractor/xfileshare.py		patch \| blob \| blame \| history
yt_dlp/extractor/xhamster.py		patch \| blob \| blame \| history
yt_dlp/extractor/yahoo.py		patch \| blob \| blame \| history
yt_dlp/extractor/yapfiles.py		patch \| blob \| blame \| history
yt_dlp/extractor/youporn.py		patch \| blob \| blame \| history
yt_dlp/extractor/youtube.py		patch \| blob \| blame \| history
yt_dlp/extractor/zapiks.py		patch \| blob \| blame \| history
yt_dlp/extractor/zype.py		patch \| blob \| blame \| history