]> jfr.im git - yt-dlp.git/blobdiff - yt_dlp/extractor/theplatform.py
[extractors] Use new framework for existing embeds (#4307)
[yt-dlp.git] / yt_dlp / extractor / theplatform.py
index c56b708b8208b5a08c0c87755c9a8ad3741c1f36..c8026d2941340b67c94ea3c2393dc99c2aa87594 100644 (file)
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import re
 import time
 import hmac
 
 from .once import OnceIE
 from .adobepass import AdobePassIE
-from ..compat import (
-    compat_parse_qs,
-    compat_urllib_parse_urlparse,
-)
 from ..utils import (
     determine_ext,
     ExtractorError,
     float_or_none,
     int_or_none,
+    parse_qs,
     sanitized_Request,
     unsmuggle_url,
     update_url_query,
@@ -129,6 +123,13 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
         (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
            (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)?|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
          |theplatform:)(?P<id>[^/\?&]+)'''
+    _EMBED_REGEX = [
+        r'''(?x)
+            <meta\s+
+                property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+
+                content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2''',
+        r'(?s)<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//player\.theplatform\.com/p/.+?)\1'
+    ]
 
     _TESTS = [{
         # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/
@@ -198,22 +199,11 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
     }]
 
     @classmethod
-    def _extract_urls(cls, webpage):
-        m = re.search(
-            r'''(?x)
-                    <meta\s+
-                        property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+
-                        content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2
-            ''', webpage)
-        if m:
-            return [m.group('url')]
-
+    def _extract_embed_urls(cls, url, webpage):
         # Are whitespaces ignored in URLs?
         # https://github.com/ytdl-org/youtube-dl/issues/12044
-        matches = re.findall(
-            r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage)
-        if matches:
-            return [re.sub(r'\s', '', list(zip(*matches))[1][0])]
+        for embed_url in super()._extract_embed_urls(url, webpage):
+            yield re.sub(r'\s', '', embed_url)
 
     @staticmethod
     def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False):
@@ -250,7 +240,7 @@ def _real_extract(self, url):
             path += mobj.group('media')
         path += video_id
 
-        qs_dict = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        qs_dict = parse_qs(url)
         if 'guid' in qs_dict:
             webpage = self._download_webpage(url, video_id)
             scripts = re.findall(r'<script[^>]+src="([^"]+)"', webpage)
@@ -359,7 +349,7 @@ def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custo
             if first_video_id is None:
                 first_video_id = cur_video_id
                 duration = float_or_none(item.get('plfile$duration'))
-            file_asset_types = item.get('plfile$assetTypes') or compat_parse_qs(compat_urllib_parse_urlparse(smil_url).query)['assetTypes']
+            file_asset_types = item.get('plfile$assetTypes') or parse_qs(smil_url)['assetTypes']
             for asset_type in file_asset_types:
                 if asset_type in asset_types:
                     continue