jfr.im git - yt-dlp.git/blobdiff - yt_dlp/extractor/mtv.py
[extractors] Use new framework for existing embeds (#4307)
[yt-dlp.git] / yt_dlp / extractor / mtv.py
index 4cf178b04a9296508e7125f156b0d8c01a8e125f..10cd304eb0b011ab02376e2fdd424e48441e9ed5 100644 (file)
@@ -1,13 +1,7 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import re
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_str,
-    compat_xpath,
-)
+from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     find_xpath_attr,
@@ -15,6 +9,7 @@
     float_or_none,
     HEADRequest,
     int_or_none,
+    join_nonempty,
     RegexNotFoundError,
     sanitized_Request,
     strip_or_none,
@@ -44,7 +39,7 @@ def _remove_template_parameter(url):
         # Remove the templates, like &device={device}
         return re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', url)
 
-    def _get_feed_url(self, uri):
+    def _get_feed_url(self, uri, url=None):
         return self._FEED_URL
 
     def _get_thumbnail_url(self, uri, itemdoc):
@@ -99,9 +94,9 @@ def _extract_video_formats(self, mdoc, mtvn_id, video_id):
                     formats.extend([{
                         'ext': 'flv' if rtmp_video_url.startswith('rtmp') else ext,
                         'url': rtmp_video_url,
-                        'format_id': '-'.join(filter(None, [
+                        'format_id': join_nonempty(
                             'rtmp' if rtmp_video_url.startswith('rtmp') else None,
-                            rendition.get('bitrate')])),
+                            rendition.get('bitrate')),
                         'width': int(rendition.get('width')),
                         'height': int(rendition.get('height')),
                     }])
@@ -166,9 +161,9 @@ def _get_video_info(self, itemdoc, use_hls=True):
                 itemdoc, './/{http://search.yahoo.com/mrss/}category',
                 'scheme', 'urn:mtvn:video_title')
         if title_el is None:
-            title_el = itemdoc.find(compat_xpath('.//{http://search.yahoo.com/mrss/}title'))
+            title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title')
         if title_el is None:
-            title_el = itemdoc.find(compat_xpath('.//title'))
+            title_el = itemdoc.find('.//title')
             if title_el.text is None:
                 title_el = None
 
@@ -229,9 +224,9 @@ def _get_feed_query(self, uri):
             data['lang'] = self._LANG
         return data
 
-    def _get_videos_info(self, uri, use_hls=True):
+    def _get_videos_info(self, uri, use_hls=True, url=None):
         video_id = self._id_from_uri(uri)
-        feed_url = self._get_feed_url(uri)
+        feed_url = self._get_feed_url(uri, url)
         info_url = update_url_query(feed_url, self._get_feed_query(uri))
         return self._get_videos_info_from_url(info_url, video_id, use_hls)
 
@@ -249,6 +244,7 @@ def _get_videos_info_from_url(self, url, video_id, use_hls=True):
             if info:
                 entries.append(info)
 
+        # TODO: should be multi-video
         return self.playlist_result(
             entries, playlist_title=title, playlist_description=description)
 
@@ -275,7 +271,9 @@ def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None):
 
     @staticmethod
     def _extract_child_with_type(parent, t):
-        return next(c for c in parent['children'] if c.get('type') == t)
+        for c in parent['children']:
+            if c.get('type') == t:
+                return c
 
     def _extract_mgid(self, webpage):
         try:
@@ -306,8 +304,19 @@ def _extract_mgid(self, webpage):
             data = self._parse_json(self._search_regex(
                 r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None)
             main_container = self._extract_child_with_type(data, 'MainContainer')
-            video_player = self._extract_child_with_type(main_container, 'VideoPlayer')
-            mgid = video_player['props']['media']['video']['config']['uri']
+            ab_testing = self._extract_child_with_type(main_container, 'ABTesting')
+            video_player = self._extract_child_with_type(ab_testing or main_container, 'VideoPlayer')
+            if video_player:
+                mgid = try_get(video_player, lambda x: x['props']['media']['video']['config']['uri'])
+            else:
+                flex_wrapper = self._extract_child_with_type(ab_testing or main_container, 'FlexWrapper')
+                auth_suite_wrapper = self._extract_child_with_type(flex_wrapper, 'AuthSuiteWrapper')
+                player = self._extract_child_with_type(auth_suite_wrapper or flex_wrapper, 'Player')
+                if player:
+                    mgid = try_get(player, lambda x: x['props']['videoDetail']['mgid'])
+
+        if not mgid:
+            raise ExtractorError('Could not extract mgid')
 
         return mgid
 
@@ -315,13 +324,14 @@ def _real_extract(self, url):
         title = url_basename(url)
         webpage = self._download_webpage(url, title)
         mgid = self._extract_mgid(webpage)
-        videos_info = self._get_videos_info(mgid)
+        videos_info = self._get_videos_info(mgid, url=url)
         return videos_info
 
 
 class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
     IE_NAME = 'mtvservices:embedded'
     _VALID_URL = r'https?://media\.mtvnservices\.com/embed/(?P<mgid>.+?)(\?|/|$)'
+    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1']
 
     _TEST = {
         # From http://www.thewrap.com/peter-dinklage-sums-up-game-of-thrones-in-45-seconds-video/
@@ -337,21 +347,14 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
         },
     }
 
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1', webpage)
-        if mobj:
-            return mobj.group('url')
-
-    def _get_feed_url(self, uri):
+    def _get_feed_url(self, uri, url=None):
         video_id = self._id_from_uri(uri)
         config = self._download_json(
             'http://media.mtvnservices.com/pmt/e1/access/index.html?uri=%s&configtype=edge' % uri, video_id)
         return self._remove_template_parameter(config['feedWithQueryParams'])
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
+        mobj = self._match_valid_url(url)
         mgid = mobj.group('mgid')
         return self._get_videos_info(mgid)
 
@@ -433,7 +436,7 @@ def _get_thumbnail_url(self, uri, itemdoc):
         return 'http://mtv.mtvnimages.com/uri/' + uri
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
+        mobj = self._match_valid_url(url)
         video_id = mobj.group('videoid')
         uri = mobj.groupdict().get('mgid')
         if uri is None: