[ie/vidly] Add extractor (#8612)

[yt-dlp.git] / yt_dlp / extractor / mtv.py
diff --git a/yt_dlp/extractor/mtv.py b/yt_dlp/extractor/mtv.py

index 510f1439e5a7e618ff619a9a770db892f6edae03..e192453c728bb3368d527019fe11453af0e111e0 100644 (file)
--- a/yt_dlp/extractor/mtv.py
+++ b/yt_dlp/extractor/mtv.py
@@ -1,22 +1,17 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
  import re
+import xml.etree.ElementTree
  
  from .common import InfoExtractor
-from ..compat import (
-    compat_str,
-    compat_xpath,
-)
+from ..compat import compat_str
+from ..networking import HEADRequest, Request
  from ..utils import (
      ExtractorError,
+    RegexNotFoundError,
      find_xpath_attr,
      fix_xml_ampersands,
      float_or_none,
-    HEADRequest,
      int_or_none,
-    RegexNotFoundError,
-    sanitized_Request,
+    join_nonempty,
      strip_or_none,
      timeconvert,
      try_get,
@@ -44,7 +39,7 @@ def _remove_template_parameter(url):
          # Remove the templates, like &device={device}
          return re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', url)
  
-    def _get_feed_url(self, uri):
+    def _get_feed_url(self, uri, url=None):
          return self._FEED_URL
  
      def _get_thumbnail_url(self, uri, itemdoc):
@@ -56,15 +51,15 @@ def _get_thumbnail_url(self, uri, itemdoc):
  
      def _extract_mobile_video_formats(self, mtvn_id):
          webpage_url = self._MOBILE_TEMPLATE % mtvn_id
-        req = sanitized_Request(webpage_url)
+        req = Request(webpage_url)
          # Otherwise we get a webpage that would execute some javascript
-        req.add_header('User-Agent', 'curl/7')
+        req.headers['User-Agent'] = 'curl/7'
          webpage = self._download_webpage(req, mtvn_id,
                                           'Downloading mobile page')
          metrics_url = unescapeHTML(self._search_regex(r'<a href="(http://metrics.+?)"', webpage, 'url'))
          req = HEADRequest(metrics_url)
          response = self._request_webpage(req, mtvn_id, 'Resolving url')
-        url = response.geturl()
+        url = response.url
          # Transform the url to get the best quality:
          url = re.sub(r'.+pxE=mp4', 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639+_pxE=mp4', url, 1)
          return [{'url': url, 'ext': 'mp4'}]
@@ -99,16 +94,14 @@ def _extract_video_formats(self, mdoc, mtvn_id, video_id):
                      formats.extend([{
                          'ext': 'flv' if rtmp_video_url.startswith('rtmp') else ext,
                          'url': rtmp_video_url,
-                        'format_id': '-'.join(filter(None, [
+                        'format_id': join_nonempty(
                              'rtmp' if rtmp_video_url.startswith('rtmp') else None,
-                            rendition.get('bitrate')])),
+                            rendition.get('bitrate')),
                          'width': int(rendition.get('width')),
                          'height': int(rendition.get('height')),
                      }])
                  except (KeyError, TypeError):
                      raise ExtractorError('Invalid rendition field.')
-        if formats:
-            self._sort_formats(formats)
          return formats
  
      def _extract_subtitles(self, mdoc, mtvn_id):
@@ -145,7 +138,7 @@ def _get_video_info(self, itemdoc, use_hls=True):
          mediagen_doc = self._download_xml(
              mediagen_url, video_id, 'Downloading video urls', fatal=False)
  
-        if mediagen_doc is False:
+        if not isinstance(mediagen_doc, xml.etree.ElementTree.Element):
              return None
  
          item = mediagen_doc.find('./video/item')
@@ -166,9 +159,9 @@ def _get_video_info(self, itemdoc, use_hls=True):
                  itemdoc, './/{http://search.yahoo.com/mrss/}category',
                  'scheme', 'urn:mtvn:video_title')
          if title_el is None:
-            title_el = itemdoc.find(compat_xpath('.//{http://search.yahoo.com/mrss/}title'))
+            title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title')
          if title_el is None:
-            title_el = itemdoc.find(compat_xpath('.//title'))
+            title_el = itemdoc.find('.//title')
              if title_el.text is None:
                  title_el = None
  
@@ -207,8 +200,6 @@ def _get_video_info(self, itemdoc, use_hls=True):
          if not formats:
              return None
  
-        self._sort_formats(formats)
-
          return {
              'title': title,
              'formats': formats,
@@ -229,9 +220,9 @@ def _get_feed_query(self, uri):
              data['lang'] = self._LANG
          return data
  
-    def _get_videos_info(self, uri, use_hls=True):
+    def _get_videos_info(self, uri, use_hls=True, url=None):
          video_id = self._id_from_uri(uri)
-        feed_url = self._get_feed_url(uri)
+        feed_url = self._get_feed_url(uri, url)
          info_url = update_url_query(feed_url, self._get_feed_query(uri))
          return self._get_videos_info_from_url(info_url, video_id, use_hls)
  
@@ -249,6 +240,7 @@ def _get_videos_info_from_url(self, url, video_id, use_hls=True):
              if info:
                  entries.append(info)
  
+        # TODO: should be multi-video
          return self.playlist_result(
              entries, playlist_title=title, playlist_description=description)
  
@@ -310,7 +302,17 @@ def _extract_mgid(self, webpage):
              main_container = self._extract_child_with_type(data, 'MainContainer')
              ab_testing = self._extract_child_with_type(main_container, 'ABTesting')
              video_player = self._extract_child_with_type(ab_testing or main_container, 'VideoPlayer')
-            mgid = video_player['props']['media']['video']['config']['uri']
+            if video_player:
+                mgid = try_get(video_player, lambda x: x['props']['media']['video']['config']['uri'])
+            else:
+                flex_wrapper = self._extract_child_with_type(ab_testing or main_container, 'FlexWrapper')
+                auth_suite_wrapper = self._extract_child_with_type(flex_wrapper, 'AuthSuiteWrapper')
+                player = self._extract_child_with_type(auth_suite_wrapper or flex_wrapper, 'Player')
+                if player:
+                    mgid = try_get(player, lambda x: x['props']['videoDetail']['mgid'])
+
+        if not mgid:
+            raise ExtractorError('Could not extract mgid')
  
          return mgid
  
@@ -318,13 +320,14 @@ def _real_extract(self, url):
          title = url_basename(url)
          webpage = self._download_webpage(url, title)
          mgid = self._extract_mgid(webpage)
-        videos_info = self._get_videos_info(mgid)
+        videos_info = self._get_videos_info(mgid, url=url)
          return videos_info
  
  
  class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
      IE_NAME = 'mtvservices:embedded'
      _VALID_URL = r'https?://media\.mtvnservices\.com/embed/(?P<mgid>.+?)(\?|/|$)'
+    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1']
  
      _TEST = {
          # From http://www.thewrap.com/peter-dinklage-sums-up-game-of-thrones-in-45-seconds-video/
@@ -340,21 +343,14 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
          },
      }
  
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1', webpage)
-        if mobj:
-            return mobj.group('url')
-
-    def _get_feed_url(self, uri):
+    def _get_feed_url(self, uri, url=None):
          video_id = self._id_from_uri(uri)
          config = self._download_json(
              'http://media.mtvnservices.com/pmt/e1/access/index.html?uri=%s&configtype=edge' % uri, video_id)
          return self._remove_template_parameter(config['feedWithQueryParams'])
  
      def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
+        mobj = self._match_valid_url(url)
          mgid = mobj.group('mgid')
          return self._get_videos_info(mgid)
  
@@ -436,7 +432,7 @@ def _get_thumbnail_url(self, uri, itemdoc):
          return 'http://mtv.mtvnimages.com/uri/' + uri
  
      def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
+        mobj = self._match_valid_url(url)
          video_id = mobj.group('videoid')
          uri = mobj.groupdict().get('mgid')
          if uri is None:
@@ -536,7 +532,7 @@ def _get_feed_query(self, uri):
          }
  
  
-class MTVItaliaProgrammaIE(MTVItaliaIE):
+class MTVItaliaProgrammaIE(MTVItaliaIE):  # XXX: Do not subclass from concrete IE
      IE_NAME = 'mtv.it:programma'
      _VALID_URL = r'https?://(?:www\.)?mtv\.it/(?:programmi|playlist)/(?P<id>[0-9a-z]+)'
      _TESTS = [{