[ie/orf:on] Improve extraction (#9677)

[yt-dlp.git] / yt_dlp / extractor / mtv.py
diff --git a/yt_dlp/extractor/mtv.py b/yt_dlp/extractor/mtv.py

index 3ef851e0bb91973a9020934277b02b2ce1593463..404e431bc6a6b8342c9c815e39855eedc98a7561 100644 (file)
--- a/yt_dlp/extractor/mtv.py
+++ b/yt_dlp/extractor/mtv.py
@@ -1,19 +1,17 @@
  import re
+import xml.etree.ElementTree
  
  from .common import InfoExtractor
-from ..compat import (
-    compat_str,
-)
+from ..compat import compat_str
+from ..networking import HEADRequest, Request
  from ..utils import (
      ExtractorError,
+    RegexNotFoundError,
      find_xpath_attr,
      fix_xml_ampersands,
      float_or_none,
-    HEADRequest,
      int_or_none,
      join_nonempty,
-    RegexNotFoundError,
-    sanitized_Request,
      strip_or_none,
      timeconvert,
      try_get,
@@ -53,15 +51,15 @@ def _get_thumbnail_url(self, uri, itemdoc):
  
      def _extract_mobile_video_formats(self, mtvn_id):
          webpage_url = self._MOBILE_TEMPLATE % mtvn_id
-        req = sanitized_Request(webpage_url)
+        req = Request(webpage_url)
          # Otherwise we get a webpage that would execute some javascript
-        req.add_header('User-Agent', 'curl/7')
+        req.headers['User-Agent'] = 'curl/7'
          webpage = self._download_webpage(req, mtvn_id,
                                           'Downloading mobile page')
          metrics_url = unescapeHTML(self._search_regex(r'<a href="(http://metrics.+?)"', webpage, 'url'))
          req = HEADRequest(metrics_url)
          response = self._request_webpage(req, mtvn_id, 'Resolving url')
-        url = response.geturl()
+        url = response.url
          # Transform the url to get the best quality:
          url = re.sub(r'.+pxE=mp4', 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639+_pxE=mp4', url, 1)
          return [{'url': url, 'ext': 'mp4'}]
@@ -104,8 +102,6 @@ def _extract_video_formats(self, mdoc, mtvn_id, video_id):
                      }])
                  except (KeyError, TypeError):
                      raise ExtractorError('Invalid rendition field.')
-        if formats:
-            self._sort_formats(formats)
          return formats
  
      def _extract_subtitles(self, mdoc, mtvn_id):
@@ -142,7 +138,7 @@ def _get_video_info(self, itemdoc, use_hls=True):
          mediagen_doc = self._download_xml(
              mediagen_url, video_id, 'Downloading video urls', fatal=False)
  
-        if mediagen_doc is False:
+        if not isinstance(mediagen_doc, xml.etree.ElementTree.Element):
              return None
  
          item = mediagen_doc.find('./video/item')
@@ -204,8 +200,6 @@ def _get_video_info(self, itemdoc, use_hls=True):
          if not formats:
              return None
  
-        self._sort_formats(formats)
-
          return {
              'title': title,
              'formats': formats,
@@ -333,6 +327,7 @@ def _real_extract(self, url):
  class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
      IE_NAME = 'mtvservices:embedded'
      _VALID_URL = r'https?://media\.mtvnservices\.com/embed/(?P<mgid>.+?)(\?|/|$)'
+    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1']
  
      _TEST = {
          # From http://www.thewrap.com/peter-dinklage-sums-up-game-of-thrones-in-45-seconds-video/
@@ -348,13 +343,6 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
          },
      }
  
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1', webpage)
-        if mobj:
-            return mobj.group('url')
-
      def _get_feed_url(self, uri, url=None):
          video_id = self._id_from_uri(uri)
          config = self._download_json(
@@ -463,6 +451,7 @@ def _real_extract(self, url):
  
  
  class MTVDEIE(MTVServicesInfoExtractor):
+    _WORKING = False
      IE_NAME = 'mtv.de'
      _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:musik/videoclips|folgen|news)/(?P<id>[0-9a-z]+)'
      _TESTS = [{
@@ -544,7 +533,7 @@ def _get_feed_query(self, uri):
          }
  
  
-class MTVItaliaProgrammaIE(MTVItaliaIE):
+class MTVItaliaProgrammaIE(MTVItaliaIE):  # XXX: Do not subclass from concrete IE
      IE_NAME = 'mtv.it:programma'
      _VALID_URL = r'https?://(?:www\.)?mtv\.it/(?:programmi|playlist)/(?P<id>[0-9a-z]+)'
      _TESTS = [{