[ie/crunchyroll] Fix stream extraction (#10005)

[yt-dlp.git] / yt_dlp / extractor / theplatform.py
diff --git a/yt_dlp/extractor/theplatform.py b/yt_dlp/extractor/theplatform.py

index bf7efc0134da41f8fad6a999b68741dc98d5f89e..9160f5ec6b8e95932c86383f09dcee9aa784fba2 100644 (file)
--- a/yt_dlp/extractor/theplatform.py
+++ b/yt_dlp/extractor/theplatform.py
@@ -7,19 +7,23 @@
  
  from .once import OnceIE
  from .adobepass import AdobePassIE
+from ..networking import Request
  from ..utils import (
      determine_ext,
      ExtractorError,
      float_or_none,
      int_or_none,
      parse_qs,
-    sanitized_Request,
      unsmuggle_url,
      update_url_query,
      xpath_with_ns,
      mimetype2ext,
      find_xpath_attr,
+    traverse_obj,
+    update_url,
+    urlhandle_detect_ext,
  )
+from ..networking import HEADRequest
  
  default_ns = 'http://www.w3.org/2005/SMIL21/Language'
  _x = lambda p: xpath_with_ns(p, {'smil': default_ns})
@@ -45,7 +49,7 @@ def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL d
                      raise ExtractorError(
                          error_element.attrib['abstract'], expected=True)
  
-        smil_formats = self._parse_smil_formats(
+        smil_formats, subtitles = self._parse_smil_formats_and_subtitles(
              meta, smil_url, video_id, namespace=default_ns,
              # the parameters are from syfy.com, other sites may use others,
              # they also work for nbc.com
@@ -65,8 +69,6 @@ def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL d
  
                  formats.append(_format)
  
-        subtitles = self._parse_smil_subtitles(meta, default_ns)
-
          return formats, subtitles
  
      def _download_theplatform_metadata(self, path, video_id):
@@ -102,6 +104,10 @@ def _add_chapter(start_time, end_time):
                  _add_chapter(chapter.get('startTime'), chapter.get('endTime'))
              _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration)
  
+        def extract_site_specific_field(field):
+            # A number of sites use custom-prefixed keys (e.g. 'cbc$seasonNumber'), so match on the '$<field>' key suffix
+            return traverse_obj(info, lambda k, v: v and k.endswith(f'${field}'), get_all=False)
+
          return {
              'title': info['title'],
              'subtitles': subtitles,
@@ -111,6 +117,14 @@ def _add_chapter(start_time, end_time):
              'timestamp': int_or_none(info.get('pubDate'), 1000) or None,
              'uploader': info.get('billingCode'),
              'chapters': chapters,
+            'creator': traverse_obj(info, ('author', {str})) or None,
+            'categories': traverse_obj(info, (
+                'categories', lambda _, v: v.get('label') in ('category', None), 'name', {str})) or None,
+            'tags': traverse_obj(info, ('keywords', {lambda x: re.split(r'[;,]\s?', x) if x else None})),
+            'location': extract_site_specific_field('region'),
+            'series': extract_site_specific_field('show'),
+            'season_number': int_or_none(extract_site_specific_field('seasonNumber')),
+            'media_type': extract_site_specific_field('programmingType') or extract_site_specific_field('type'),
          }
  
      def _extract_theplatform_metadata(self, path, video_id):
@@ -123,6 +137,13 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
          (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
             (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)?|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
           |theplatform:)(?P<id>[^/\?&]+)'''
+    _EMBED_REGEX = [
+        r'''(?x)
+            <meta\s+
+                property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+
+                content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2''',
+        r'(?s)<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//player\.theplatform\.com/p/.+?)\1'
+    ]
  
      _TESTS = [{
          # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/
@@ -157,7 +178,8 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
          'params': {
              # rtmp download
              'skip_download': True,
-        }
+        },
+        'skip': 'CNet no longer uses ThePlatform',
      }, {
          'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD',
          'info_dict': {
@@ -166,7 +188,8 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
              'description': 'md5:644ad9188d655b742f942bf2e06b002d',
              'title': 'HIGHLIGHTS: USA bag first ever series Cup win',
              'uploader': 'EGSM',
-        }
+        },
+        'skip': 'Dead link',
      }, {
          'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7',
          'only_matching': True,
@@ -184,6 +207,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
              'upload_date': '20150701',
              'uploader': 'NBCU-NEWS',
          },
+        'skip': 'Error: Player PID "nbcNewsOffsite" is disabled',
      }, {
          # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1
          # geo-restricted (US), HLS encrypted with AES-128
@@ -192,22 +216,11 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
      }]
  
      @classmethod
-    def _extract_urls(cls, webpage):
-        m = re.search(
-            r'''(?x)
-                    <meta\s+
-                        property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+
-                        content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2
-            ''', webpage)
-        if m:
-            return [m.group('url')]
-
+    def _extract_embed_urls(cls, url, webpage):
          # Are whitespaces ignored in URLs?
          # https://github.com/ytdl-org/youtube-dl/issues/12044
-        matches = re.findall(
-            r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage)
-        if matches:
-            return [re.sub(r'\s', '', list(zip(*matches))[1][0])]
+        for embed_url in super()._extract_embed_urls(url, webpage):
+            yield re.sub(r'\s', '', embed_url)
  
      @staticmethod
      def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False):
@@ -274,7 +287,7 @@ def _real_extract(self, url):
              source_url = smuggled_data.get('source_url')
              if source_url:
                  headers['Referer'] = source_url
-            request = sanitized_Request(url, headers=headers)
+            request = Request(url, headers=headers)
              webpage = self._download_webpage(request, video_id)
              smil_url = self._search_regex(
                  r'<link[^>]+href=(["\'])(?P<url>.+?)\1[^>]+type=["\']application/smil\+xml',
@@ -300,7 +313,17 @@ def _real_extract(self, url):
              smil_url = self._sign_url(smil_url, sig['key'], sig['secret'])
  
          formats, subtitles = self._extract_theplatform_smil(smil_url, video_id)
-        self._sort_formats(formats)
+
+        # With some sites, manifest URL must be forced to extract HLS formats
+        if not traverse_obj(formats, lambda _, v: v['format_id'].startswith('hls')):
+            m3u8_url = update_url(url, query='mbr=true&manifest=m3u', fragment=None)
+            urlh = self._request_webpage(
+                HEADRequest(m3u8_url), video_id, 'Checking for HLS formats', 'No HLS formats found', fatal=False)
+            if urlh and urlhandle_detect_ext(urlh) == 'm3u8':
+                m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
+                    m3u8_url, video_id, m3u8_id='hls', fatal=False)
+                formats.extend(m3u8_fmts)
+                self._merge_subtitles(m3u8_subs, target=subtitles)
  
          ret = self._extract_theplatform_metadata(path, video_id)
          combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles)
@@ -370,8 +393,6 @@ def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custo
                  formats.extend(cur_formats)
                  subtitles = self._merge_subtitles(subtitles, cur_subtitles)
  
-        self._sort_formats(formats)
-
          thumbnails = [{
              'url': thumbnail['plfile$url'],
              'width': int_or_none(thumbnail.get('plfile$width')),