[extractor] Framework for embed detection (#4307)

[yt-dlp.git] / yt_dlp / extractor / brightcove.py
diff --git a/yt_dlp/extractor/brightcove.py b/yt_dlp/extractor/brightcove.py

index 9f643a9e7567b5b38e475f97c6601a5c85f0b1f2..99a216fb49818dfd74010a3bf23551201bac1b13 100644 (file)
--- a/yt_dlp/extractor/brightcove.py
+++ b/yt_dlp/extractor/brightcove.py
@@ -1,9 +1,7 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
  import base64
  import re
  import struct
+import xml.etree.ElementTree
  
  from .adobepass import AdobePassIE
  from .common import InfoExtractor
@@ -11,12 +9,11 @@
      compat_etree_fromstring,
      compat_HTTPError,
      compat_parse_qs,
-    compat_urllib_parse_urlparse,
      compat_urlparse,
-    compat_xml_parse_error,
  )
  from ..utils import (
      clean_html,
+    dict_get,
      extract_attributes,
      ExtractorError,
      find_xpath_attr,
@@ -26,6 +23,7 @@
      js_to_json,
      mimetype2ext,
      parse_iso8601,
+    parse_qs,
      smuggle_url,
      str_or_none,
      try_get,
@@ -165,7 +163,7 @@ def _build_brightcove_url(cls, object_str):
  
          try:
              object_doc = compat_etree_fromstring(object_str.encode('utf-8'))
-        except compat_xml_parse_error:
+        except xml.etree.ElementTree.ParseError:
              return
  
          fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
@@ -177,7 +175,7 @@ def _build_brightcove_url(cls, object_str):
              flashvars = {}
  
          data_url = object_doc.attrib.get('data', '')
-        data_url_params = compat_parse_qs(compat_urllib_parse_urlparse(data_url).query)
+        data_url_params = parse_qs(data_url)
  
          def find_param(name):
              if name in flashvars:
@@ -290,7 +288,7 @@ def _real_extract(self, url):
          url = re.sub(r'(?<=[?&])(videoI(d|D)|idVideo|bctid)', '%40videoPlayer', url)
          # Change bckey (used by bcove.me urls) to playerKey
          url = re.sub(r'(?<=[?&])bckey', 'playerKey', url)
-        mobj = re.match(self._VALID_URL, url)
+        mobj = self._match_valid_url(url)
          query_str = mobj.group('query')
          query = compat_urlparse.parse_qs(query_str)
  
@@ -404,11 +402,11 @@ class BrightcoveNewIE(AdobePassIE):
  
      @staticmethod
      def _extract_url(ie, webpage):
-        urls = BrightcoveNewIE._extract_urls(ie, webpage)
+        urls = BrightcoveNewIE._extract_brightcove_urls(ie, webpage)
          return urls[0] if urls else None
  
      @staticmethod
-    def _extract_urls(ie, webpage):
+    def _extract_brightcove_urls(ie, webpage):
          # Reference:
          # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe
          # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag
@@ -471,29 +469,23 @@ def _extract_urls(ie, webpage):
      def _parse_brightcove_metadata(self, json_data, video_id, headers={}):
          title = json_data['name'].strip()
  
-        num_drm_sources = 0
-        formats = []
+        formats, subtitles = [], {}
          sources = json_data.get('sources') or []
          for source in sources:
              container = source.get('container')
              ext = mimetype2ext(source.get('type'))
              src = source.get('src')
-            skip_unplayable = not self.get_param('allow_unplayable_formats')
-            # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
-            if skip_unplayable and (container == 'WVM' or source.get('key_systems')):
-                num_drm_sources += 1
-                continue
-            elif ext == 'ism' and skip_unplayable:
-                continue
-            elif ext == 'm3u8' or container == 'M2TS':
+            if ext == 'm3u8' or container == 'M2TS':
                  if not src:
                      continue
-                formats.extend(self._extract_m3u8_formats(
-                    src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                    src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
+                subtitles = self._merge_subtitles(subtitles, subs)
              elif ext == 'mpd':
                  if not src:
                      continue
-                formats.extend(self._extract_mpd_formats(src, video_id, 'dash', fatal=False))
+                fmts, subs = self._extract_mpd_formats_and_subtitles(src, video_id, 'dash', fatal=False)
+                subtitles = self._merge_subtitles(subtitles, subs)
              else:
                  streaming_src = source.get('streaming_src')
                  stream_name, app_name = source.get('stream_name'), source.get('app_name')
@@ -539,7 +531,13 @@ def build_format_id(kind):
                          'play_path': stream_name,
                          'format_id': build_format_id('rtmp'),
                      })
-                formats.append(f)
+                fmts = [f]
+
+            # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
+            if container == 'WVM' or source.get('key_systems') or ext == 'ism':
+                for f in fmts:
+                    f['has_drm'] = True
+            formats.extend(fmts)
  
          if not formats:
              errors = json_data.get('errors')
@@ -547,16 +545,12 @@ def build_format_id(kind):
                  error = errors[0]
                  self.raise_no_formats(
                      error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)
-            elif (not self.get_param('allow_unplayable_formats')
-                    and sources and num_drm_sources == len(sources)):
-                raise ExtractorError('This video is DRM protected.', expected=True)
  
          self._sort_formats(formats)
  
          for f in formats:
              f.setdefault('http_headers', {}).update(headers)
  
-        subtitles = {}
          for text_track in json_data.get('text_tracks', []):
              if text_track.get('kind') != 'captions':
                  continue
@@ -574,11 +568,19 @@ def build_format_id(kind):
          if duration is not None and duration <= 0:
              is_live = True
  
+        common_res = [(160, 90), (320, 180), (480, 720), (640, 360), (768, 432), (1024, 576), (1280, 720), (1366, 768), (1920, 1080)]
+        thumb_base_url = dict_get(json_data, ('poster', 'thumbnail'))
+        thumbnails = [{
+            'url': re.sub(r'\d+x\d+', f'{w}x{h}', thumb_base_url),
+            'width': w,
+            'height': h,
+        } for w, h in common_res] if thumb_base_url else None
+
          return {
              'id': video_id,
-            'title': self._live_title(title) if is_live else title,
+            'title': title,
              'description': clean_html(json_data.get('description')),
-            'thumbnail': json_data.get('thumbnail') or json_data.get('poster'),
+            'thumbnails': thumbnails,
              'duration': duration,
              'timestamp': parse_iso8601(json_data.get('published_at')),
              'uploader_id': json_data.get('account_id'),
@@ -595,12 +597,12 @@ def _real_extract(self, url):
              'ip_blocks': smuggled_data.get('geo_ip_blocks'),
          })
  
-        account_id, player_id, embed, content_type, video_id = re.match(self._VALID_URL, url).groups()
+        account_id, player_id, embed, content_type, video_id = self._match_valid_url(url).groups()
  
          policy_key_id = '%s_%s' % (account_id, player_id)
-        policy_key = self._downloader.cache.load('brightcove', policy_key_id)
+        policy_key = self.cache.load('brightcove', policy_key_id)
          policy_key_extracted = False
-        store_pk = lambda x: self._downloader.cache.store('brightcove', policy_key_id, x)
+        store_pk = lambda x: self.cache.store('brightcove', policy_key_id, x)
  
          def extract_policy_key():
              base_url = 'http://players.brightcove.net/%s/%s_%s/' % (account_id, player_id, embed)