jfr.im git - yt-dlp.git/blobdiff - yt_dlp/extractor/bbc.py
[extractor] Simplify search extractors
[yt-dlp.git] / yt_dlp / extractor / bbc.py
index e8d000bbb513a36fb8fb0bdfe66bd1611c039e67..4e2dcd76b896e0731aa51aab357233ad8533283c 100644 (file)
@@ -10,8 +10,7 @@
 from ..compat import (
     compat_etree_Element,
     compat_HTTPError,
-    compat_parse_qs,
-    compat_urllib_parse_urlparse,
+    compat_str,
     compat_urlparse,
 )
 from ..utils import (
     js_to_json,
     parse_duration,
     parse_iso8601,
+    parse_qs,
+    strip_or_none,
     try_get,
     unescapeHTML,
+    unified_timestamp,
     url_or_none,
     urlencode_postdata,
     urljoin,
@@ -586,8 +588,8 @@ class BBCIE(BBCCoUkIE):
     _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
 
     _MEDIA_SETS = [
-        'mobile-tablet-main',
         'pc',
+        'mobile-tablet-main',
     ]
 
     _TESTS = [{
@@ -761,8 +763,17 @@ class BBCIE(BBCCoUkIE):
         'only_matching': True,
     }, {
         # custom redirection to www.bbc.com
+        # also, video with window.__INITIAL_DATA__
         'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
-        'only_matching': True,
+        'info_dict': {
+            'id': 'p02xzws1',
+            'ext': 'mp4',
+            'title': "Pluto may have 'nitrogen glaciers'",
+            'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
+            'thumbnail': r're:https?://.+/.+\.jpg',
+            'timestamp': 1437785037,
+            'upload_date': '20150725',
+        },
     }, {
         # single video article embedded with data-media-vpid
         'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
@@ -1164,12 +1175,29 @@ def parse_media(media):
                         continue
                     formats, subtitles = self._download_media_selector(item_id)
                     self._sort_formats(formats)
+                    item_desc = None
+                    blocks = try_get(media, lambda x: x['summary']['blocks'], list)
+                    if blocks:
+                        summary = []
+                        for block in blocks:
+                            text = try_get(block, lambda x: x['model']['text'], compat_str)
+                            if text:
+                                summary.append(text)
+                        if summary:
+                            item_desc = '\n\n'.join(summary)
+                    item_time = None
+                    for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
+                        if try_get(meta, lambda x: x['label']) == 'Published':
+                            item_time = unified_timestamp(meta.get('timestamp'))
+                            break
                     entries.append({
                         'id': item_id,
                         'title': item_title,
                         'thumbnail': item.get('holdingImageUrl'),
                         'formats': formats,
                         'subtitles': subtitles,
+                        'timestamp': item_time,
+                        'description': strip_or_none(item_desc),
                     })
             for resp in (initial_data.get('data') or {}).values():
                 name = resp.get('name')
@@ -1242,7 +1270,7 @@ def extract_all(pattern):
         entries = []
         for num, media_meta in enumerate(medias, start=1):
             formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
-            if not formats:
+            if not formats and not self.get_param('ignore_no_formats'):
                 continue
             self._sort_formats(formats)
 
@@ -1381,7 +1409,7 @@ def _fetch_page(self, programme_id, per_page, series_id, page):
 
     def _real_extract(self, url):
         pid = self._match_id(url)
-        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        qs = parse_qs(url)
         series_id = qs.get('seriesId', [None])[0]
         page = qs.get('page', [None])[0]
         per_page = 36 if page else self._PAGE_SIZE