[ie] Migrate commonly plural fields to lists (#8917)

[yt-dlp.git] / yt_dlp / extractor / bbc.py
diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py

index 8231557300866ff071b7219ffda0be9186334daf..015af9e1d616c9b71a1ae0eb6f3b15490b6218fd 100644 (file)
--- a/yt_dlp/extractor/bbc.py
+++ b/yt_dlp/extractor/bbc.py
@@ -1,19 +1,12 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
  import functools
  import itertools
  import json
  import re
+import xml.etree.ElementTree
  
  from .common import InfoExtractor
-from ..compat import (
-    compat_etree_Element,
-    compat_HTTPError,
-    compat_str,
-    compat_urllib_error,
-    compat_urlparse,
-)
+from ..compat import compat_str, compat_urlparse
+from ..networking.exceptions import HTTPError
  from ..utils import (
      ExtractorError,
      OnDemandPagedList,
@@ -22,11 +15,13 @@
      float_or_none,
      get_element_by_class,
      int_or_none,
+    join_nonempty,
      js_to_json,
      parse_duration,
      parse_iso8601,
      parse_qs,
      strip_or_none,
+    traverse_obj,
      try_get,
      unescapeHTML,
      unified_timestamp,
@@ -48,11 +43,11 @@ class BBCCoUkIE(InfoExtractor):
                              iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
                              music/(?:clips|audiovideo/popular)[/#]|
                              radio/player/|
-                            sounds/play/|
                              events/[^/]+/play/[^/]+/
                          )
                          (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
                      ''' % _ID_REGEX
+    _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']
  
      _LOGIN_URL = 'https://account.bbc.com/signin'
      _NETRC_MACHINE = 'bbc'
@@ -224,20 +219,6 @@ class BBCCoUkIE(InfoExtractor):
                  # rtmp download
                  'skip_download': True,
              },
-        }, {
-            'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
-            'note': 'Audio',
-            'info_dict': {
-                'id': 'm0007jz9',
-                'ext': 'mp4',
-                'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
-                'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
-                'duration': 9840,
-            },
-            'params': {
-                # rtmp download
-                'skip_download': True,
-            }
          }, {
              'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
              'only_matching': True,
@@ -283,7 +264,7 @@ def _perform_login(self, username, password):
              post_url, None, 'Logging in', data=urlencode_postdata(login_form),
              headers={'Referer': self._LOGIN_URL})
  
-        if self._LOGIN_URL in urlh.geturl():
+        if self._LOGIN_URL in urlh.url:
              error = clean_html(get_element_by_class('form-message', response))
              if error:
                  raise ExtractorError(
@@ -318,7 +299,7 @@ def _get_subtitles(self, media, programme_id):
                  continue
              captions = self._download_xml(
                  cc_url, programme_id, 'Downloading captions', fatal=False)
-            if not isinstance(captions, compat_etree_Element):
+            if not isinstance(captions, xml.etree.ElementTree.Element):
                  continue
              subtitles['en'] = [
                  {
@@ -336,16 +317,25 @@ def _raise_extractor_error(self, media_selection_error):
  
      def _download_media_selector(self, programme_id):
          last_exception = None
+        formats, subtitles = [], {}
          for media_set in self._MEDIA_SETS:
              try:
-                return self._download_media_selector_url(
+                fmts, subs = self._download_media_selector_url(
                      self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
+                formats.extend(fmts)
+                if subs:
+                    self._merge_subtitles(subs, target=subtitles)
              except BBCCoUkIE.MediaSelectionError as e:
                  if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
                      last_exception = e
                      continue
                  self._raise_extractor_error(e)
-        self._raise_extractor_error(last_exception)
+        if last_exception:
+            if formats or subtitles:
+                self.report_warning(f'{self.IE_NAME} returned error: {last_exception.id}')
+            else:
+                self._raise_extractor_error(last_exception)
+        return formats, subtitles
  
      def _download_media_selector_url(self, url, programme_id=None):
          media_selection = self._download_json(
@@ -394,8 +384,8 @@ def _process_media_selector(self, media_selection, programme_id):
                                  href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                                  m3u8_id=format_id, fatal=False)
                          except ExtractorError as e:
-                            if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError)
-                                    and e.exc_info[1].code in (403, 404)):
+                            if not (isinstance(e.exc_info[1], HTTPError)
+                                    and e.exc_info[1].status in (403, 404)):
                                  raise
                              fmts = []
                          formats.extend(fmts)
@@ -478,7 +468,7 @@ def _download_playlist(self, playlist_id):
  
              return programme_id, title, description, duration, formats, subtitles
          except ExtractorError as ee:
-            if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
+            if not (isinstance(ee.cause, HTTPError) and ee.cause.status == 404):
                  raise
  
          # fallback to legacy playlist
@@ -581,8 +571,6 @@ def _real_extract(self, url):
          else:
              programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
  
-        self._sort_formats(formats)
-
          return {
              'id': programme_id,
              'title': title,
@@ -594,10 +582,15 @@ def _real_extract(self, url):
          }
  
  
-class BBCIE(BBCCoUkIE):
+class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
      IE_NAME = 'bbc'
      IE_DESC = 'BBC'
-    _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
+    _VALID_URL = r'''(?x)
+        https?://(?:www\.)?(?:
+            bbc\.(?:com|co\.uk)|
+            bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd\.onion|
+            bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad\.onion
+        )/(?:[^/]+/)+(?P<id>[^/#?]+)'''
  
      _MEDIA_SETS = [
          'pc',
@@ -847,6 +840,26 @@ class BBCIE(BBCCoUkIE):
              'upload_date': '20190604',
              'categories': ['Psychology'],
          },
+    }, {
+        # BBC Sounds
+        'url': 'https://www.bbc.co.uk/sounds/play/m001q78b',
+        'info_dict': {
+            'id': 'm001q789',
+            'ext': 'mp4',
+            'title': 'The Night Tracks Mix - Music for the darkling hour',
+            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0c00hym.jpg',
+            'chapters': 'count:8',
+            'description': 'md5:815fb51cbdaa270040aab8145b3f1d67',
+            'uploader': 'Radio 3',
+            'duration': 1800,
+            'uploader_id': 'bbc_radio_three',
+        },
+    }, {  # onion routes
+        'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad.onion/sport/av/football/63195681',
+        'only_matching': True,
      }]
  
      @classmethod
@@ -885,7 +898,6 @@ def _extract_from_media_meta(self, media_meta, video_id):
      def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
          programme_id, title, description, duration, formats, subtitles = \
              self._process_legacy_playlist_url(url, playlist_id)
-        self._sort_formats(formats)
          return {
              'id': programme_id,
              'title': title,
@@ -904,13 +916,8 @@ def _real_extract(self, url):
          json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
          timestamp = json_ld_info.get('timestamp')
  
-        playlist_title = json_ld_info.get('title')
-        if not playlist_title:
-            playlist_title = self._og_search_title(
-                webpage, default=None) or self._html_search_regex(
-                r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
-            if playlist_title:
-                playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
+        playlist_title = json_ld_info.get('title') or re.sub(
+            r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None
  
          playlist_description = json_ld_info.get(
              'description') or self._og_search_description(webpage, default=None)
@@ -954,7 +961,6 @@ def _real_extract(self, url):
                              duration = int_or_none(items[0].get('duration'))
                              programme_id = items[0].get('vpid')
                              formats, subtitles = self._download_media_selector(programme_id)
-                            self._sort_formats(formats)
                              entries.append({
                                  'id': programme_id,
                                  'title': title,
@@ -987,11 +993,10 @@ def _real_extract(self, url):
                                      # Some playlist URL may fail with 500, at the same time
                                      # the other one may work fine (e.g.
                                      # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
-                                    if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
+                                    if isinstance(e.cause, HTTPError) and e.cause.status == 500:
                                          continue
                                      raise
                              if entry:
-                                self._sort_formats(entry['formats'])
                                  entries.append(entry)
  
          if entries:
@@ -1015,7 +1020,6 @@ def _real_extract(self, url):
  
          if programme_id:
              formats, subtitles = self._download_media_selector(programme_id)
-            self._sort_formats(formats)
              # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
              digital_data = self._parse_json(
                  self._search_regex(
@@ -1047,7 +1051,6 @@ def _real_extract(self, url):
              if version_id:
                  title = smp_data['title']
                  formats, subtitles = self._download_media_selector(version_id)
-                self._sort_formats(formats)
                  image_url = smp_data.get('holdingImageURL')
                  display_date = init_data.get('displayDate')
                  topic_title = init_data.get('topicTitle')
@@ -1089,7 +1092,6 @@ def _real_extract(self, url):
                      continue
                  title = lead_media.get('title') or self._og_search_title(webpage)
                  formats, subtitles = self._download_media_selector(programme_id)
-                self._sort_formats(formats)
                  description = lead_media.get('summary')
                  uploader = lead_media.get('masterBrand')
                  uploader_id = lead_media.get('mid')
@@ -1118,7 +1120,6 @@ def _real_extract(self, url):
              if current_programme and programme_id and current_programme.get('type') == 'playable_item':
                  title = current_programme.get('titles', {}).get('tertiary') or playlist_title
                  formats, subtitles = self._download_media_selector(programme_id)
-                self._sort_formats(formats)
                  synopses = current_programme.get('synopses') or {}
                  network = current_programme.get('network') or {}
                  duration = int_or_none(
@@ -1137,6 +1138,13 @@ def _real_extract(self, url):
                      'uploader_id': network.get('id'),
                      'formats': formats,
                      'subtitles': subtitles,
+                    'chapters': traverse_obj(preload_state, (
+                        'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), {
+                            'title': ('titles', {lambda x: join_nonempty(
+                                'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
+                            'start_time': ('offset', 'start', {float_or_none}),
+                            'end_time': ('offset', 'end', {float_or_none}),
+                        })) or None,
                  }
  
          bbc3_config = self._parse_json(
@@ -1151,7 +1159,6 @@ def _real_extract(self, url):
              clip_title = clip.get('title')
              if clip_vpid and clip_title:
                  formats, subtitles = self._download_media_selector(clip_vpid)
-                self._sort_formats(formats)
                  return {
                      'id': clip_vpid,
                      'title': clip_title,
@@ -1173,7 +1180,6 @@ def _real_extract(self, url):
                      if not programme_id:
                          continue
                      formats, subtitles = self._download_media_selector(programme_id)
-                    self._sort_formats(formats)
                      entries.append({
                          'id': programme_id,
                          'title': playlist_title,
@@ -1191,7 +1197,7 @@ def _real_extract(self, url):
          if initial_data is None:
              initial_data = self._search_regex(
                  r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
-                'preload state', default={})
+                'preload state', default='{}')
          else:
              initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
          initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
@@ -1205,7 +1211,6 @@ def parse_media(media):
                      if not (item_id and item_title):
                          continue
                      formats, subtitles = self._download_media_selector(item_id)
-                    self._sort_formats(formats)
                      item_desc = None
                      blocks = try_get(media, lambda x: x['summary']['blocks'], list)
                      if blocks:
@@ -1239,7 +1244,7 @@ def parse_media(media):
                                            (lambda x: x['data']['blocks'],
                                             lambda x: x['data']['content']['model']['blocks'],),
                                            list) or []):
-                        if block.get('type') != 'media':
+                        if block.get('type') not in ['media', 'video']:
                              continue
                          parse_media(block.get('model'))
              return self.playlist_result(
@@ -1306,7 +1311,6 @@ def extract_all(pattern):
              formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
              if not formats and not self.get_param('ignore_no_formats'):
                  continue
-            self._sort_formats(formats)
  
              video_id = media_meta.get('externalId')
              if not video_id: