[ie/matchtv] Fix extractor (#10190)

[yt-dlp.git] / yt_dlp / extractor / bbc.py
diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py

index 4e2dcd76b896e0731aa51aab357233ad8533283c..3af923f9584d9c5734dc81d8d4b70cba8c3e3c81 100644 (file)
--- a/yt_dlp/extractor/bbc.py
+++ b/yt_dlp/extractor/bbc.py
@@ -1,18 +1,12 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
  import functools
  import itertools
  import json
  import re
+import urllib.parse
+import xml.etree.ElementTree
  
  from .common import InfoExtractor
-from ..compat import (
-    compat_etree_Element,
-    compat_HTTPError,
-    compat_str,
-    compat_urlparse,
-)
+from ..networking.exceptions import HTTPError
  from ..utils import (
      ExtractorError,
      OnDemandPagedList,
@@ -21,11 +15,13 @@
      float_or_none,
      get_element_by_class,
      int_or_none,
+    join_nonempty,
      js_to_json,
      parse_duration,
      parse_iso8601,
      parse_qs,
      strip_or_none,
+    traverse_obj,
      try_get,
      unescapeHTML,
      unified_timestamp,
@@ -38,8 +34,8 @@
  class BBCCoUkIE(InfoExtractor):
      IE_NAME = 'bbc.co.uk'
      IE_DESC = 'BBC iPlayer'
-    _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})'
-    _VALID_URL = r'''(?x)
+    _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
+    _VALID_URL = rf'''(?x)
                      https?://
                          (?:www\.)?bbc\.co\.uk/
                          (?:
@@ -47,11 +43,11 @@ class BBCCoUkIE(InfoExtractor):
                              iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
                              music/(?:clips|audiovideo/popular)[/#]|
                              radio/player/|
-                            sounds/play/|
                              events/[^/]+/play/[^/]+/
                          )
-                        (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
-                    ''' % _ID_REGEX
+                        (?P<id>{_ID_REGEX})(?!/(?:episodes|broadcasts|clips))
+                    '''
+    _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']
  
      _LOGIN_URL = 'https://account.bbc.com/signin'
      _NETRC_MACHINE = 'bbc'
@@ -79,7 +75,7 @@ class BBCCoUkIE(InfoExtractor):
              'params': {
                  # rtmp download
                  'skip_download': True,
-            }
+            },
          },
          {
              'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
@@ -152,7 +148,7 @@ class BBCCoUkIE(InfoExtractor):
              'params': {
                  # rtmp download
                  'skip_download': True,
-            }
+            },
          }, {
              'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
              'note': 'Video',
@@ -166,7 +162,7 @@ class BBCCoUkIE(InfoExtractor):
              'params': {
                  # rtmp download
                  'skip_download': True,
-            }
+            },
          }, {
              'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
              'info_dict': {
@@ -223,20 +219,6 @@ class BBCCoUkIE(InfoExtractor):
                  # rtmp download
                  'skip_download': True,
              },
-        }, {
-            'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
-            'note': 'Audio',
-            'info_dict': {
-                'id': 'm0007jz9',
-                'ext': 'mp4',
-                'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
-                'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
-                'duration': 9840,
-            },
-            'params': {
-                # rtmp download
-                'skip_download': True,
-            }
          }, {
              'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
              'only_matching': True,
@@ -263,11 +245,7 @@ class BBCCoUkIE(InfoExtractor):
              'only_matching': True,
          }]
  
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
          login_page = self._download_webpage(
              self._LOGIN_URL, None, 'Downloading signin page')
  
@@ -286,26 +264,23 @@ def _login(self):
              post_url, None, 'Logging in', data=urlencode_postdata(login_form),
              headers={'Referer': self._LOGIN_URL})
  
-        if self._LOGIN_URL in urlh.geturl():
+        if self._LOGIN_URL in urlh.url:
              error = clean_html(get_element_by_class('form-message', response))
              if error:
                  raise ExtractorError(
-                    'Unable to login: %s' % error, expected=True)
+                    f'Unable to login: {error}', expected=True)
              raise ExtractorError('Unable to log in')
  
-    def _real_initialize(self):
-        self._login()
-
      class MediaSelectionError(Exception):
-        def __init__(self, id):
-            self.id = id
+        def __init__(self, error_id):
+            self.id = error_id
  
      def _extract_asx_playlist(self, connection, programme_id):
          asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
          return [ref.get('href') for ref in asx.findall('./Entry/ref')]
  
      def _extract_items(self, playlist):
-        return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
+        return playlist.findall(f'./{{{self._EMP_PLAYLIST_NS}}}item')
  
      def _extract_medias(self, media_selection):
          error = media_selection.get('result')
@@ -324,7 +299,7 @@ def _get_subtitles(self, media, programme_id):
                  continue
              captions = self._download_xml(
                  cc_url, programme_id, 'Downloading captions', fatal=False)
-            if not isinstance(captions, compat_etree_Element):
+            if not isinstance(captions, xml.etree.ElementTree.Element):
                  continue
              subtitles['en'] = [
                  {
@@ -337,21 +312,30 @@ def _get_subtitles(self, media, programme_id):
  
      def _raise_extractor_error(self, media_selection_error):
          raise ExtractorError(
-            '%s returned error: %s' % (self.IE_NAME, media_selection_error.id),
+            f'{self.IE_NAME} returned error: {media_selection_error.id}',
              expected=True)
  
      def _download_media_selector(self, programme_id):
          last_exception = None
+        formats, subtitles = [], {}
          for media_set in self._MEDIA_SETS:
              try:
-                return self._download_media_selector_url(
+                fmts, subs = self._download_media_selector_url(
                      self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
+                formats.extend(fmts)
+                if subs:
+                    self._merge_subtitles(subs, target=subtitles)
              except BBCCoUkIE.MediaSelectionError as e:
                  if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
                      last_exception = e
                      continue
                  self._raise_extractor_error(e)
-        self._raise_extractor_error(last_exception)
+        if last_exception:
+            if formats or subtitles:
+                self.report_warning(f'{self.IE_NAME} returned error: {last_exception.id}')
+            else:
+                self._raise_extractor_error(last_exception)
+        return formats, subtitles
  
      def _download_media_selector_url(self, url, programme_id=None):
          media_selection = self._download_json(
@@ -388,21 +372,29 @@ def _process_media_selector(self, media_selection, programme_id):
                          for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
                              formats.append({
                                  'url': ref,
-                                'format_id': 'ref%s_%s' % (i, format_id),
+                                'format_id': f'ref{i}_{format_id}',
                              })
                      elif transfer_format == 'dash':
                          formats.extend(self._extract_mpd_formats(
                              href, programme_id, mpd_id=format_id, fatal=False))
                      elif transfer_format == 'hls':
-                        formats.extend(self._extract_m3u8_formats(
-                            href, programme_id, ext='mp4', entry_protocol='m3u8_native',
-                            m3u8_id=format_id, fatal=False))
+                        # TODO: let expected_status be passed into _extract_xxx_formats() instead
+                        try:
+                            fmts = self._extract_m3u8_formats(
+                                href, programme_id, ext='mp4', entry_protocol='m3u8_native',
+                                m3u8_id=format_id, fatal=False)
+                        except ExtractorError as e:
+                            if not (isinstance(e.exc_info[1], HTTPError)
+                                    and e.exc_info[1].status in (403, 404)):
+                                raise
+                            fmts = []
+                        formats.extend(fmts)
                      elif transfer_format == 'hds':
                          formats.extend(self._extract_f4m_formats(
                              href, programme_id, f4m_id=format_id, fatal=False))
                      else:
                          if not supplier and bitrate:
-                            format_id += '-%d' % bitrate
+                            format_id += f'-{bitrate}'
                          fmt = {
                              'format_id': format_id,
                              'filesize': file_size,
@@ -431,9 +423,9 @@ def _process_media_selector(self, media_selection, programme_id):
                              identifier = connection.get('identifier')
                              server = connection.get('server')
                              fmt.update({
-                                'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
+                                'url': f'{protocol}://{server}/{application}?{auth_string}',
                                  'play_path': identifier,
-                                'app': '%s?%s' % (application, auth_string),
+                                'app': f'{application}?{auth_string}',
                                  'page_url': 'http://www.bbc.co.uk',
                                  'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                                  'rtmp_live': False,
@@ -449,11 +441,12 @@ def _process_media_selector(self, media_selection, programme_id):
      def _download_playlist(self, playlist_id):
          try:
              playlist = self._download_json(
-                'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
+                f'http://www.bbc.co.uk/programmes/{playlist_id}/playlist.json',
                  playlist_id, 'Downloading playlist JSON')
+            formats = []
+            subtitles = {}
  
-            version = playlist.get('defaultAvailableVersion')
-            if version:
+            for version in playlist.get('allAvailableVersions', []):
                  smp_config = version['smpConfig']
                  title = smp_config['title']
                  description = smp_config['summary']
@@ -463,10 +456,19 @@ def _download_playlist(self, playlist_id):
                          continue
                      programme_id = item.get('vpid')
                      duration = int_or_none(item.get('duration'))
-                    formats, subtitles = self._download_media_selector(programme_id)
-                return programme_id, title, description, duration, formats, subtitles
+                    version_formats, version_subtitles = self._download_media_selector(programme_id)
+                    types = version['types']
+                    for f in version_formats:
+                        f['format_note'] = ', '.join(types)
+                        if any('AudioDescribed' in x for x in types):
+                            f['language_preference'] = -10
+                    formats += version_formats
+                    for tag, subformats in (version_subtitles or {}).items():
+                        subtitles.setdefault(tag, []).extend(subformats)
+
+            return programme_id, title, description, duration, formats, subtitles
          except ExtractorError as ee:
-            if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
+            if not (isinstance(ee.cause, HTTPError) and ee.cause.status == 404):
                  raise
  
          # fallback to legacy playlist
@@ -478,32 +480,32 @@ def _process_legacy_playlist_url(self, url, display_id):
  
      def _process_legacy_playlist(self, playlist_id):
          return self._process_legacy_playlist_url(
-            'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
+            f'http://www.bbc.co.uk/iplayer/playlist/{playlist_id}', playlist_id)
  
      def _download_legacy_playlist_url(self, url, playlist_id=None):
          return self._download_xml(
              url, playlist_id, 'Downloading legacy playlist XML')
  
      def _extract_from_legacy_playlist(self, playlist, playlist_id):
-        no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
+        no_items = playlist.find(f'./{{{self._EMP_PLAYLIST_NS}}}noItems')
          if no_items is not None:
              reason = no_items.get('reason')
              if reason == 'preAvailability':
-                msg = 'Episode %s is not yet available' % playlist_id
+                msg = f'Episode {playlist_id} is not yet available'
              elif reason == 'postAvailability':
-                msg = 'Episode %s is no longer available' % playlist_id
+                msg = f'Episode {playlist_id} is no longer available'
              elif reason == 'noMedia':
-                msg = 'Episode %s is not currently available' % playlist_id
+                msg = f'Episode {playlist_id} is not currently available'
              else:
-                msg = 'Episode %s is not available: %s' % (playlist_id, reason)
+                msg = f'Episode {playlist_id} is not available: {reason}'
              raise ExtractorError(msg, expected=True)
  
          for item in self._extract_items(playlist):
              kind = item.get('kind')
              if kind not in ('programme', 'radioProgramme'):
                  continue
-            title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
-            description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
+            title = playlist.find(f'./{{{self._EMP_PLAYLIST_NS}}}title').text
+            description_el = playlist.find(f'./{{{self._EMP_PLAYLIST_NS}}}summary')
              description = description_el.text if description_el is not None else None
  
              def get_programme_id(item):
@@ -513,7 +515,7 @@ def get_from_attributes(item):
                          if value and re.match(r'^[pb][\da-z]{7}$', value):
                              return value
                  get_from_attributes(item)
-                mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
+                mediator = item.find(f'./{{{self._EMP_PLAYLIST_NS}}}mediator')
                  if mediator is not None:
                      return get_from_attributes(mediator)
  
@@ -553,7 +555,7 @@ def _real_extract(self, url):
  
          if not programme_id:
              programme_id = self._search_regex(
-                r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
+                rf'"vpid"\s*:\s*"({self._ID_REGEX})"', webpage, 'vpid', fatal=False, default=None)
  
          if programme_id:
              formats, subtitles = self._download_media_selector(programme_id)
@@ -569,8 +571,6 @@ def _real_extract(self, url):
          else:
              programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
  
-        self._sort_formats(formats)
-
          return {
              'id': programme_id,
              'title': title,
@@ -582,10 +582,15 @@ def _real_extract(self, url):
          }
  
  
-class BBCIE(BBCCoUkIE):
+class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
      IE_NAME = 'bbc'
      IE_DESC = 'BBC'
-    _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
+    _VALID_URL = r'''(?x)
+        https?://(?:www\.)?(?:
+            bbc\.(?:com|co\.uk)|
+            bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd\.onion|
+            bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad\.onion
+        )/(?:[^/]+/)+(?P<id>[^/#?]+)'''
  
      _MEDIA_SETS = [
          'pc',
@@ -597,7 +602,7 @@ class BBCIE(BBCCoUkIE):
          'url': 'http://www.bbc.com/news/world-europe-32668511',
          'info_dict': {
              'id': 'world-europe-32668511',
-            'title': 'Russia stages massive WW2 parade',
+            'title': 'Russia stages massive WW2 parade despite Western boycott',
              'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
          },
          'playlist_count': 2,
@@ -618,6 +623,7 @@ class BBCIE(BBCCoUkIE):
          'info_dict': {
              'id': '3662a707-0af9-3149-963f-47bea720b460',
              'title': 'BUGGER',
+            'description': r're:BUGGER  The recent revelations by the whistleblower Edward Snowden were fascinating. .{211}\.{3}$',
          },
          'playlist_count': 18,
      }, {
@@ -626,16 +632,16 @@ class BBCIE(BBCCoUkIE):
          'info_dict': {
              'id': 'p02mprgb',
              'ext': 'mp4',
-            'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
-            'description': 'md5:2868290467291b37feda7863f7a83f54',
+            'title': 'Germanwings crash site aerial video',
+            'description': r're:(?s)Aerial video showed the site where the Germanwings flight 4U 9525, .{156} BFM TV\.$',
              'duration': 47,
              'timestamp': 1427219242,
              'upload_date': '20150324',
+            'thumbnail': 'https://ichef.bbci.co.uk/news/1024/media/images/81879000/jpg/_81879090_81879089.jpg',
          },
          'params': {
-            # rtmp download
              'skip_download': True,
-        }
+        },
      }, {
          # article with single video embedded with data-playable containing XML playlist
          # with direct video links as progressiveDownloadUrl (for now these are extracted)
@@ -651,21 +657,24 @@ class BBCIE(BBCCoUkIE):
          },
          'params': {
              'skip_download': True,
-        }
+        },
+        'skip': 'now SIMORGH_DATA with no video',
      }, {
          # single video embedded with data-playable containing XML playlists (regional section)
          'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
          'info_dict': {
-            'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
+            'id': '39275083',
+            'display_id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
              'ext': 'mp4',
              'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
-            'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
+            'description': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
              'timestamp': 1434713142,
              'upload_date': '20150619',
+            'thumbnail': 'https://a.files.bbci.co.uk/worldservice/live/assets/images/2015/06/19/150619132146_honduras_hsopitales_militares_640x360_aptn_nocredit.jpg',
          },
          'params': {
              'skip_download': True,
-        }
+        },
      }, {
          # single video from video playlist embedded with vxp-playlist-data JSON
          'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
@@ -678,22 +687,21 @@ class BBCIE(BBCCoUkIE):
          },
          'params': {
              'skip_download': True,
-        }
+        },
+        'skip': '404 Not Found',
      }, {
-        # single video story with digitalData
+        # single video story with __PWA_PRELOADED_STATE__
          'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
          'info_dict': {
              'id': 'p02q6gc4',
-            'ext': 'flv',
-            'title': 'Sri Lanka’s spicy secret',
-            'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
-            'timestamp': 1437674293,
-            'upload_date': '20150723',
+            'ext': 'mp4',
+            'title': 'Tasting the spice of life in Jaffna',
+            'description': r're:(?s)BBC Travel Show’s Henry Golding explores the city of Jaffna .{151} aftertaste\.$',
+            'timestamp': 1646058397,
+            'upload_date': '20220228',
+            'duration': 255,
+            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1920xn/p02vxvkn.jpg',
          },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        }
      }, {
          # single video story without digitalData
          'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
@@ -705,12 +713,10 @@ class BBCIE(BBCCoUkIE):
              'timestamp': 1415867444,
              'upload_date': '20141113',
          },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        }
+        'skip': 'redirects to TopGear home page',
      }, {
          # single video embedded with Morph
+        # TODO: replacement test page
          'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
          'info_dict': {
              'id': 'p041vhd0',
@@ -721,27 +727,22 @@ class BBCIE(BBCCoUkIE):
              'uploader': 'BBC Sport',
              'uploader_id': 'bbc_sport',
          },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
-        'skip': 'Georestricted to UK',
+        'skip': 'Video no longer in page',
      }, {
-        # single video with playlist.sxml URL in playlist param
+        # single video in __INITIAL_DATA__
          'url': 'http://www.bbc.com/sport/0/football/33653409',
          'info_dict': {
              'id': 'p02xycnp',
              'ext': 'mp4',
-            'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
-            'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
+            'title': 'Ronaldo to Man Utd, Arsenal to spend?',
+            'description': r're:(?s)BBC Sport\'s David Ornstein rounds up the latest transfer reports, .{359} here\.$',
+            'timestamp': 1437750175,
+            'upload_date': '20150724',
+            'thumbnail': r're:https?://.+/.+media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png',
              'duration': 140,
          },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        }
      }, {
-        # article with multiple videos embedded with playlist.sxml in playlist param
+        # article with multiple videos embedded with Morph.setPayload
          'url': 'http://www.bbc.com/sport/0/football/34475836',
          'info_dict': {
              'id': '34475836',
@@ -749,6 +750,21 @@ class BBCIE(BBCCoUkIE):
              'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
          },
          'playlist_count': 3,
+    }, {
+        # Testing noplaylist
+        'url': 'http://www.bbc.com/sport/0/football/34475836',
+        'info_dict': {
+            'id': 'p034ppnv',
+            'ext': 'mp4',
+            'title': 'All you need to know about Jurgen Klopp',
+            'timestamp': 1444335081,
+            'upload_date': '20151008',
+            'duration': 122.0,
+            'thumbnail': 'https://ichef.bbci.co.uk/onesport/cps/976/cpsprodpb/7542/production/_85981003_klopp.jpg',
+        },
+        'params': {
+            'noplaylist': True,
+        },
      }, {
          # school report article with single video
          'url': 'http://www.bbc.co.uk/schoolreport/35744779',
@@ -757,6 +773,7 @@ class BBCIE(BBCCoUkIE):
              'title': 'School which breaks down barriers in Jerusalem',
          },
          'playlist_count': 1,
+        'skip': 'redirects to Young Reporter home page https://www.bbc.co.uk/news/topics/cg41ylwv43pt',
      }, {
          # single video with playlist URL from weather section
          'url': 'http://www.bbc.com/weather/features/33601775',
@@ -773,22 +790,49 @@ class BBCIE(BBCCoUkIE):
              'thumbnail': r're:https?://.+/.+\.jpg',
              'timestamp': 1437785037,
              'upload_date': '20150725',
+            'duration': 105,
+        },
+    }, {
+        # video with window.__INITIAL_DATA__ and value as JSON string
+        'url': 'https://www.bbc.com/news/av/world-europe-59468682',
+        'info_dict': {
+            'id': 'p0b779gc',
+            'ext': 'mp4',
+            'title': 'Why France is making this woman a national hero',
+            'description': r're:(?s)France is honouring the US-born 20th Century singer and activist Josephine .{208} Second World War.',
+            'thumbnail': r're:https?://.+/.+\.jpg',
+            'timestamp': 1638215626,
+            'upload_date': '20211129',
+            'duration': 125,
+        },
+    }, {
+        # video with script id __NEXT_DATA__ and value as JSON string
+        'url': 'https://www.bbc.com/news/uk-68546268',
+        'info_dict': {
+            'id': 'p0hj0lq7',
+            'ext': 'mp4',
+            'title': 'Nasser Hospital doctor describes his treatment by IDF',
+            'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276} hostages\."$',
+            'thumbnail': r're:https?://.+/.+\.jpg',
+            'timestamp': 1710188248,
+            'upload_date': '20240311',
+            'duration': 104,
          },
      }, {
          # single video article embedded with data-media-vpid
          'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
          'only_matching': True,
      }, {
+        # bbcthreeConfig
          'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
          'info_dict': {
              'id': 'p06556y7',
              'ext': 'mp4',
-            'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
-            'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd',
+            'title': 'Things Not To Say to people that live on council estates',
+            'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
+            'duration': 360,
+            'thumbnail': r're:https?://.+/.+\.jpg',
          },
-        'params': {
-            'skip_download': True,
-        }
      }, {
          # window.__PRELOADED_STATE__
          'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
@@ -800,6 +844,7 @@ class BBCIE(BBCCoUkIE):
              'uploader': 'Radio 3',
              'uploader_id': 'bbc_radio_three',
          },
+        'skip': '404 Not Found',
      }, {
          'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
          'info_dict': {
@@ -807,6 +852,7 @@ class BBCIE(BBCCoUkIE):
              'ext': 'mp4',
              'title': 'md5:2fabf12a726603193a2879a055f72514',
              'description': 'Learn English words and phrases from this story',
+            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1200x675/p06pq9gk.jpg',
          },
          'add_ie': [BBCCoUkIE.ie_key()],
      }, {
@@ -815,21 +861,43 @@ class BBCIE(BBCCoUkIE):
          'info_dict': {
              'id': 'p07c6sb9',
              'ext': 'mp4',
-            'title': 'How positive thinking is harming your happiness',
-            'alt_title': 'The downsides of positive thinking',
-            'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
+            'title': 'The downsides of positive thinking',
+            'description': 'The downsides of positive thinking',
              'duration': 235,
-            'thumbnail': r're:https?://.+/p07c9dsr.jpg',
-            'upload_date': '20190604',
-            'categories': ['Psychology'],
+            'thumbnail': r're:https?://.+/p07c9dsr\.(?:jpg|webp|png)',
+            'upload_date': '20220223',
+            'timestamp': 1645632746,
          },
+    }, {
+        # BBC Sounds
+        'url': 'https://www.bbc.co.uk/sounds/play/w3ct5rgx',
+        'info_dict': {
+            'id': 'p0hrw4nr',
+            'ext': 'mp4',
+            'title': 'Are our coastlines being washed away?',
+            'description': r're:(?s)Around the world, coastlines are constantly changing .{2000,} Images\)$',
+            'timestamp': 1713556800,
+            'upload_date': '20240419',
+            'duration': 1588,
+            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0hrnxbl.jpg',
+            'uploader': 'World Service',
+            'uploader_id': 'bbc_world_service',
+            'series': 'CrowdScience',
+            'chapters': [],
+        },
+    }, {  # onion routes
+        'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad.onion/sport/av/football/63195681',
+        'only_matching': True,
      }]
  
      @classmethod
      def suitable(cls, url):
          EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
          return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
-                else super(BBCIE, cls).suitable(url))
+                else super().suitable(url))
  
      def _extract_from_media_meta(self, media_meta, video_id):
          # Direct links to media in media metadata (e.g.
@@ -861,7 +929,6 @@ def _extract_from_media_meta(self, media_meta, video_id):
      def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
          programme_id, title, description, duration, formats, subtitles = \
              self._process_legacy_playlist_url(url, playlist_id)
-        self._sort_formats(formats)
          return {
              'id': programme_id,
              'title': title,
@@ -880,13 +947,8 @@ def _real_extract(self, url):
          json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
          timestamp = json_ld_info.get('timestamp')
  
-        playlist_title = json_ld_info.get('title')
-        if not playlist_title:
-            playlist_title = self._og_search_title(
-                webpage, default=None) or self._html_search_regex(
-                r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
-            if playlist_title:
-                playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
+        playlist_title = json_ld_info.get('title') or re.sub(
+            r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None
  
          playlist_description = json_ld_info.get(
              'description') or self._og_search_description(webpage, default=None)
@@ -930,7 +992,6 @@ def _real_extract(self, url):
                              duration = int_or_none(items[0].get('duration'))
                              programme_id = items[0].get('vpid')
                              formats, subtitles = self._download_media_selector(programme_id)
-                            self._sort_formats(formats)
                              entries.append({
                                  'id': programme_id,
                                  'title': title,
@@ -948,7 +1009,7 @@ def _real_extract(self, url):
                          if playlist:
                              entry = None
                              for key in ('streaming', 'progressiveDownload'):
-                                playlist_url = playlist.get('%sUrl' % key)
+                                playlist_url = playlist.get(f'{key}Url')
                                  if not playlist_url:
                                      continue
                                  try:
@@ -963,11 +1024,10 @@ def _real_extract(self, url):
                                      # Some playlist URL may fail with 500, at the same time
                                      # the other one may work fine (e.g.
                                      # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
-                                    if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
+                                    if isinstance(e.cause, HTTPError) and e.cause.status == 500:
                                          continue
                                      raise
                              if entry:
-                                self._sort_formats(entry['formats'])
                                  entries.append(entry)
  
          if entries:
@@ -975,23 +1035,21 @@ def _real_extract(self, url):
  
          # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
          group_id = self._search_regex(
-            r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
+            rf'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\']({self._ID_REGEX})',
              webpage, 'group id', default=None)
          if group_id:
              return self.url_result(
-                'https://www.bbc.co.uk/programmes/%s' % group_id,
-                ie=BBCCoUkIE.ie_key())
+                f'https://www.bbc.co.uk/programmes/{group_id}', BBCCoUkIE)
  
          # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
          programme_id = self._search_regex(
-            [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
-             r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
-             r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
+            [rf'data-(?:video-player|media)-vpid="({self._ID_REGEX})"',
+             rf'<param[^>]+name="externalIdentifier"[^>]+value="({self._ID_REGEX})"',
+             rf'videoId\s*:\s*["\']({self._ID_REGEX})["\']'],
              webpage, 'vpid', default=None)
  
          if programme_id:
              formats, subtitles = self._download_media_selector(programme_id)
-            self._sort_formats(formats)
              # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
              digital_data = self._parse_json(
                  self._search_regex(
@@ -1023,7 +1081,6 @@ def _real_extract(self, url):
              if version_id:
                  title = smp_data['title']
                  formats, subtitles = self._download_media_selector(version_id)
-                self._sort_formats(formats)
                  image_url = smp_data.get('holdingImageURL')
                  display_date = init_data.get('displayDate')
                  topic_title = init_data.get('topicTitle')
@@ -1042,78 +1099,133 @@ def _real_extract(self, url):
                  }
  
          # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
-        # There are several setPayload calls may be present but the video
-        # seems to be always related to the first one
-        morph_payload = self._parse_json(
-            self._search_regex(
-                r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
-                webpage, 'morph payload', default='{}'),
-            playlist_id, fatal=False)
+        # Several setPayload calls may be present but the video(s)
+        # should be in one that mentions leadMedia or videoData
+        morph_payload = self._search_json(
+            r'\bMorph\s*\.\s*setPayload\s*\([^,]+,', webpage, 'morph payload', playlist_id,
+            contains_pattern=r'{(?s:(?:(?!</script>).)+(?:"leadMedia"|\\"videoData\\")\s*:.+)}',
+            default={})
          if morph_payload:
-            components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
-            for component in components:
-                if not isinstance(component, dict):
-                    continue
-                lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
-                if not lead_media:
-                    continue
-                identifiers = lead_media.get('identifiers')
-                if not identifiers or not isinstance(identifiers, dict):
-                    continue
-                programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
+            for lead_media in traverse_obj(morph_payload, (
+                    'body', 'components', ..., 'props', 'leadMedia', {dict})):
+                programme_id = traverse_obj(lead_media, ('identifiers', ('vpid', 'playablePid'), {str}, any))
                  if not programme_id:
                      continue
-                title = lead_media.get('title') or self._og_search_title(webpage)
                  formats, subtitles = self._download_media_selector(programme_id)
-                self._sort_formats(formats)
-                description = lead_media.get('summary')
-                uploader = lead_media.get('masterBrand')
-                uploader_id = lead_media.get('mid')
-                duration = None
-                duration_d = lead_media.get('duration')
-                if isinstance(duration_d, dict):
-                    duration = parse_duration(dict_get(
-                        duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
                  return {
                      'id': programme_id,
-                    'title': title,
-                    'description': description,
-                    'duration': duration,
-                    'uploader': uploader,
-                    'uploader_id': uploader_id,
+                    'title': lead_media.get('title') or self._og_search_title(webpage),
+                    **traverse_obj(lead_media, {
+                        'description': ('summary', {str}),
+                        'duration': ('duration', ('rawDuration', 'formattedDuration', 'spokenDuration'), {parse_duration}),
+                        'uploader': ('masterBrand', {str}),
+                        'uploader_id': ('mid', {str}),
+                    }),
                      'formats': formats,
                      'subtitles': subtitles,
                  }
+            body = self._parse_json(traverse_obj(morph_payload, (
+                'body', 'content', 'article', 'body')), playlist_id, fatal=False)
+            for video_data in traverse_obj(body, (lambda _, v: v['videoData']['pid'], 'videoData')):
+                if video_data.get('vpid'):
+                    video_id = video_data['vpid']
+                    formats, subtitles = self._download_media_selector(video_id)
+                    entry = {
+                        'id': video_id,
+                        'formats': formats,
+                        'subtitles': subtitles,
+                    }
+                else:
+                    video_id = video_data['pid']
+                    entry = self.url_result(
+                        f'https://www.bbc.co.uk/programmes/{video_id}', BBCCoUkIE,
+                        video_id, url_transparent=True)
+                entry.update({
+                    'timestamp': traverse_obj(morph_payload, (
+                        'body', 'content', 'article', 'dateTimeInfo', 'dateTime', {parse_iso8601}),
+                    ),
+                    **traverse_obj(video_data, {
+                        'thumbnail': (('iChefImage', 'image'), {url_or_none}, any),
+                        'title': (('title', 'caption'), {str}, any),
+                        'duration': ('duration', {parse_duration}),
+                    }),
+                })
+                if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id):
+                    return entry
+                entries.append(entry)
+            if entries:
+                playlist_title = traverse_obj(morph_payload, (
+                    'body', 'content', 'article', 'headline', {str})) or playlist_title
+                return self.playlist_result(
+                    entries, playlist_id, playlist_title, playlist_description)
  
-        preload_state = self._parse_json(self._search_regex(
-            r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
-            'preload state', default='{}'), playlist_id, fatal=False)
-        if preload_state:
-            current_programme = preload_state.get('programmes', {}).get('current') or {}
-            programme_id = current_programme.get('id')
-            if current_programme and programme_id and current_programme.get('type') == 'playable_item':
-                title = current_programme.get('titles', {}).get('tertiary') or playlist_title
-                formats, subtitles = self._download_media_selector(programme_id)
-                self._sort_formats(formats)
-                synopses = current_programme.get('synopses') or {}
-                network = current_programme.get('network') or {}
-                duration = int_or_none(
-                    current_programme.get('duration', {}).get('value'))
-                thumbnail = None
-                image_url = current_programme.get('image_url')
-                if image_url:
-                    thumbnail = image_url.replace('{recipe}', 'raw')
+        # various PRELOADED_STATE JSON
+        preload_state = self._search_json(
+            r'window\.__(?:PWA_)?PRELOADED_STATE__\s*=', webpage,
+            'preload state', playlist_id, transform_source=js_to_json, default={})
+        # PRELOADED_STATE with current programme
+        current_programme = traverse_obj(preload_state, ('programmes', 'current', {dict}))
+        programme_id = traverse_obj(current_programme, ('id', {str}))
+        if programme_id and current_programme.get('type') == 'playable_item':
+            title = traverse_obj(current_programme, ('titles', ('tertiary', 'secondary'), {str}, any)) or playlist_title
+            formats, subtitles = self._download_media_selector(programme_id)
+            return {
+                'id': programme_id,
+                'title': title,
+                'formats': formats,
+                **traverse_obj(current_programme, {
+                    'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
+                    'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}),
+                    'duration': ('duration', 'value', {int_or_none}),
+                    'uploader': ('network', 'short_title', {str}),
+                    'uploader_id': ('network', 'id', {str}),
+                    'timestamp': ((('availability', 'from'), ('release', 'date')), {parse_iso8601}, any),
+                    'series': ('titles', 'primary', {str}),
+                }),
+                'subtitles': subtitles,
+                'chapters': traverse_obj(preload_state, (
+                    'tracklist', 'tracks', lambda _, v: float(v['offset']['start']), {
+                        'title': ('titles', {lambda x: join_nonempty(
+                            'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
+                        'start_time': ('offset', 'start', {float_or_none}),
+                        'end_time': ('offset', 'end', {float_or_none}),
+                    }),
+                ),
+            }
+
+        # PWA_PRELOADED_STATE with article video asset
+        asset_id = traverse_obj(preload_state, (
+            'entities', 'articles', lambda k, _: k.rsplit('/', 1)[-1] == playlist_id,
+            'assetVideo', 0, {str}, any))
+        if asset_id:
+            video_id = traverse_obj(preload_state, ('entities', 'videos', asset_id, 'vpid', {str}))
+            if video_id:
+                article = traverse_obj(preload_state, (
+                    'entities', 'articles', lambda _, v: v['assetVideo'][0] == asset_id, any))
+
+                def image_url(image_id):
+                    return traverse_obj(preload_state, (
+                        'entities', 'images', image_id, 'url',
+                        {lambda u: url_or_none(u.replace('$recipe', 'raw'))}))
+
+                formats, subtitles = self._download_media_selector(video_id)
                  return {
-                    'id': programme_id,
-                    'title': title,
-                    'description': dict_get(synopses, ('long', 'medium', 'short')),
-                    'thumbnail': thumbnail,
-                    'duration': duration,
-                    'uploader': network.get('short_title'),
-                    'uploader_id': network.get('id'),
+                    'id': video_id,
+                    **traverse_obj(preload_state, ('entities', 'videos', asset_id, {
+                        'title': ('title', {str}),
+                        'description': (('synopsisLong', 'synopsisMedium', 'synopsisShort'), {str}, any),
+                        'thumbnail': (0, {image_url}),
+                        'duration': ('duration', {int_or_none}),
+                    })),
                      'formats': formats,
                      'subtitles': subtitles,
+                    'timestamp': traverse_obj(article, ('displayDate', {parse_iso8601})),
                  }
+            else:
+                return self.url_result(
+                    f'https://www.bbc.co.uk/programmes/{asset_id}', BBCCoUkIE,
+                    asset_id, playlist_title, display_id=playlist_id,
+                    description=playlist_description)
  
          bbc3_config = self._parse_json(
              self._search_regex(
@@ -1127,7 +1239,6 @@ def _real_extract(self, url):
              clip_title = clip.get('title')
              if clip_vpid and clip_title:
                  formats, subtitles = self._download_media_selector(clip_vpid)
-                self._sort_formats(formats)
                  return {
                      'id': clip_vpid,
                      'title': clip_title,
@@ -1149,7 +1260,6 @@ def _real_extract(self, url):
                      if not programme_id:
                          continue
                      formats, subtitles = self._download_media_selector(programme_id)
-                    self._sort_formats(formats)
                      entries.append({
                          'id': programme_id,
                          'title': playlist_title,
@@ -1161,10 +1271,52 @@ def _real_extract(self, url):
                  return self.playlist_result(
                      entries, playlist_id, playlist_title, playlist_description)
  
-        initial_data = self._parse_json(self._search_regex(
-            r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage,
-            'preload state', default='{}'), playlist_id, fatal=False)
+        def parse_model(model):
+            """Extract single video from model structure"""
+            item_id = traverse_obj(model, ('versions', 0, 'versionId', {str}))
+            if not item_id:
+                return
+            formats, subtitles = self._download_media_selector(item_id)
+            return {
+                'id': item_id,
+                'formats': formats,
+                'subtitles': subtitles,
+                **traverse_obj(model, {
+                    'title': ('title', {str}),
+                    'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
+                    'description': ('synopses', ('long', 'medium', 'short'), {str}, {lambda x: x or None}, any),
+                    'duration': ('versions', 0, 'duration', {int}),
+                    'timestamp': ('versions', 0, 'availableFrom', {functools.partial(int_or_none, scale=1000)}),
+                }),
+            }
+
+        def is_type(*types):
+            return lambda _, v: v['type'] in types
+
+        initial_data = self._search_regex(
+            r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
+            'quoted preload state', default=None)
+        if initial_data is None:
+            initial_data = self._search_regex(
+                r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
+                'preload state', default='{}')
+        else:
+            initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
+        initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
          if initial_data:
+            for video_data in traverse_obj(initial_data, (
+                    'stores', 'article', 'articleBodyContent', is_type('video'))):
+                model = traverse_obj(video_data, (
+                    'model', 'blocks', is_type('aresMedia'),
+                    'model', 'blocks', is_type('aresMediaMetadata'),
+                    'model', {dict}, any))
+                entry = parse_model(model)
+                if entry:
+                    entries.append(entry)
+            if entries:
+                return self.playlist_result(
+                    entries, playlist_id, playlist_title, playlist_description)
+
              def parse_media(media):
                  if not media:
                      return
@@ -1174,13 +1326,12 @@ def parse_media(media):
                      if not (item_id and item_title):
                          continue
                      formats, subtitles = self._download_media_selector(item_id)
-                    self._sort_formats(formats)
                      item_desc = None
                      blocks = try_get(media, lambda x: x['summary']['blocks'], list)
                      if blocks:
                          summary = []
                          for block in blocks:
-                            text = try_get(block, lambda x: x['model']['text'], compat_str)
+                            text = try_get(block, lambda x: x['model']['text'], str)
                              if text:
                                  summary.append(text)
                          if summary:
@@ -1198,34 +1349,100 @@ def parse_media(media):
                          'subtitles': subtitles,
                          'timestamp': item_time,
                          'description': strip_or_none(item_desc),
+                        'duration': int_or_none(item.get('duration')),
                      })
-            for resp in (initial_data.get('data') or {}).values():
-                name = resp.get('name')
+
+            for resp in traverse_obj(initial_data, ('data', lambda _, v: v['name'])):
+                name = resp['name']
                  if name == 'media-experience':
                      parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
                  elif name == 'article':
-                    for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []):
-                        if block.get('type') != 'media':
-                            continue
-                        parse_media(block.get('model'))
+                    for block in traverse_obj(resp, (
+                            'data', (None, ('content', 'model')), 'blocks',
+                            is_type('media', 'video'), 'model', {dict})):
+                        parse_media(block)
              return self.playlist_result(
                  entries, playlist_id, playlist_title, playlist_description)
  
+        # extract from SIMORGH_DATA hydration JSON
+        simorgh_data = self._search_json(
+            r'window\s*\.\s*SIMORGH_DATA\s*=', webpage,
+            'simorgh data', playlist_id, default={})
+        if simorgh_data:
+            done = False
+            for video_data in traverse_obj(simorgh_data, (
+                    'pageData', 'content', 'model', 'blocks', is_type('video', 'legacyMedia'))):
+                model = traverse_obj(video_data, (
+                    'model', 'blocks', is_type('aresMedia'),
+                    'model', 'blocks', is_type('aresMediaMetadata'),
+                    'model', {dict}, any))
+                if video_data['type'] == 'video':
+                    entry = parse_model(model)
+                else:  # legacyMedia: no duration, subtitles
+                    block_id, entry = traverse_obj(model, ('blockId', {str})), None
+                    media_data = traverse_obj(simorgh_data, (
+                        'pageData', 'promo', 'media',
+                        {lambda x: x if x['id'] == block_id else None}))
+                    formats = traverse_obj(media_data, ('playlist', lambda _, v: url_or_none(v['url']), {
+                        'url': ('url', {url_or_none}),
+                        'ext': ('format', {str}),
+                        'tbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}),
+                    }))
+                    if formats:
+                        entry = {
+                            'id': block_id,
+                            'display_id': playlist_id,
+                            'formats': formats,
+                            'description': traverse_obj(simorgh_data, ('pageData', 'promo', 'summary', {str})),
+                            **traverse_obj(model, {
+                                'title': ('title', {str}),
+                                'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
+                                'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
+                                'timestamp': ('firstPublished', {functools.partial(int_or_none, scale=1000)}),
+                            }),
+                        }
+                        done = True
+                if entry:
+                    entries.append(entry)
+                if done:
+                    break
+            if entries:
+                return self.playlist_result(
+                    entries, playlist_id, playlist_title, playlist_description)
+
          def extract_all(pattern):
-            return list(filter(None, map(
-                lambda s: self._parse_json(s, playlist_id, fatal=False),
-                re.findall(pattern, webpage))))
+            return list(filter(None, (
+                self._parse_json(s, playlist_id, fatal=False)
+                for s in re.findall(pattern, webpage))))
+
+        # US accessed article with single embedded video (e.g.
+        # https://www.bbc.com/news/uk-68546268)
+        next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}),
+                                 ('props', 'pageProps', 'page'))
+        model = traverse_obj(next_data, (
+            ..., 'contents', is_type('video'),
+            'model', 'blocks', is_type('media'),
+            'model', 'blocks', is_type('mediaMetadata'),
+            'model', {dict}, any))
+        if model and (entry := parse_model(model)):
+            if not entry.get('timestamp'):
+                entry['timestamp'] = traverse_obj(next_data, (
+                    ..., 'contents', is_type('timestamp'), 'model',
+                    'timestamp', {functools.partial(int_or_none, scale=1000)}, any))
+            entries.append(entry)
+            return self.playlist_result(
+                entries, playlist_id, playlist_title, playlist_description)
  
          # Multiple video article (e.g.
          # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
-        EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
+        EMBED_URL = rf'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+{self._ID_REGEX}(?:\b[^"]+)?'
          entries = []
          for match in extract_all(r'new\s+SMP\(({.+?})\)'):
              embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
              if embed_url and re.match(EMBED_URL, embed_url):
                  entries.append(embed_url)
          entries.extend(re.findall(
-            r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
+            rf'setPlaylist\("({EMBED_URL})"\)', webpage))
          if entries:
              return self.playlist_result(
                  [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
@@ -1272,15 +1489,14 @@ def extract_all(pattern):
              formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
              if not formats and not self.get_param('ignore_no_formats'):
                  continue
-            self._sort_formats(formats)
  
              video_id = media_meta.get('externalId')
              if not video_id:
-                video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
+                video_id = playlist_id if len(medias) == 1 else f'{playlist_id}-{num}'
  
              title = media_meta.get('caption')
              if not title:
-                title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
+                title = playlist_title if len(medias) == 1 else f'{playlist_title} - Video {num}'
  
              duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
  
@@ -1341,8 +1557,8 @@ def _real_extract(self, url):
  
  class BBCCoUkPlaylistBaseIE(InfoExtractor):
      def _entries(self, webpage, url, playlist_id):
-        single_page = 'page' in compat_urlparse.parse_qs(
-            compat_urlparse.urlparse(url).query)
+        single_page = 'page' in urllib.parse.parse_qs(
+            urllib.parse.urlparse(url).query)
          for page_num in itertools.count(2):
              for video_id in re.findall(
                      self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
@@ -1356,8 +1572,8 @@ def _entries(self, webpage, url, playlist_id):
              if not next_page:
                  break
              webpage = self._download_webpage(
-                compat_urlparse.urljoin(url, next_page), playlist_id,
-                'Downloading page %d' % page_num, page_num)
+                urllib.parse.urljoin(url, next_page), playlist_id,
+                f'Downloading page {page_num}', page_num)
  
      def _real_extract(self, url):
          playlist_id = self._match_id(url)
@@ -1372,7 +1588,7 @@ def _real_extract(self, url):
  
  
  class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
-    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
+    _VALID_URL_TMPL = rf'https?://(?:www\.)?bbc\.co\.uk/iplayer/%s/(?P<id>{BBCCoUkIE._ID_REGEX})'
  
      @staticmethod
      def _get_default(episode, key, default_key='default'):
@@ -1496,11 +1712,11 @@ def _call_api(self, pid, per_page, page=1, series_id=None):
              variables['sliceId'] = series_id
          return self._download_json(
              'https://graph.ibl.api.bbc.co.uk/', pid, headers={
-                'Content-Type': 'application/json'
+                'Content-Type': 'application/json',
              }, data=json.dumps({
                  'id': '5692d93d5aac8d796a0305e895e61551',
                  'variables': variables,
-            }).encode('utf-8'))['data']['programme']
+            }).encode())['data']['programme']
  
      @staticmethod
      def _get_playlist_data(data):
@@ -1560,7 +1776,7 @@ def _get_episode(element):
  
      def _call_api(self, pid, per_page, page=1, series_id=None):
          return self._download_json(
-            'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid,
+            f'http://ibl.api.bbc.co.uk/ibl/v1/groups/{pid}/episodes',
              pid, query={
                  'page': page,
                  'per_page': per_page,
@@ -1576,7 +1792,7 @@ def _get_playlist_title(self, data):
  
  class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
      IE_NAME = 'bbc.co.uk:playlist'
-    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
+    _VALID_URL = rf'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>{BBCCoUkIE._ID_REGEX})/(?:episodes|broadcasts|clips)'
      _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
      _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
      _TESTS = [{