[ie/orf:on] Improve extraction (#9677)

[yt-dlp.git] / yt_dlp / extractor / bbc.py
diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py

index 89fce8d5a8715724000a0b2aef21b8a2c63a5f5f..f6b58b361f8760c1a64a47aacb5523dc46fed4b1 100644 (file)
--- a/yt_dlp/extractor/bbc.py
+++ b/yt_dlp/extractor/bbc.py
@@ -2,11 +2,11 @@
  import itertools
  import json
  import re
-import urllib.error
  import xml.etree.ElementTree
  
  from .common import InfoExtractor
-from ..compat import compat_HTTPError, compat_str, compat_urlparse
+from ..compat import compat_str, compat_urlparse
+from ..networking.exceptions import HTTPError
  from ..utils import (
      ExtractorError,
      OnDemandPagedList,
@@ -15,11 +15,13 @@
      float_or_none,
      get_element_by_class,
      int_or_none,
+    join_nonempty,
      js_to_json,
      parse_duration,
      parse_iso8601,
      parse_qs,
      strip_or_none,
+    traverse_obj,
      try_get,
      unescapeHTML,
      unified_timestamp,
@@ -41,7 +43,6 @@ class BBCCoUkIE(InfoExtractor):
                              iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
                              music/(?:clips|audiovideo/popular)[/#]|
                              radio/player/|
-                            sounds/play/|
                              events/[^/]+/play/[^/]+/
                          )
                          (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
@@ -218,20 +219,6 @@ class BBCCoUkIE(InfoExtractor):
                  # rtmp download
                  'skip_download': True,
              },
-        }, {
-            'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
-            'note': 'Audio',
-            'info_dict': {
-                'id': 'm0007jz9',
-                'ext': 'mp4',
-                'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
-                'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
-                'duration': 9840,
-            },
-            'params': {
-                # rtmp download
-                'skip_download': True,
-            }
          }, {
              'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
              'only_matching': True,
@@ -277,7 +264,7 @@ def _perform_login(self, username, password):
              post_url, None, 'Logging in', data=urlencode_postdata(login_form),
              headers={'Referer': self._LOGIN_URL})
  
-        if self._LOGIN_URL in urlh.geturl():
+        if self._LOGIN_URL in urlh.url:
              error = clean_html(get_element_by_class('form-message', response))
              if error:
                  raise ExtractorError(
@@ -330,16 +317,25 @@ def _raise_extractor_error(self, media_selection_error):
  
      def _download_media_selector(self, programme_id):
          last_exception = None
+        formats, subtitles = [], {}
          for media_set in self._MEDIA_SETS:
              try:
-                return self._download_media_selector_url(
+                fmts, subs = self._download_media_selector_url(
                      self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
+                formats.extend(fmts)
+                if subs:
+                    self._merge_subtitles(subs, target=subtitles)
              except BBCCoUkIE.MediaSelectionError as e:
                  if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
                      last_exception = e
                      continue
                  self._raise_extractor_error(e)
-        self._raise_extractor_error(last_exception)
+        if last_exception:
+            if formats or subtitles:
+                self.report_warning(f'{self.IE_NAME} returned error: {last_exception.id}')
+            else:
+                self._raise_extractor_error(last_exception)
+        return formats, subtitles
  
      def _download_media_selector_url(self, url, programme_id=None):
          media_selection = self._download_json(
@@ -388,8 +384,8 @@ def _process_media_selector(self, media_selection, programme_id):
                                  href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                                  m3u8_id=format_id, fatal=False)
                          except ExtractorError as e:
-                            if not (isinstance(e.exc_info[1], urllib.error.HTTPError)
-                                    and e.exc_info[1].code in (403, 404)):
+                            if not (isinstance(e.exc_info[1], HTTPError)
+                                    and e.exc_info[1].status in (403, 404)):
                                  raise
                              fmts = []
                          formats.extend(fmts)
@@ -472,7 +468,7 @@ def _download_playlist(self, playlist_id):
  
              return programme_id, title, description, duration, formats, subtitles
          except ExtractorError as ee:
-            if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
+            if not (isinstance(ee.cause, HTTPError) and ee.cause.status == 404):
                  raise
  
          # fallback to legacy playlist
@@ -575,8 +571,6 @@ def _real_extract(self, url):
          else:
              programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
  
-        self._sort_formats(formats)
-
          return {
              'id': programme_id,
              'title': title,
@@ -588,10 +582,15 @@ def _real_extract(self, url):
          }
  
  
-class BBCIE(BBCCoUkIE):
+class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
      IE_NAME = 'bbc'
      IE_DESC = 'BBC'
-    _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
+    _VALID_URL = r'''(?x)
+        https?://(?:www\.)?(?:
+            bbc\.(?:com|co\.uk)|
+            bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd\.onion|
+            bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad\.onion
+        )/(?:[^/]+/)+(?P<id>[^/#?]+)'''
  
      _MEDIA_SETS = [
          'pc',
@@ -603,7 +602,7 @@ class BBCIE(BBCCoUkIE):
          'url': 'http://www.bbc.com/news/world-europe-32668511',
          'info_dict': {
              'id': 'world-europe-32668511',
-            'title': 'Russia stages massive WW2 parade',
+            'title': 'Russia stages massive WW2 parade despite Western boycott',
              'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
          },
          'playlist_count': 2,
@@ -624,6 +623,7 @@ class BBCIE(BBCCoUkIE):
          'info_dict': {
              'id': '3662a707-0af9-3149-963f-47bea720b460',
              'title': 'BUGGER',
+            'description': r're:BUGGER  The recent revelations by the whistleblower Edward Snowden were fascinating. .{211}\.{3}$',
          },
          'playlist_count': 18,
      }, {
@@ -632,14 +632,14 @@ class BBCIE(BBCCoUkIE):
          'info_dict': {
              'id': 'p02mprgb',
              'ext': 'mp4',
-            'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
-            'description': 'md5:2868290467291b37feda7863f7a83f54',
+            'title': 'Germanwings crash site aerial video',
+            'description': r're:(?s)Aerial video showed the site where the Germanwings flight 4U 9525, .{156} BFM TV\.$',
              'duration': 47,
              'timestamp': 1427219242,
              'upload_date': '20150324',
+            'thumbnail': 'https://ichef.bbci.co.uk/news/1024/media/images/81879000/jpg/_81879090_81879089.jpg',
          },
          'params': {
-            # rtmp download
              'skip_download': True,
          }
      }, {
@@ -657,21 +657,24 @@ class BBCIE(BBCCoUkIE):
          },
          'params': {
              'skip_download': True,
-        }
+        },
+        'skip': 'now SIMORGH_DATA with no video',
      }, {
          # single video embedded with data-playable containing XML playlists (regional section)
          'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
          'info_dict': {
-            'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
+            'id': '39275083',
+            'display_id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
              'ext': 'mp4',
              'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
-            'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
+            'description': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
              'timestamp': 1434713142,
              'upload_date': '20150619',
+            'thumbnail': 'https://a.files.bbci.co.uk/worldservice/live/assets/images/2015/06/19/150619132146_honduras_hsopitales_militares_640x360_aptn_nocredit.jpg',
          },
          'params': {
              'skip_download': True,
-        }
+        },
      }, {
          # single video from video playlist embedded with vxp-playlist-data JSON
          'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
@@ -684,22 +687,21 @@ class BBCIE(BBCCoUkIE):
          },
          'params': {
              'skip_download': True,
-        }
+        },
+        'skip': '404 Not Found',
      }, {
-        # single video story with digitalData
+        # single video story with __PWA_PRELOADED_STATE__
          'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
          'info_dict': {
              'id': 'p02q6gc4',
-            'ext': 'flv',
-            'title': 'Sri Lanka’s spicy secret',
-            'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
-            'timestamp': 1437674293,
-            'upload_date': '20150723',
+            'ext': 'mp4',
+            'title': 'Tasting the spice of life in Jaffna',
+            'description': r're:(?s)BBC Travel Show’s Henry Golding explores the city of Jaffna .{151} aftertaste\.$',
+            'timestamp': 1646058397,
+            'upload_date': '20220228',
+            'duration': 255,
+            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1920xn/p02vxvkn.jpg',
          },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        }
      }, {
          # single video story without digitalData
          'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
@@ -711,12 +713,10 @@ class BBCIE(BBCCoUkIE):
              'timestamp': 1415867444,
              'upload_date': '20141113',
          },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        }
+        'skip': 'redirects to TopGear home page',
      }, {
          # single video embedded with Morph
+        # TODO: replacement test page
          'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
          'info_dict': {
              'id': 'p041vhd0',
@@ -727,27 +727,22 @@ class BBCIE(BBCCoUkIE):
              'uploader': 'BBC Sport',
              'uploader_id': 'bbc_sport',
          },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
-        'skip': 'Georestricted to UK',
+        'skip': 'Video no longer in page',
      }, {
-        # single video with playlist.sxml URL in playlist param
+        # single video in __INITIAL_DATA__
          'url': 'http://www.bbc.com/sport/0/football/33653409',
          'info_dict': {
              'id': 'p02xycnp',
              'ext': 'mp4',
-            'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
-            'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
+            'title': 'Ronaldo to Man Utd, Arsenal to spend?',
+            'description': r're:(?s)BBC Sport\'s David Ornstein rounds up the latest transfer reports, .{359} here\.$',
+            'timestamp': 1437750175,
+            'upload_date': '20150724',
+            'thumbnail': r're:https?://.+/.+media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png',
              'duration': 140,
          },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        }
      }, {
-        # article with multiple videos embedded with playlist.sxml in playlist param
+        # article with multiple videos embedded with Morph.setPayload
          'url': 'http://www.bbc.com/sport/0/football/34475836',
          'info_dict': {
              'id': '34475836',
@@ -755,6 +750,21 @@ class BBCIE(BBCCoUkIE):
              'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
          },
          'playlist_count': 3,
+    }, {
+        # Testing noplaylist
+        'url': 'http://www.bbc.com/sport/0/football/34475836',
+        'info_dict': {
+            'id': 'p034ppnv',
+            'ext': 'mp4',
+            'title': 'All you need to know about Jurgen Klopp',
+            'timestamp': 1444335081,
+            'upload_date': '20151008',
+            'duration': 122.0,
+            'thumbnail': 'https://ichef.bbci.co.uk/onesport/cps/976/cpsprodpb/7542/production/_85981003_klopp.jpg',
+        },
+        'params': {
+            'noplaylist': True,
+        },
      }, {
          # school report article with single video
          'url': 'http://www.bbc.co.uk/schoolreport/35744779',
@@ -763,6 +773,7 @@ class BBCIE(BBCCoUkIE):
              'title': 'School which breaks down barriers in Jerusalem',
          },
          'playlist_count': 1,
+        'skip': 'redirects to Young Reporter home page https://www.bbc.co.uk/news/topics/cg41ylwv43pt',
      }, {
          # single video with playlist URL from weather section
          'url': 'http://www.bbc.com/weather/features/33601775',
@@ -779,18 +790,33 @@ class BBCIE(BBCCoUkIE):
              'thumbnail': r're:https?://.+/.+\.jpg',
              'timestamp': 1437785037,
              'upload_date': '20150725',
+            'duration': 105,
          },
      }, {
          # video with window.__INITIAL_DATA__ and value as JSON string
          'url': 'https://www.bbc.com/news/av/world-europe-59468682',
          'info_dict': {
-            'id': 'p0b71qth',
+            'id': 'p0b779gc',
              'ext': 'mp4',
              'title': 'Why France is making this woman a national hero',
-            'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
+            'description': r're:(?s)France is honouring the US-born 20th Century singer and activist Josephine .{208} Second World War.',
              'thumbnail': r're:https?://.+/.+\.jpg',
-            'timestamp': 1638230731,
-            'upload_date': '20211130',
+            'timestamp': 1638215626,
+            'upload_date': '20211129',
+            'duration': 125,
+        },
+    }, {
+        # video with script id __NEXT_DATA__ and value as JSON string
+        'url': 'https://www.bbc.com/news/uk-68546268',
+        'info_dict': {
+            'id': 'p0hj0lq7',
+            'ext': 'mp4',
+            'title': 'Nasser Hospital doctor describes his treatment by IDF',
+            'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276} hostages\."$',
+            'thumbnail': r're:https?://.+/.+\.jpg',
+            'timestamp': 1710188248,
+            'upload_date': '20240311',
+            'duration': 104,
          },
      }, {
          # single video article embedded with data-media-vpid
@@ -818,6 +844,7 @@ class BBCIE(BBCCoUkIE):
              'uploader': 'Radio 3',
              'uploader_id': 'bbc_radio_three',
          },
+        'skip': '404 Not Found',
      }, {
          'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
          'info_dict': {
@@ -825,6 +852,7 @@ class BBCIE(BBCCoUkIE):
              'ext': 'mp4',
              'title': 'md5:2fabf12a726603193a2879a055f72514',
              'description': 'Learn English words and phrases from this story',
+            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1200x675/p06pq9gk.jpg',
          },
          'add_ie': [BBCCoUkIE.ie_key()],
      }, {
@@ -833,14 +861,36 @@ class BBCIE(BBCCoUkIE):
          'info_dict': {
              'id': 'p07c6sb9',
              'ext': 'mp4',
-            'title': 'How positive thinking is harming your happiness',
-            'alt_title': 'The downsides of positive thinking',
-            'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
+            'title': 'The downsides of positive thinking',
+            'description': 'The downsides of positive thinking',
              'duration': 235,
-            'thumbnail': r're:https?://.+/p07c9dsr.jpg',
-            'upload_date': '20190604',
-            'categories': ['Psychology'],
+            'thumbnail': r're:https?://.+/p07c9dsr\.(?:jpg|webp|png)',
+            'upload_date': '20220223',
+            'timestamp': 1645632746,
          },
+    }, {
+        # BBC Sounds
+        'url': 'https://www.bbc.co.uk/sounds/play/w3ct5rgx',
+        'info_dict': {
+            'id': 'p0hrw4nr',
+            'ext': 'mp4',
+            'title': 'Are our coastlines being washed away?',
+            'description': r're:(?s)Around the world, coastlines are constantly changing .{2000,} Images\)$',
+            'timestamp': 1713556800,
+            'upload_date': '20240419',
+            'duration': 1588,
+            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0hrnxbl.jpg',
+            'uploader': 'World Service',
+            'uploader_id': 'bbc_world_service',
+            'series': 'CrowdScience',
+            'chapters': [],
+        }
+    }, {  # onion routes
+        'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad.onion/sport/av/football/63195681',
+        'only_matching': True,
      }]
  
      @classmethod
@@ -879,7 +929,6 @@ def _extract_from_media_meta(self, media_meta, video_id):
      def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
          programme_id, title, description, duration, formats, subtitles = \
              self._process_legacy_playlist_url(url, playlist_id)
-        self._sort_formats(formats)
          return {
              'id': programme_id,
              'title': title,
@@ -943,7 +992,6 @@ def _real_extract(self, url):
                              duration = int_or_none(items[0].get('duration'))
                              programme_id = items[0].get('vpid')
                              formats, subtitles = self._download_media_selector(programme_id)
-                            self._sort_formats(formats)
                              entries.append({
                                  'id': programme_id,
                                  'title': title,
@@ -976,11 +1024,10 @@ def _real_extract(self, url):
                                      # Some playlist URL may fail with 500, at the same time
                                      # the other one may work fine (e.g.
                                      # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
-                                    if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
+                                    if isinstance(e.cause, HTTPError) and e.cause.status == 500:
                                          continue
                                      raise
                              if entry:
-                                self._sort_formats(entry['formats'])
                                  entries.append(entry)
  
          if entries:
@@ -992,8 +1039,7 @@ def _real_extract(self, url):
              webpage, 'group id', default=None)
          if group_id:
              return self.url_result(
-                'https://www.bbc.co.uk/programmes/%s' % group_id,
-                ie=BBCCoUkIE.ie_key())
+                f'https://www.bbc.co.uk/programmes/{group_id}', BBCCoUkIE)
  
          # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
          programme_id = self._search_regex(
@@ -1004,7 +1050,6 @@ def _real_extract(self, url):
  
          if programme_id:
              formats, subtitles = self._download_media_selector(programme_id)
-            self._sort_formats(formats)
              # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
              digital_data = self._parse_json(
                  self._search_regex(
@@ -1036,7 +1081,6 @@ def _real_extract(self, url):
              if version_id:
                  title = smp_data['title']
                  formats, subtitles = self._download_media_selector(version_id)
-                self._sort_formats(formats)
                  image_url = smp_data.get('holdingImageURL')
                  display_date = init_data.get('displayDate')
                  topic_title = init_data.get('topicTitle')
@@ -1055,78 +1099,133 @@ def _real_extract(self, url):
                  }
  
          # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
-        # There are several setPayload calls may be present but the video
-        # seems to be always related to the first one
-        morph_payload = self._parse_json(
-            self._search_regex(
-                r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
-                webpage, 'morph payload', default='{}'),
-            playlist_id, fatal=False)
+        # Several setPayload calls may be present but the video(s)
+        # should be in one that mentions leadMedia or videoData
+        morph_payload = self._search_json(
+            r'\bMorph\s*\.\s*setPayload\s*\([^,]+,', webpage, 'morph payload', playlist_id,
+            contains_pattern=r'{(?s:(?:(?!</script>).)+(?:"leadMedia"|\\"videoData\\")\s*:.+)}',
+            default={})
          if morph_payload:
-            components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
-            for component in components:
-                if not isinstance(component, dict):
-                    continue
-                lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
-                if not lead_media:
-                    continue
-                identifiers = lead_media.get('identifiers')
-                if not identifiers or not isinstance(identifiers, dict):
-                    continue
-                programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
+            for lead_media in traverse_obj(morph_payload, (
+                    'body', 'components', ..., 'props', 'leadMedia', {dict})):
+                programme_id = traverse_obj(lead_media, ('identifiers', ('vpid', 'playablePid'), {str}, any))
                  if not programme_id:
                      continue
-                title = lead_media.get('title') or self._og_search_title(webpage)
                  formats, subtitles = self._download_media_selector(programme_id)
-                self._sort_formats(formats)
-                description = lead_media.get('summary')
-                uploader = lead_media.get('masterBrand')
-                uploader_id = lead_media.get('mid')
-                duration = None
-                duration_d = lead_media.get('duration')
-                if isinstance(duration_d, dict):
-                    duration = parse_duration(dict_get(
-                        duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
                  return {
                      'id': programme_id,
-                    'title': title,
-                    'description': description,
-                    'duration': duration,
-                    'uploader': uploader,
-                    'uploader_id': uploader_id,
+                    'title': lead_media.get('title') or self._og_search_title(webpage),
+                    **traverse_obj(lead_media, {
+                        'description': ('summary', {str}),
+                        'duration': ('duration', ('rawDuration', 'formattedDuration', 'spokenDuration'), {parse_duration}),
+                        'uploader': ('masterBrand', {str}),
+                        'uploader_id': ('mid', {str}),
+                    }),
                      'formats': formats,
                      'subtitles': subtitles,
                  }
+            body = self._parse_json(traverse_obj(morph_payload, (
+                'body', 'content', 'article', 'body')), playlist_id, fatal=False)
+            for video_data in traverse_obj(body, (lambda _, v: v['videoData']['pid'], 'videoData')):
+                if video_data.get('vpid'):
+                    video_id = video_data['vpid']
+                    formats, subtitles = self._download_media_selector(video_id)
+                    entry = {
+                        'id': video_id,
+                        'formats': formats,
+                        'subtitles': subtitles,
+                    }
+                else:
+                    video_id = video_data['pid']
+                    entry = self.url_result(
+                        f'https://www.bbc.co.uk/programmes/{video_id}', BBCCoUkIE,
+                        video_id, url_transparent=True)
+                entry.update({
+                    'timestamp': traverse_obj(morph_payload, (
+                        'body', 'content', 'article', 'dateTimeInfo', 'dateTime', {parse_iso8601})
+                    ),
+                    **traverse_obj(video_data, {
+                        'thumbnail': (('iChefImage', 'image'), {url_or_none}, any),
+                        'title': (('title', 'caption'), {str}, any),
+                        'duration': ('duration', {parse_duration}),
+                    }),
+                })
+                if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id):
+                    return entry
+                entries.append(entry)
+            if entries:
+                playlist_title = traverse_obj(morph_payload, (
+                    'body', 'content', 'article', 'headline', {str})) or playlist_title
+                return self.playlist_result(
+                    entries, playlist_id, playlist_title, playlist_description)
  
-        preload_state = self._parse_json(self._search_regex(
-            r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
-            'preload state', default='{}'), playlist_id, fatal=False)
-        if preload_state:
-            current_programme = preload_state.get('programmes', {}).get('current') or {}
-            programme_id = current_programme.get('id')
-            if current_programme and programme_id and current_programme.get('type') == 'playable_item':
-                title = current_programme.get('titles', {}).get('tertiary') or playlist_title
-                formats, subtitles = self._download_media_selector(programme_id)
-                self._sort_formats(formats)
-                synopses = current_programme.get('synopses') or {}
-                network = current_programme.get('network') or {}
-                duration = int_or_none(
-                    current_programme.get('duration', {}).get('value'))
-                thumbnail = None
-                image_url = current_programme.get('image_url')
-                if image_url:
-                    thumbnail = image_url.replace('{recipe}', 'raw')
+        # various PRELOADED_STATE JSON
+        preload_state = self._search_json(
+            r'window\.__(?:PWA_)?PRELOADED_STATE__\s*=', webpage,
+            'preload state', playlist_id, transform_source=js_to_json, default={})
+        # PRELOADED_STATE with current programme
+        current_programme = traverse_obj(preload_state, ('programmes', 'current', {dict}))
+        programme_id = traverse_obj(current_programme, ('id', {str}))
+        if programme_id and current_programme.get('type') == 'playable_item':
+            title = traverse_obj(current_programme, ('titles', ('tertiary', 'secondary'), {str}, any)) or playlist_title
+            formats, subtitles = self._download_media_selector(programme_id)
+            return {
+                'id': programme_id,
+                'title': title,
+                'formats': formats,
+                **traverse_obj(current_programme, {
+                    'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
+                    'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}),
+                    'duration': ('duration', 'value', {int_or_none}),
+                    'uploader': ('network', 'short_title', {str}),
+                    'uploader_id': ('network', 'id', {str}),
+                    'timestamp': ((('availability', 'from'), ('release', 'date')), {parse_iso8601}, any),
+                    'series': ('titles', 'primary', {str}),
+                }),
+                'subtitles': subtitles,
+                'chapters': traverse_obj(preload_state, (
+                    'tracklist', 'tracks', lambda _, v: float(v['offset']['start']), {
+                        'title': ('titles', {lambda x: join_nonempty(
+                            'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
+                        'start_time': ('offset', 'start', {float_or_none}),
+                        'end_time': ('offset', 'end', {float_or_none}),
+                    })
+                ),
+            }
+
+        # PWA_PRELOADED_STATE with article video asset
+        asset_id = traverse_obj(preload_state, (
+            'entities', 'articles', lambda k, _: k.rsplit('/', 1)[-1] == playlist_id,
+            'assetVideo', 0, {str}, any))
+        if asset_id:
+            video_id = traverse_obj(preload_state, ('entities', 'videos', asset_id, 'vpid', {str}))
+            if video_id:
+                article = traverse_obj(preload_state, (
+                    'entities', 'articles', lambda _, v: v['assetVideo'][0] == asset_id, any))
+
+                def image_url(image_id):
+                    return traverse_obj(preload_state, (
+                        'entities', 'images', image_id, 'url',
+                        {lambda u: url_or_none(u.replace('$recipe', 'raw'))}))
+
+                formats, subtitles = self._download_media_selector(video_id)
                  return {
-                    'id': programme_id,
-                    'title': title,
-                    'description': dict_get(synopses, ('long', 'medium', 'short')),
-                    'thumbnail': thumbnail,
-                    'duration': duration,
-                    'uploader': network.get('short_title'),
-                    'uploader_id': network.get('id'),
+                    'id': video_id,
+                    **traverse_obj(preload_state, ('entities', 'videos', asset_id, {
+                        'title': ('title', {str}),
+                        'description': (('synopsisLong', 'synopsisMedium', 'synopsisShort'), {str}, any),
+                        'thumbnail': (0, {image_url}),
+                        'duration': ('duration', {int_or_none}),
+                    })),
                      'formats': formats,
                      'subtitles': subtitles,
+                    'timestamp': traverse_obj(article, ('displayDate', {parse_iso8601})),
                  }
+            else:
+                return self.url_result(
+                    f'https://www.bbc.co.uk/programmes/{asset_id}', BBCCoUkIE,
+                    asset_id, playlist_title, display_id=playlist_id,
+                    description=playlist_description)
  
          bbc3_config = self._parse_json(
              self._search_regex(
@@ -1140,7 +1239,6 @@ def _real_extract(self, url):
              clip_title = clip.get('title')
              if clip_vpid and clip_title:
                  formats, subtitles = self._download_media_selector(clip_vpid)
-                self._sort_formats(formats)
                  return {
                      'id': clip_vpid,
                      'title': clip_title,
@@ -1162,7 +1260,6 @@ def _real_extract(self, url):
                      if not programme_id:
                          continue
                      formats, subtitles = self._download_media_selector(programme_id)
-                    self._sort_formats(formats)
                      entries.append({
                          'id': programme_id,
                          'title': playlist_title,
@@ -1174,17 +1271,52 @@ def _real_extract(self, url):
                  return self.playlist_result(
                      entries, playlist_id, playlist_title, playlist_description)
  
+        def parse_model(model):
+            """Return an info dict for the first entry of model['versions'], or None when no usable versionId string is found"""
+            version_id = traverse_obj(model, ('versions', 0, 'versionId', {str}))
+            if not version_id:
+                return None
+            # formats and subtitles are looked up by version ID through the media selector helper
+            formats, subtitles = self._download_media_selector(version_id)
+            metadata = traverse_obj(model, {
+                'title': ('title', {str}),
+                # any '$recipe' placeholder in imageUrl is replaced with 'raw', then joined with the page URL
+                'thumbnail': ('imageUrl', {lambda src: urljoin(url, src.replace('$recipe', 'raw'))}),
+                # synopses are tried in long/medium/short order; empty strings are discarded
+                'description': ('synopses', ('long', 'medium', 'short'), {str}, {lambda text: text or None}, any),
+                'duration': ('versions', 0, 'duration', {int}),
+                # availableFrom is scaled by 1000 (presumably milliseconds -> seconds; confirm against the API)
+                'timestamp': ('versions', 0, 'availableFrom', {functools.partial(int_or_none, scale=1000)}),
+            })
+            return {'id': version_id, 'formats': formats, 'subtitles': subtitles, **metadata}
+
+        def is_type(*wanted):  # filter step for traverse_obj paths: matches values whose 'type' is one of `wanted`
+            return lambda _key, item: item['type'] in wanted
+
          initial_data = self._search_regex(
              r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
              'quoted preload state', default=None)
          if initial_data is None:
              initial_data = self._search_regex(
                  r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
-                'preload state', default={})
+                'preload state', default='{}')
          else:
              initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
          initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
          if initial_data:
+            for video_data in traverse_obj(initial_data, (
+                    'stores', 'article', 'articleBodyContent', is_type('video'))):
+                model = traverse_obj(video_data, (
+                    'model', 'blocks', is_type('aresMedia'),
+                    'model', 'blocks', is_type('aresMediaMetadata'),
+                    'model', {dict}, any))
+                entry = parse_model(model)
+                if entry:
+                    entries.append(entry)
+            if entries:
+                return self.playlist_result(
+                    entries, playlist_id, playlist_title, playlist_description)
+
              def parse_media(media):
                  if not media:
                      return
@@ -1194,7 +1326,6 @@ def parse_media(media):
                      if not (item_id and item_title):
                          continue
                      formats, subtitles = self._download_media_selector(item_id)
-                    self._sort_formats(formats)
                      item_desc = None
                      blocks = try_get(media, lambda x: x['summary']['blocks'], list)
                      if blocks:
@@ -1218,27 +1349,90 @@ def parse_media(media):
                          'subtitles': subtitles,
                          'timestamp': item_time,
                          'description': strip_or_none(item_desc),
+                        'duration': int_or_none(item.get('duration')),
                      })
-            for resp in (initial_data.get('data') or {}).values():
-                name = resp.get('name')
+
+            for resp in traverse_obj(initial_data, ('data', lambda _, v: v['name'])):
+                name = resp['name']
                  if name == 'media-experience':
                      parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
                  elif name == 'article':
-                    for block in (try_get(resp,
-                                          (lambda x: x['data']['blocks'],
-                                           lambda x: x['data']['content']['model']['blocks'],),
-                                          list) or []):
-                        if block.get('type') not in ['media', 'video']:
-                            continue
-                        parse_media(block.get('model'))
+                    for block in traverse_obj(resp, (
+                            'data', (None, ('content', 'model')), 'blocks',
+                            is_type('media', 'video'), 'model', {dict})):
+                        parse_media(block)
              return self.playlist_result(
                  entries, playlist_id, playlist_title, playlist_description)
  
+        # extract from SIMORGH_DATA hydration JSON
+        simorgh_data = self._search_json(
+            r'window\s*\.\s*SIMORGH_DATA\s*=', webpage,
+            'simorgh data', playlist_id, default={})
+        if simorgh_data:
+            done = False
+            for video_data in traverse_obj(simorgh_data, (
+                    'pageData', 'content', 'model', 'blocks', is_type('video', 'legacyMedia'))):
+                model = traverse_obj(video_data, (
+                    'model', 'blocks', is_type('aresMedia'),
+                    'model', 'blocks', is_type('aresMediaMetadata'),
+                    'model', {dict}, any))
+                if video_data['type'] == 'video':
+                    entry = parse_model(model)
+                else:  # legacyMedia: no duration, subtitles
+                    block_id, entry = traverse_obj(model, ('blockId', {str})), None
+                    media_data = traverse_obj(simorgh_data, (
+                        'pageData', 'promo', 'media',
+                        {lambda x: x if x['id'] == block_id else None}))
+                    formats = traverse_obj(media_data, ('playlist', lambda _, v: url_or_none(v['url']), {
+                        'url': ('url', {url_or_none}),
+                        'ext': ('format', {str}),
+                        'tbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}),
+                    }))
+                    if formats:
+                        entry = {
+                            'id': block_id,
+                            'display_id': playlist_id,
+                            'formats': formats,
+                            'description': traverse_obj(simorgh_data, ('pageData', 'promo', 'summary', {str})),
+                            **traverse_obj(model, {
+                                'title': ('title', {str}),
+                                'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
+                                'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
+                                'timestamp': ('firstPublished', {functools.partial(int_or_none, scale=1000)}),
+                            }),
+                        }
+                        done = True
+                if entry:
+                    entries.append(entry)
+                if done:
+                    break
+            if entries:
+                return self.playlist_result(
+                    entries, playlist_id, playlist_title, playlist_description)
+
          def extract_all(pattern):
              return list(filter(None, map(
                  lambda s: self._parse_json(s, playlist_id, fatal=False),
                  re.findall(pattern, webpage))))
  
+        # Article accessed from the US with a single embedded video (e.g.
+        # https://www.bbc.com/news/uk-68546268)
+        next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}),
+                                 ('props', 'pageProps', 'page'))
+        model = traverse_obj(next_data, (
+            ..., 'contents', is_type('video'),
+            'model', 'blocks', is_type('media'),
+            'model', 'blocks', is_type('mediaMetadata'),
+            'model', {dict}, any))
+        if model and (entry := parse_model(model)):
+            if not entry.get('timestamp'):
+                entry['timestamp'] = traverse_obj(next_data, (
+                    ..., 'contents', is_type('timestamp'), 'model',
+                    'timestamp', {functools.partial(int_or_none, scale=1000)}, any))
+            entries.append(entry)
+            return self.playlist_result(
+                entries, playlist_id, playlist_title, playlist_description)
+
          # Multiple video article (e.g.
          # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
          EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
@@ -1295,7 +1489,6 @@ def extract_all(pattern):
              formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
              if not formats and not self.get_param('ignore_no_formats'):
                  continue
-            self._sort_formats(formats)
  
              video_id = media_meta.get('externalId')
              if not video_id: