]> jfr.im git - yt-dlp.git/blobdiff - yt_dlp/extractor/bbc.py
[ie] Migrate commonly plural fields to lists (#8917)
[yt-dlp.git] / yt_dlp / extractor / bbc.py
index 8231557300866ff071b7219ffda0be9186334daf..015af9e1d616c9b71a1ae0eb6f3b15490b6218fd 100644 (file)
@@ -1,19 +1,12 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import functools
 import itertools
 import json
 import re
+import xml.etree.ElementTree
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_etree_Element,
-    compat_HTTPError,
-    compat_str,
-    compat_urllib_error,
-    compat_urlparse,
-)
+from ..compat import compat_str, compat_urlparse
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     OnDemandPagedList,
     float_or_none,
     get_element_by_class,
     int_or_none,
+    join_nonempty,
     js_to_json,
     parse_duration,
     parse_iso8601,
     parse_qs,
     strip_or_none,
+    traverse_obj,
     try_get,
     unescapeHTML,
     unified_timestamp,
@@ -48,11 +43,11 @@ class BBCCoUkIE(InfoExtractor):
                             iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
                             music/(?:clips|audiovideo/popular)[/#]|
                             radio/player/|
-                            sounds/play/|
                             events/[^/]+/play/[^/]+/
                         )
                         (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
                     ''' % _ID_REGEX
+    _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']
 
     _LOGIN_URL = 'https://account.bbc.com/signin'
     _NETRC_MACHINE = 'bbc'
@@ -224,20 +219,6 @@ class BBCCoUkIE(InfoExtractor):
                 # rtmp download
                 'skip_download': True,
             },
-        }, {
-            'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
-            'note': 'Audio',
-            'info_dict': {
-                'id': 'm0007jz9',
-                'ext': 'mp4',
-                'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
-                'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
-                'duration': 9840,
-            },
-            'params': {
-                # rtmp download
-                'skip_download': True,
-            }
         }, {
             'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
             'only_matching': True,
@@ -283,7 +264,7 @@ def _perform_login(self, username, password):
             post_url, None, 'Logging in', data=urlencode_postdata(login_form),
             headers={'Referer': self._LOGIN_URL})
 
-        if self._LOGIN_URL in urlh.geturl():
+        if self._LOGIN_URL in urlh.url:
             error = clean_html(get_element_by_class('form-message', response))
             if error:
                 raise ExtractorError(
@@ -318,7 +299,7 @@ def _get_subtitles(self, media, programme_id):
                 continue
             captions = self._download_xml(
                 cc_url, programme_id, 'Downloading captions', fatal=False)
-            if not isinstance(captions, compat_etree_Element):
+            if not isinstance(captions, xml.etree.ElementTree.Element):
                 continue
             subtitles['en'] = [
                 {
@@ -336,16 +317,25 @@ def _raise_extractor_error(self, media_selection_error):
 
     def _download_media_selector(self, programme_id):
         last_exception = None
+        formats, subtitles = [], {}
         for media_set in self._MEDIA_SETS:
             try:
-                return self._download_media_selector_url(
+                fmts, subs = self._download_media_selector_url(
                     self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
+                formats.extend(fmts)
+                if subs:
+                    self._merge_subtitles(subs, target=subtitles)
             except BBCCoUkIE.MediaSelectionError as e:
                 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
                     last_exception = e
                     continue
                 self._raise_extractor_error(e)
-        self._raise_extractor_error(last_exception)
+        if last_exception:
+            if formats or subtitles:
+                self.report_warning(f'{self.IE_NAME} returned error: {last_exception.id}')
+            else:
+                self._raise_extractor_error(last_exception)
+        return formats, subtitles
 
     def _download_media_selector_url(self, url, programme_id=None):
         media_selection = self._download_json(
@@ -394,8 +384,8 @@ def _process_media_selector(self, media_selection, programme_id):
                                 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                                 m3u8_id=format_id, fatal=False)
                         except ExtractorError as e:
-                            if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError)
-                                    and e.exc_info[1].code in (403, 404)):
+                            if not (isinstance(e.exc_info[1], HTTPError)
+                                    and e.exc_info[1].status in (403, 404)):
                                 raise
                             fmts = []
                         formats.extend(fmts)
@@ -478,7 +468,7 @@ def _download_playlist(self, playlist_id):
 
             return programme_id, title, description, duration, formats, subtitles
         except ExtractorError as ee:
-            if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
+            if not (isinstance(ee.cause, HTTPError) and ee.cause.status == 404):
                 raise
 
         # fallback to legacy playlist
@@ -581,8 +571,6 @@ def _real_extract(self, url):
         else:
             programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
 
-        self._sort_formats(formats)
-
         return {
             'id': programme_id,
             'title': title,
@@ -594,10 +582,15 @@ def _real_extract(self, url):
         }
 
 
-class BBCIE(BBCCoUkIE):
+class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
     IE_NAME = 'bbc'
     IE_DESC = 'BBC'
-    _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
+    _VALID_URL = r'''(?x)
+        https?://(?:www\.)?(?:
+            bbc\.(?:com|co\.uk)|
+            bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd\.onion|
+            bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad\.onion
+        )/(?:[^/]+/)+(?P<id>[^/#?]+)'''
 
     _MEDIA_SETS = [
         'pc',
@@ -847,6 +840,26 @@ class BBCIE(BBCCoUkIE):
             'upload_date': '20190604',
             'categories': ['Psychology'],
         },
+    }, {
+        # BBC Sounds
+        'url': 'https://www.bbc.co.uk/sounds/play/m001q78b',
+        'info_dict': {
+            'id': 'm001q789',
+            'ext': 'mp4',
+            'title': 'The Night Tracks Mix - Music for the darkling hour',
+            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0c00hym.jpg',
+            'chapters': 'count:8',
+            'description': 'md5:815fb51cbdaa270040aab8145b3f1d67',
+            'uploader': 'Radio 3',
+            'duration': 1800,
+            'uploader_id': 'bbc_radio_three',
+        },
+    }, {  # onion routes
+        'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad.onion/sport/av/football/63195681',
+        'only_matching': True,
     }]
 
     @classmethod
@@ -885,7 +898,6 @@ def _extract_from_media_meta(self, media_meta, video_id):
     def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
         programme_id, title, description, duration, formats, subtitles = \
             self._process_legacy_playlist_url(url, playlist_id)
-        self._sort_formats(formats)
         return {
             'id': programme_id,
             'title': title,
@@ -904,13 +916,8 @@ def _real_extract(self, url):
         json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
         timestamp = json_ld_info.get('timestamp')
 
-        playlist_title = json_ld_info.get('title')
-        if not playlist_title:
-            playlist_title = self._og_search_title(
-                webpage, default=None) or self._html_search_regex(
-                r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
-            if playlist_title:
-                playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
+        playlist_title = json_ld_info.get('title') or re.sub(
+            r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None
 
         playlist_description = json_ld_info.get(
             'description') or self._og_search_description(webpage, default=None)
@@ -954,7 +961,6 @@ def _real_extract(self, url):
                             duration = int_or_none(items[0].get('duration'))
                             programme_id = items[0].get('vpid')
                             formats, subtitles = self._download_media_selector(programme_id)
-                            self._sort_formats(formats)
                             entries.append({
                                 'id': programme_id,
                                 'title': title,
@@ -987,11 +993,10 @@ def _real_extract(self, url):
                                     # Some playlist URL may fail with 500, at the same time
                                     # the other one may work fine (e.g.
                                     # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
-                                    if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
+                                    if isinstance(e.cause, HTTPError) and e.cause.status == 500:
                                         continue
                                     raise
                             if entry:
-                                self._sort_formats(entry['formats'])
                                 entries.append(entry)
 
         if entries:
@@ -1015,7 +1020,6 @@ def _real_extract(self, url):
 
         if programme_id:
             formats, subtitles = self._download_media_selector(programme_id)
-            self._sort_formats(formats)
             # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
             digital_data = self._parse_json(
                 self._search_regex(
@@ -1047,7 +1051,6 @@ def _real_extract(self, url):
             if version_id:
                 title = smp_data['title']
                 formats, subtitles = self._download_media_selector(version_id)
-                self._sort_formats(formats)
                 image_url = smp_data.get('holdingImageURL')
                 display_date = init_data.get('displayDate')
                 topic_title = init_data.get('topicTitle')
@@ -1089,7 +1092,6 @@ def _real_extract(self, url):
                     continue
                 title = lead_media.get('title') or self._og_search_title(webpage)
                 formats, subtitles = self._download_media_selector(programme_id)
-                self._sort_formats(formats)
                 description = lead_media.get('summary')
                 uploader = lead_media.get('masterBrand')
                 uploader_id = lead_media.get('mid')
@@ -1118,7 +1120,6 @@ def _real_extract(self, url):
             if current_programme and programme_id and current_programme.get('type') == 'playable_item':
                 title = current_programme.get('titles', {}).get('tertiary') or playlist_title
                 formats, subtitles = self._download_media_selector(programme_id)
-                self._sort_formats(formats)
                 synopses = current_programme.get('synopses') or {}
                 network = current_programme.get('network') or {}
                 duration = int_or_none(
@@ -1137,6 +1138,13 @@ def _real_extract(self, url):
                     'uploader_id': network.get('id'),
                     'formats': formats,
                     'subtitles': subtitles,
+                    'chapters': traverse_obj(preload_state, (
+                        'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), {
+                            'title': ('titles', {lambda x: join_nonempty(
+                                'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
+                            'start_time': ('offset', 'start', {float_or_none}),
+                            'end_time': ('offset', 'end', {float_or_none}),
+                        })) or None,
                 }
 
         bbc3_config = self._parse_json(
@@ -1151,7 +1159,6 @@ def _real_extract(self, url):
             clip_title = clip.get('title')
             if clip_vpid and clip_title:
                 formats, subtitles = self._download_media_selector(clip_vpid)
-                self._sort_formats(formats)
                 return {
                     'id': clip_vpid,
                     'title': clip_title,
@@ -1173,7 +1180,6 @@ def _real_extract(self, url):
                     if not programme_id:
                         continue
                     formats, subtitles = self._download_media_selector(programme_id)
-                    self._sort_formats(formats)
                     entries.append({
                         'id': programme_id,
                         'title': playlist_title,
@@ -1191,7 +1197,7 @@ def _real_extract(self, url):
         if initial_data is None:
             initial_data = self._search_regex(
                 r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
-                'preload state', default={})
+                'preload state', default='{}')
         else:
             initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
         initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
@@ -1205,7 +1211,6 @@ def parse_media(media):
                     if not (item_id and item_title):
                         continue
                     formats, subtitles = self._download_media_selector(item_id)
-                    self._sort_formats(formats)
                     item_desc = None
                     blocks = try_get(media, lambda x: x['summary']['blocks'], list)
                     if blocks:
@@ -1239,7 +1244,7 @@ def parse_media(media):
                                           (lambda x: x['data']['blocks'],
                                            lambda x: x['data']['content']['model']['blocks'],),
                                           list) or []):
-                        if block.get('type') != 'media':
+                        if block.get('type') not in ['media', 'video']:
                             continue
                         parse_media(block.get('model'))
             return self.playlist_result(
@@ -1306,7 +1311,6 @@ def extract_all(pattern):
             formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
             if not formats and not self.get_param('ignore_no_formats'):
                 continue
-            self._sort_formats(formats)
 
             video_id = media_meta.get('externalId')
             if not video_id: