]> jfr.im git - yt-dlp.git/blobdiff - yt_dlp/extractor/bbc.py
[extractor] Deprecate `_sort_formats`
[yt-dlp.git] / yt_dlp / extractor / bbc.py
index b664a70075ee7b818e3a1d7854f8187bc76e9292..9d28e70a3aa501947104eef4d01958aecac297a8 100644 (file)
@@ -1,19 +1,12 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import functools
 import itertools
 import json
 import re
+import urllib.error
+import xml.etree.ElementTree
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_etree_Element,
-    compat_HTTPError,
-    compat_str,
-    compat_urllib_error,
-    compat_urlparse,
-)
+from ..compat import compat_HTTPError, compat_str, compat_urlparse
 from ..utils import (
     ExtractorError,
     OnDemandPagedList,
@@ -53,6 +46,7 @@ class BBCCoUkIE(InfoExtractor):
                         )
                         (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
                     ''' % _ID_REGEX
+    _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']
 
     _LOGIN_URL = 'https://account.bbc.com/signin'
     _NETRC_MACHINE = 'bbc'
@@ -264,11 +258,7 @@ class BBCCoUkIE(InfoExtractor):
             'only_matching': True,
         }]
 
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         login_page = self._download_webpage(
             self._LOGIN_URL, None, 'Downloading signin page')
 
@@ -294,9 +284,6 @@ def _login(self):
                     'Unable to login: %s' % error, expected=True)
             raise ExtractorError('Unable to log in')
 
-    def _real_initialize(self):
-        self._login()
-
     class MediaSelectionError(Exception):
         def __init__(self, id):
             self.id = id
@@ -325,7 +312,7 @@ def _get_subtitles(self, media, programme_id):
                 continue
             captions = self._download_xml(
                 cc_url, programme_id, 'Downloading captions', fatal=False)
-            if not isinstance(captions, compat_etree_Element):
+            if not isinstance(captions, xml.etree.ElementTree.Element):
                 continue
             subtitles['en'] = [
                 {
@@ -401,7 +388,7 @@ def _process_media_selector(self, media_selection, programme_id):
                                 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                                 m3u8_id=format_id, fatal=False)
                         except ExtractorError as e:
-                            if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError)
+                            if not (isinstance(e.exc_info[1], urllib.error.HTTPError)
                                     and e.exc_info[1].code in (403, 404)):
                                 raise
                             fmts = []
@@ -588,8 +575,6 @@ def _real_extract(self, url):
         else:
             programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
 
-        self._sort_formats(formats)
-
         return {
             'id': programme_id,
             'title': title,
@@ -601,10 +586,15 @@ def _real_extract(self, url):
         }
 
 
-class BBCIE(BBCCoUkIE):
+class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
     IE_NAME = 'bbc'
     IE_DESC = 'BBC'
-    _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
+    _VALID_URL = r'''(?x)
+        https?://(?:www\.)?(?:
+            bbc\.(?:com|co\.uk)|
+            bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd\.onion|
+            bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad\.onion
+        )/(?:[^/]+/)+(?P<id>[^/#?]+)'''
 
     _MEDIA_SETS = [
         'pc',
@@ -854,6 +844,12 @@ class BBCIE(BBCCoUkIE):
             'upload_date': '20190604',
             'categories': ['Psychology'],
         },
+    }, {  # onion routes
+        'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad.onion/sport/av/football/63195681',
+        'only_matching': True,
     }]
 
     @classmethod
@@ -892,7 +888,6 @@ def _extract_from_media_meta(self, media_meta, video_id):
     def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
         programme_id, title, description, duration, formats, subtitles = \
             self._process_legacy_playlist_url(url, playlist_id)
-        self._sort_formats(formats)
         return {
             'id': programme_id,
             'title': title,
@@ -911,13 +906,8 @@ def _real_extract(self, url):
         json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
         timestamp = json_ld_info.get('timestamp')
 
-        playlist_title = json_ld_info.get('title')
-        if not playlist_title:
-            playlist_title = self._og_search_title(
-                webpage, default=None) or self._html_search_regex(
-                r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
-            if playlist_title:
-                playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
+        playlist_title = json_ld_info.get('title') or re.sub(
+            r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None
 
         playlist_description = json_ld_info.get(
             'description') or self._og_search_description(webpage, default=None)
@@ -961,7 +951,6 @@ def _real_extract(self, url):
                             duration = int_or_none(items[0].get('duration'))
                             programme_id = items[0].get('vpid')
                             formats, subtitles = self._download_media_selector(programme_id)
-                            self._sort_formats(formats)
                             entries.append({
                                 'id': programme_id,
                                 'title': title,
@@ -998,7 +987,6 @@ def _real_extract(self, url):
                                         continue
                                     raise
                             if entry:
-                                self._sort_formats(entry['formats'])
                                 entries.append(entry)
 
         if entries:
@@ -1022,7 +1010,6 @@ def _real_extract(self, url):
 
         if programme_id:
             formats, subtitles = self._download_media_selector(programme_id)
-            self._sort_formats(formats)
             # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
             digital_data = self._parse_json(
                 self._search_regex(
@@ -1054,7 +1041,6 @@ def _real_extract(self, url):
             if version_id:
                 title = smp_data['title']
                 formats, subtitles = self._download_media_selector(version_id)
-                self._sort_formats(formats)
                 image_url = smp_data.get('holdingImageURL')
                 display_date = init_data.get('displayDate')
                 topic_title = init_data.get('topicTitle')
@@ -1096,7 +1082,6 @@ def _real_extract(self, url):
                     continue
                 title = lead_media.get('title') or self._og_search_title(webpage)
                 formats, subtitles = self._download_media_selector(programme_id)
-                self._sort_formats(formats)
                 description = lead_media.get('summary')
                 uploader = lead_media.get('masterBrand')
                 uploader_id = lead_media.get('mid')
@@ -1125,7 +1110,6 @@ def _real_extract(self, url):
             if current_programme and programme_id and current_programme.get('type') == 'playable_item':
                 title = current_programme.get('titles', {}).get('tertiary') or playlist_title
                 formats, subtitles = self._download_media_selector(programme_id)
-                self._sort_formats(formats)
                 synopses = current_programme.get('synopses') or {}
                 network = current_programme.get('network') or {}
                 duration = int_or_none(
@@ -1158,7 +1142,6 @@ def _real_extract(self, url):
             clip_title = clip.get('title')
             if clip_vpid and clip_title:
                 formats, subtitles = self._download_media_selector(clip_vpid)
-                self._sort_formats(formats)
                 return {
                     'id': clip_vpid,
                     'title': clip_title,
@@ -1180,7 +1163,6 @@ def _real_extract(self, url):
                     if not programme_id:
                         continue
                     formats, subtitles = self._download_media_selector(programme_id)
-                    self._sort_formats(formats)
                     entries.append({
                         'id': programme_id,
                         'title': playlist_title,
@@ -1212,7 +1194,6 @@ def parse_media(media):
                     if not (item_id and item_title):
                         continue
                     formats, subtitles = self._download_media_selector(item_id)
-                    self._sort_formats(formats)
                     item_desc = None
                     blocks = try_get(media, lambda x: x['summary']['blocks'], list)
                     if blocks:
@@ -1246,7 +1227,7 @@ def parse_media(media):
                                           (lambda x: x['data']['blocks'],
                                            lambda x: x['data']['content']['model']['blocks'],),
                                           list) or []):
-                        if block.get('type') != 'media':
+                        if block.get('type') not in ['media', 'video']:
                             continue
                         parse_media(block.get('model'))
             return self.playlist_result(
@@ -1313,7 +1294,6 @@ def extract_all(pattern):
             formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
             if not formats and not self.get_param('ignore_no_formats'):
                 continue
-            self._sort_formats(formats)
 
             video_id = media_meta.get('externalId')
             if not video_id: