jfr.im git - yt-dlp.git/blobdiff - yt_dlp/extractor/bbc.py
[ie/JioSaavn] Add extractors (#8307)
[yt-dlp.git] / yt_dlp / extractor / bbc.py
index 35a7a165caa81e99425a1b38fd82630e5e847cdd..d1d6e04faaf71c49ab61e816bce55c0ebda0e06c 100644 (file)
@@ -2,11 +2,11 @@
 import itertools
 import json
 import re
-import urllib.error
 import xml.etree.ElementTree
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError, compat_str, compat_urlparse
+from ..compat import compat_str, compat_urlparse
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     OnDemandPagedList,
     float_or_none,
     get_element_by_class,
     int_or_none,
+    join_nonempty,
     js_to_json,
     parse_duration,
     parse_iso8601,
     parse_qs,
     strip_or_none,
+    traverse_obj,
     try_get,
     unescapeHTML,
     unified_timestamp,
@@ -41,7 +43,6 @@ class BBCCoUkIE(InfoExtractor):
                             iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
                             music/(?:clips|audiovideo/popular)[/#]|
                             radio/player/|
-                            sounds/play/|
                             events/[^/]+/play/[^/]+/
                         )
                         (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
@@ -218,20 +219,6 @@ class BBCCoUkIE(InfoExtractor):
                 # rtmp download
                 'skip_download': True,
             },
-        }, {
-            'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
-            'note': 'Audio',
-            'info_dict': {
-                'id': 'm0007jz9',
-                'ext': 'mp4',
-                'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
-                'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
-                'duration': 9840,
-            },
-            'params': {
-                # rtmp download
-                'skip_download': True,
-            }
         }, {
             'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
             'only_matching': True,
@@ -277,7 +264,7 @@ def _perform_login(self, username, password):
             post_url, None, 'Logging in', data=urlencode_postdata(login_form),
             headers={'Referer': self._LOGIN_URL})
 
-        if self._LOGIN_URL in urlh.geturl():
+        if self._LOGIN_URL in urlh.url:
             error = clean_html(get_element_by_class('form-message', response))
             if error:
                 raise ExtractorError(
@@ -388,8 +375,8 @@ def _process_media_selector(self, media_selection, programme_id):
                                 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                                 m3u8_id=format_id, fatal=False)
                         except ExtractorError as e:
-                            if not (isinstance(e.exc_info[1], urllib.error.HTTPError)
-                                    and e.exc_info[1].code in (403, 404)):
+                            if not (isinstance(e.exc_info[1], HTTPError)
+                                    and e.exc_info[1].status in (403, 404)):
                                 raise
                             fmts = []
                         formats.extend(fmts)
@@ -472,7 +459,7 @@ def _download_playlist(self, playlist_id):
 
             return programme_id, title, description, duration, formats, subtitles
         except ExtractorError as ee:
-            if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
+            if not (isinstance(ee.cause, HTTPError) and ee.cause.status == 404):
                 raise
 
         # fallback to legacy playlist
@@ -575,8 +562,6 @@ def _real_extract(self, url):
         else:
             programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
 
-        self._sort_formats(formats)
-
         return {
             'id': programme_id,
             'title': title,
@@ -846,6 +831,20 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'upload_date': '20190604',
             'categories': ['Psychology'],
         },
+    }, {
+        # BBC Sounds
+        'url': 'https://www.bbc.co.uk/sounds/play/m001q78b',
+        'info_dict': {
+            'id': 'm001q789',
+            'ext': 'mp4',
+            'title': 'The Night Tracks Mix - Music for the darkling hour',
+            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0c00hym.jpg',
+            'chapters': 'count:8',
+            'description': 'md5:815fb51cbdaa270040aab8145b3f1d67',
+            'uploader': 'Radio 3',
+            'duration': 1800,
+            'uploader_id': 'bbc_radio_three',
+        },
     }, {  # onion routes
         'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
         'only_matching': True,
@@ -890,7 +889,6 @@ def _extract_from_media_meta(self, media_meta, video_id):
     def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
         programme_id, title, description, duration, formats, subtitles = \
             self._process_legacy_playlist_url(url, playlist_id)
-        self._sort_formats(formats)
         return {
             'id': programme_id,
             'title': title,
@@ -954,7 +952,6 @@ def _real_extract(self, url):
                             duration = int_or_none(items[0].get('duration'))
                             programme_id = items[0].get('vpid')
                             formats, subtitles = self._download_media_selector(programme_id)
-                            self._sort_formats(formats)
                             entries.append({
                                 'id': programme_id,
                                 'title': title,
@@ -987,11 +984,10 @@ def _real_extract(self, url):
                                     # Some playlist URL may fail with 500, at the same time
                                     # the other one may work fine (e.g.
                                     # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
-                                    if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
+                                    if isinstance(e.cause, HTTPError) and e.cause.status == 500:
                                         continue
                                     raise
                             if entry:
-                                self._sort_formats(entry['formats'])
                                 entries.append(entry)
 
         if entries:
@@ -1015,7 +1011,6 @@ def _real_extract(self, url):
 
         if programme_id:
             formats, subtitles = self._download_media_selector(programme_id)
-            self._sort_formats(formats)
             # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
             digital_data = self._parse_json(
                 self._search_regex(
@@ -1047,7 +1042,6 @@ def _real_extract(self, url):
             if version_id:
                 title = smp_data['title']
                 formats, subtitles = self._download_media_selector(version_id)
-                self._sort_formats(formats)
                 image_url = smp_data.get('holdingImageURL')
                 display_date = init_data.get('displayDate')
                 topic_title = init_data.get('topicTitle')
@@ -1089,7 +1083,6 @@ def _real_extract(self, url):
                     continue
                 title = lead_media.get('title') or self._og_search_title(webpage)
                 formats, subtitles = self._download_media_selector(programme_id)
-                self._sort_formats(formats)
                 description = lead_media.get('summary')
                 uploader = lead_media.get('masterBrand')
                 uploader_id = lead_media.get('mid')
@@ -1118,7 +1111,6 @@ def _real_extract(self, url):
             if current_programme and programme_id and current_programme.get('type') == 'playable_item':
                 title = current_programme.get('titles', {}).get('tertiary') or playlist_title
                 formats, subtitles = self._download_media_selector(programme_id)
-                self._sort_formats(formats)
                 synopses = current_programme.get('synopses') or {}
                 network = current_programme.get('network') or {}
                 duration = int_or_none(
@@ -1137,6 +1129,13 @@ def _real_extract(self, url):
                     'uploader_id': network.get('id'),
                     'formats': formats,
                     'subtitles': subtitles,
+                    'chapters': traverse_obj(preload_state, (
+                        'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), {
+                            'title': ('titles', {lambda x: join_nonempty(
+                                'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
+                            'start_time': ('offset', 'start', {float_or_none}),
+                            'end_time': ('offset', 'end', {float_or_none}),
+                        })) or None,
                 }
 
         bbc3_config = self._parse_json(
@@ -1151,7 +1150,6 @@ def _real_extract(self, url):
             clip_title = clip.get('title')
             if clip_vpid and clip_title:
                 formats, subtitles = self._download_media_selector(clip_vpid)
-                self._sort_formats(formats)
                 return {
                     'id': clip_vpid,
                     'title': clip_title,
@@ -1173,7 +1171,6 @@ def _real_extract(self, url):
                     if not programme_id:
                         continue
                     formats, subtitles = self._download_media_selector(programme_id)
-                    self._sort_formats(formats)
                     entries.append({
                         'id': programme_id,
                         'title': playlist_title,
@@ -1205,7 +1202,6 @@ def parse_media(media):
                     if not (item_id and item_title):
                         continue
                     formats, subtitles = self._download_media_selector(item_id)
-                    self._sort_formats(formats)
                     item_desc = None
                     blocks = try_get(media, lambda x: x['summary']['blocks'], list)
                     if blocks:
@@ -1306,7 +1302,6 @@ def extract_all(pattern):
             formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
             if not formats and not self.get_param('ignore_no_formats'):
                 continue
-            self._sort_formats(formats)
 
             video_id = media_meta.get('externalId')
             if not video_id: