-# coding: utf-8
-from __future__ import unicode_literals
-
import functools
import itertools
import json
import re
+import xml.etree.ElementTree
from .common import InfoExtractor
-from ..compat import (
- compat_etree_Element,
- compat_HTTPError,
- compat_str,
- compat_urllib_error,
- compat_urlparse,
-)
+from ..compat import compat_str, compat_urlparse
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
OnDemandPagedList,
float_or_none,
get_element_by_class,
int_or_none,
+ join_nonempty,
js_to_json,
parse_duration,
parse_iso8601,
parse_qs,
strip_or_none,
+ traverse_obj,
try_get,
unescapeHTML,
unified_timestamp,
iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
music/(?:clips|audiovideo/popular)[/#]|
radio/player/|
- sounds/play/|
events/[^/]+/play/[^/]+/
)
(?P<id>%s)(?!/(?:episodes|broadcasts|clips))
''' % _ID_REGEX
+ _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']
_LOGIN_URL = 'https://account.bbc.com/signin'
_NETRC_MACHINE = 'bbc'
# rtmp download
'skip_download': True,
},
- }, {
- 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
- 'note': 'Audio',
- 'info_dict': {
- 'id': 'm0007jz9',
- 'ext': 'mp4',
- 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
- 'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
- 'duration': 9840,
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- }
}, {
'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
'only_matching': True,
'only_matching': True,
}]
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading signin page')
post_url, None, 'Logging in', data=urlencode_postdata(login_form),
headers={'Referer': self._LOGIN_URL})
- if self._LOGIN_URL in urlh.geturl():
+ if self._LOGIN_URL in urlh.url:
error = clean_html(get_element_by_class('form-message', response))
if error:
raise ExtractorError(
'Unable to login: %s' % error, expected=True)
raise ExtractorError('Unable to log in')
- def _real_initialize(self):
- self._login()
-
# Exception type that carries a media-selection error identifier.
# The identifier is exposed as the ``id`` attribute; handlers elsewhere in
# this file compare ``e.id`` against known error codes.
class MediaSelectionError(Exception):
# ``id`` is stored on the instance unchanged.
def __init__(self, id):
self.id = id
continue
captions = self._download_xml(
cc_url, programme_id, 'Downloading captions', fatal=False)
- if not isinstance(captions, compat_etree_Element):
+ if not isinstance(captions, xml.etree.ElementTree.Element):
continue
subtitles['en'] = [
{
# Query the media selector for ``programme_id`` once per entry of
# ``self._MEDIA_SETS`` and return a ``(formats, subtitles)`` pair.
#
# NOTE(review): this span is in patch form with indentation stripped; lines
# prefixed '-' appear to be the superseded code and '+' lines their
# replacement. The comments below describe the '+' version.
#
# Formats from every media set are accumulated into one list and subtitles
# are merged into one dict via ``self._merge_subtitles``.
def _download_media_selector(self, programme_id):
last_exception = None
+ formats, subtitles = [], {}
for media_set in self._MEDIA_SETS:
try:
# Superseded line returned the first successful media set's result directly.
- return self._download_media_selector_url(
+ fmts, subs = self._download_media_selector_url(
self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
+ formats.extend(fmts)
+ if subs:
+ self._merge_subtitles(subs, target=subtitles)
except BBCCoUkIE.MediaSelectionError as e:
# These three error ids are remembered and the next media set is tried;
# any other id is passed straight to ``self._raise_extractor_error``.
if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
last_exception = e
continue
self._raise_extractor_error(e)
# Presumably runs after the loop (nesting not visible here): a remembered
# error is only reported as a warning when some formats or subtitles were
# still collected; otherwise it is passed to ``self._raise_extractor_error``.
- self._raise_extractor_error(last_exception)
+ if last_exception:
+ if formats or subtitles:
+ self.report_warning(f'{self.IE_NAME} returned error: {last_exception.id}')
+ else:
+ self._raise_extractor_error(last_exception)
+ return formats, subtitles
def _download_media_selector_url(self, url, programme_id=None):
media_selection = self._download_json(
href, programme_id, ext='mp4', entry_protocol='m3u8_native',
m3u8_id=format_id, fatal=False)
except ExtractorError as e:
- if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError)
- and e.exc_info[1].code in (403, 404)):
+ if not (isinstance(e.exc_info[1], HTTPError)
+ and e.exc_info[1].status in (403, 404)):
raise
fmts = []
formats.extend(fmts)
return programme_id, title, description, duration, formats, subtitles
except ExtractorError as ee:
- if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
+ if not (isinstance(ee.cause, HTTPError) and ee.cause.status == 404):
raise
# fallback to legacy playlist
else:
programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
- self._sort_formats(formats)
-
return {
'id': programme_id,
'title': title,
}
-class BBCIE(BBCCoUkIE):
+class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
IE_NAME = 'bbc'
IE_DESC = 'BBC'
- _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
+ _VALID_URL = r'''(?x)
+ https?://(?:www\.)?(?:
+ bbc\.(?:com|co\.uk)|
+ bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd\.onion|
+ bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad\.onion
+ )/(?:[^/]+/)+(?P<id>[^/#?]+)'''
_MEDIA_SETS = [
'pc',
'url': 'http://www.bbc.com/news/world-europe-32668511',
'info_dict': {
'id': 'world-europe-32668511',
- 'title': 'Russia stages massive WW2 parade',
+ 'title': 'Russia stages massive WW2 parade despite Western boycott',
'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
},
'playlist_count': 2,
'info_dict': {
'id': '3662a707-0af9-3149-963f-47bea720b460',
'title': 'BUGGER',
+ 'description': r're:BUGGER The recent revelations by the whistleblower Edward Snowden were fascinating. .{211}\.{3}$',
},
'playlist_count': 18,
}, {
'info_dict': {
'id': 'p02mprgb',
'ext': 'mp4',
- 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
- 'description': 'md5:2868290467291b37feda7863f7a83f54',
+ 'title': 'Germanwings crash site aerial video',
+ 'description': r're:(?s)Aerial video showed the site where the Germanwings flight 4U 9525, .{156} BFM TV\.$',
'duration': 47,
'timestamp': 1427219242,
'upload_date': '20150324',
+ 'thumbnail': 'https://ichef.bbci.co.uk/news/1024/media/images/81879000/jpg/_81879090_81879089.jpg',
},
'params': {
- # rtmp download
'skip_download': True,
}
}, {
},
'params': {
'skip_download': True,
- }
+ },
+ 'skip': 'now SIMORGH_DATA with no video',
}, {
# single video embedded with data-playable containing XML playlists (regional section)
'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
'info_dict': {
- 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
+ 'id': '39275083',
+ 'display_id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
'ext': 'mp4',
'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
- 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
+ 'description': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
'timestamp': 1434713142,
'upload_date': '20150619',
+ 'thumbnail': 'https://a.files.bbci.co.uk/worldservice/live/assets/images/2015/06/19/150619132146_honduras_hsopitales_militares_640x360_aptn_nocredit.jpg',
},
'params': {
'skip_download': True,
- }
+ },
}, {
# single video from video playlist embedded with vxp-playlist-data JSON
'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
},
'params': {
'skip_download': True,
- }
+ },
+ 'skip': '404 Not Found',
}, {
- # single video story with digitalData
+ # single video story with __PWA_PRELOADED_STATE__
'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
'info_dict': {
'id': 'p02q6gc4',
- 'ext': 'flv',
- 'title': 'Sri Lanka’s spicy secret',
- 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
- 'timestamp': 1437674293,
- 'upload_date': '20150723',
+ 'ext': 'mp4',
+ 'title': 'Tasting the spice of life in Jaffna',
+ 'description': r're:(?s)BBC Travel Show’s Henry Golding explores the city of Jaffna .{151} aftertaste\.$',
+ 'timestamp': 1646058397,
+ 'upload_date': '20220228',
+ 'duration': 255,
+ 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1920xn/p02vxvkn.jpg',
},
- 'params': {
- # rtmp download
- 'skip_download': True,
- }
}, {
# single video story without digitalData
'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
'timestamp': 1415867444,
'upload_date': '20141113',
},
- 'params': {
- # rtmp download
- 'skip_download': True,
- }
+ 'skip': 'redirects to TopGear home page',
}, {
# single video embedded with Morph
+ # TODO: replacement test page
'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
'info_dict': {
'id': 'p041vhd0',
'uploader': 'BBC Sport',
'uploader_id': 'bbc_sport',
},
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- 'skip': 'Georestricted to UK',
+ 'skip': 'Video no longer in page',
}, {
- # single video with playlist.sxml URL in playlist param
+ # single video in __INITIAL_DATA__
'url': 'http://www.bbc.com/sport/0/football/33653409',
'info_dict': {
'id': 'p02xycnp',
'ext': 'mp4',
- 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
- 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
+ 'title': 'Ronaldo to Man Utd, Arsenal to spend?',
+ 'description': r're:(?s)BBC Sport\'s David Ornstein rounds up the latest transfer reports, .{359} here\.$',
+ 'timestamp': 1437750175,
+ 'upload_date': '20150724',
+ 'thumbnail': r're:https?://.+/.+media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png',
'duration': 140,
},
- 'params': {
- # rtmp download
- 'skip_download': True,
- }
}, {
- # article with multiple videos embedded with playlist.sxml in playlist param
+ # article with multiple videos embedded with Morph.setPayload
'url': 'http://www.bbc.com/sport/0/football/34475836',
'info_dict': {
'id': '34475836',
'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
},
'playlist_count': 3,
+ }, {
+ # Testing noplaylist
+ 'url': 'http://www.bbc.com/sport/0/football/34475836',
+ 'info_dict': {
+ 'id': 'p034ppnv',
+ 'ext': 'mp4',
+ 'title': 'All you need to know about Jurgen Klopp',
+ 'timestamp': 1444335081,
+ 'upload_date': '20151008',
+ 'duration': 122.0,
+ 'thumbnail': 'https://ichef.bbci.co.uk/onesport/cps/976/cpsprodpb/7542/production/_85981003_klopp.jpg',
+ },
+ 'params': {
+ 'noplaylist': True,
+ },
}, {
# school report article with single video
'url': 'http://www.bbc.co.uk/schoolreport/35744779',
'title': 'School which breaks down barriers in Jerusalem',
},
'playlist_count': 1,
+ 'skip': 'redirects to Young Reporter home page https://www.bbc.co.uk/news/topics/cg41ylwv43pt',
}, {
# single video with playlist URL from weather section
'url': 'http://www.bbc.com/weather/features/33601775',
'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1437785037,
'upload_date': '20150725',
+ 'duration': 105,
},
}, {
# video with window.__INITIAL_DATA__ and value as JSON string
'url': 'https://www.bbc.com/news/av/world-europe-59468682',
'info_dict': {
- 'id': 'p0b71qth',
+ 'id': 'p0b779gc',
'ext': 'mp4',
'title': 'Why France is making this woman a national hero',
- 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
+ 'description': r're:(?s)France is honouring the US-born 20th Century singer and activist Josephine .{208} Second World War.',
'thumbnail': r're:https?://.+/.+\.jpg',
- 'timestamp': 1638230731,
- 'upload_date': '20211130',
+ 'timestamp': 1638215626,
+ 'upload_date': '20211129',
+ 'duration': 125,
+ },
+ }, {
+ # video with script id __NEXT_DATA__ and value as JSON string
+ 'url': 'https://www.bbc.com/news/uk-68546268',
+ 'info_dict': {
+ 'id': 'p0hj0lq7',
+ 'ext': 'mp4',
+ 'title': 'Nasser Hospital doctor describes his treatment by IDF',
+ 'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276} hostages\."$',
+ 'thumbnail': r're:https?://.+/.+\.jpg',
+ 'timestamp': 1710188248,
+ 'upload_date': '20240311',
+ 'duration': 104,
},
}, {
# single video article embedded with data-media-vpid
'uploader': 'Radio 3',
'uploader_id': 'bbc_radio_three',
},
+ 'skip': '404 Not Found',
}, {
'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
'info_dict': {
'ext': 'mp4',
'title': 'md5:2fabf12a726603193a2879a055f72514',
'description': 'Learn English words and phrases from this story',
+ 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1200x675/p06pq9gk.jpg',
},
'add_ie': [BBCCoUkIE.ie_key()],
}, {
'info_dict': {
'id': 'p07c6sb9',
'ext': 'mp4',
- 'title': 'How positive thinking is harming your happiness',
- 'alt_title': 'The downsides of positive thinking',
- 'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
+ 'title': 'The downsides of positive thinking',
+ 'description': 'The downsides of positive thinking',
'duration': 235,
- 'thumbnail': r're:https?://.+/p07c9dsr.jpg',
- 'upload_date': '20190604',
- 'categories': ['Psychology'],
+ 'thumbnail': r're:https?://.+/p07c9dsr\.(?:jpg|webp|png)',
+ 'upload_date': '20220223',
+ 'timestamp': 1645632746,
},
+ }, {
+ # BBC Sounds
+ 'url': 'https://www.bbc.co.uk/sounds/play/w3ct5rgx',
+ 'info_dict': {
+ 'id': 'p0hrw4nr',
+ 'ext': 'mp4',
+ 'title': 'Are our coastlines being washed away?',
+ 'description': r're:(?s)Around the world, coastlines are constantly changing .{2000,} Images\)$',
+ 'timestamp': 1713556800,
+ 'upload_date': '20240419',
+ 'duration': 1588,
+ 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0hrnxbl.jpg',
+ 'uploader': 'World Service',
+ 'uploader_id': 'bbc_world_service',
+ 'series': 'CrowdScience',
+ 'chapters': [],
+ }
+ }, { # onion routes
+ 'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad.onion/sport/av/football/63195681',
+ 'only_matching': True,
}]
@classmethod
def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
programme_id, title, description, duration, formats, subtitles = \
self._process_legacy_playlist_url(url, playlist_id)
- self._sort_formats(formats)
return {
'id': programme_id,
'title': title,
json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
timestamp = json_ld_info.get('timestamp')
- playlist_title = json_ld_info.get('title')
- if not playlist_title:
- playlist_title = self._og_search_title(
- webpage, default=None) or self._html_search_regex(
- r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
- if playlist_title:
- playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
+ playlist_title = json_ld_info.get('title') or re.sub(
+ r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None
playlist_description = json_ld_info.get(
'description') or self._og_search_description(webpage, default=None)
duration = int_or_none(items[0].get('duration'))
programme_id = items[0].get('vpid')
formats, subtitles = self._download_media_selector(programme_id)
- self._sort_formats(formats)
entries.append({
'id': programme_id,
'title': title,
# Some playlist URL may fail with 500, at the same time
# the other one may work fine (e.g.
# http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 500:
continue
raise
if entry:
- self._sort_formats(entry['formats'])
entries.append(entry)
if entries:
webpage, 'group id', default=None)
if group_id:
return self.url_result(
- 'https://www.bbc.co.uk/programmes/%s' % group_id,
- ie=BBCCoUkIE.ie_key())
+ f'https://www.bbc.co.uk/programmes/{group_id}', BBCCoUkIE)
# single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
programme_id = self._search_regex(
if programme_id:
formats, subtitles = self._download_media_selector(programme_id)
- self._sort_formats(formats)
# digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
digital_data = self._parse_json(
self._search_regex(
if version_id:
title = smp_data['title']
formats, subtitles = self._download_media_selector(version_id)
- self._sort_formats(formats)
image_url = smp_data.get('holdingImageURL')
display_date = init_data.get('displayDate')
topic_title = init_data.get('topicTitle')
}
# Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
- # There are several setPayload calls may be present but the video
- # seems to be always related to the first one
- morph_payload = self._parse_json(
- self._search_regex(
- r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
- webpage, 'morph payload', default='{}'),
- playlist_id, fatal=False)
+ # Several setPayload calls may be present but the video(s)
+ # should be in one that mentions leadMedia or videoData
+ morph_payload = self._search_json(
+ r'\bMorph\s*\.\s*setPayload\s*\([^,]+,', webpage, 'morph payload', playlist_id,
+ contains_pattern=r'{(?s:(?:(?!</script>).)+(?:"leadMedia"|\\"videoData\\")\s*:.+)}',
+ default={})
if morph_payload:
- components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
- for component in components:
- if not isinstance(component, dict):
- continue
- lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
- if not lead_media:
- continue
- identifiers = lead_media.get('identifiers')
- if not identifiers or not isinstance(identifiers, dict):
- continue
- programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
+ for lead_media in traverse_obj(morph_payload, (
+ 'body', 'components', ..., 'props', 'leadMedia', {dict})):
+ programme_id = traverse_obj(lead_media, ('identifiers', ('vpid', 'playablePid'), {str}, any))
if not programme_id:
continue
- title = lead_media.get('title') or self._og_search_title(webpage)
formats, subtitles = self._download_media_selector(programme_id)
- self._sort_formats(formats)
- description = lead_media.get('summary')
- uploader = lead_media.get('masterBrand')
- uploader_id = lead_media.get('mid')
- duration = None
- duration_d = lead_media.get('duration')
- if isinstance(duration_d, dict):
- duration = parse_duration(dict_get(
- duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
return {
'id': programme_id,
- 'title': title,
- 'description': description,
- 'duration': duration,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
+ 'title': lead_media.get('title') or self._og_search_title(webpage),
+ **traverse_obj(lead_media, {
+ 'description': ('summary', {str}),
+ 'duration': ('duration', ('rawDuration', 'formattedDuration', 'spokenDuration'), {parse_duration}),
+ 'uploader': ('masterBrand', {str}),
+ 'uploader_id': ('mid', {str}),
+ }),
'formats': formats,
'subtitles': subtitles,
}
+ body = self._parse_json(traverse_obj(morph_payload, (
+ 'body', 'content', 'article', 'body')), playlist_id, fatal=False)
+ for video_data in traverse_obj(body, (lambda _, v: v['videoData']['pid'], 'videoData')):
+ if video_data.get('vpid'):
+ video_id = video_data['vpid']
+ formats, subtitles = self._download_media_selector(video_id)
+ entry = {
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+ else:
+ video_id = video_data['pid']
+ entry = self.url_result(
+ f'https://www.bbc.co.uk/programmes/{video_id}', BBCCoUkIE,
+ video_id, url_transparent=True)
+ entry.update({
+ 'timestamp': traverse_obj(morph_payload, (
+ 'body', 'content', 'article', 'dateTimeInfo', 'dateTime', {parse_iso8601})
+ ),
+ **traverse_obj(video_data, {
+ 'thumbnail': (('iChefImage', 'image'), {url_or_none}, any),
+ 'title': (('title', 'caption'), {str}, any),
+ 'duration': ('duration', {parse_duration}),
+ }),
+ })
+ if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id):
+ return entry
+ entries.append(entry)
+ if entries:
+ playlist_title = traverse_obj(morph_payload, (
+ 'body', 'content', 'article', 'headline', {str})) or playlist_title
+ return self.playlist_result(
+ entries, playlist_id, playlist_title, playlist_description)
- preload_state = self._parse_json(self._search_regex(
- r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
- 'preload state', default='{}'), playlist_id, fatal=False)
- if preload_state:
- current_programme = preload_state.get('programmes', {}).get('current') or {}
- programme_id = current_programme.get('id')
- if current_programme and programme_id and current_programme.get('type') == 'playable_item':
- title = current_programme.get('titles', {}).get('tertiary') or playlist_title
- formats, subtitles = self._download_media_selector(programme_id)
- self._sort_formats(formats)
- synopses = current_programme.get('synopses') or {}
- network = current_programme.get('network') or {}
- duration = int_or_none(
- current_programme.get('duration', {}).get('value'))
- thumbnail = None
- image_url = current_programme.get('image_url')
- if image_url:
- thumbnail = image_url.replace('{recipe}', 'raw')
+ # various PRELOADED_STATE JSON
+ preload_state = self._search_json(
+ r'window\.__(?:PWA_)?PRELOADED_STATE__\s*=', webpage,
+ 'preload state', playlist_id, transform_source=js_to_json, default={})
+ # PRELOADED_STATE with current programme
+ current_programme = traverse_obj(preload_state, ('programmes', 'current', {dict}))
+ programme_id = traverse_obj(current_programme, ('id', {str}))
+ if programme_id and current_programme.get('type') == 'playable_item':
+ title = traverse_obj(current_programme, ('titles', ('tertiary', 'secondary'), {str}, any)) or playlist_title
+ formats, subtitles = self._download_media_selector(programme_id)
+ return {
+ 'id': programme_id,
+ 'title': title,
+ 'formats': formats,
+ **traverse_obj(current_programme, {
+ 'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
+ 'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}),
+ 'duration': ('duration', 'value', {int_or_none}),
+ 'uploader': ('network', 'short_title', {str}),
+ 'uploader_id': ('network', 'id', {str}),
+ 'timestamp': ((('availability', 'from'), ('release', 'date')), {parse_iso8601}, any),
+ 'series': ('titles', 'primary', {str}),
+ }),
+ 'subtitles': subtitles,
+ 'chapters': traverse_obj(preload_state, (
+ 'tracklist', 'tracks', lambda _, v: float(v['offset']['start']), {
+ 'title': ('titles', {lambda x: join_nonempty(
+ 'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
+ 'start_time': ('offset', 'start', {float_or_none}),
+ 'end_time': ('offset', 'end', {float_or_none}),
+ })
+ ),
+ }
+
+ # PWA_PRELOADED_STATE with article video asset
+ asset_id = traverse_obj(preload_state, (
+ 'entities', 'articles', lambda k, _: k.rsplit('/', 1)[-1] == playlist_id,
+ 'assetVideo', 0, {str}, any))
+ if asset_id:
+ video_id = traverse_obj(preload_state, ('entities', 'videos', asset_id, 'vpid', {str}))
+ if video_id:
+ article = traverse_obj(preload_state, (
+ 'entities', 'articles', lambda _, v: v['assetVideo'][0] == asset_id, any))
+
# Local helper: look up ``entities -> images -> image_id -> url`` in
# ``preload_state``, replace the literal '$recipe' in that value with 'raw',
# and pass the result through ``url_or_none``.
+ def image_url(image_id):
+ return traverse_obj(preload_state, (
+ 'entities', 'images', image_id, 'url',
+ {lambda u: url_or_none(u.replace('$recipe', 'raw'))}))
+
+ formats, subtitles = self._download_media_selector(video_id)
return {
- 'id': programme_id,
- 'title': title,
- 'description': dict_get(synopses, ('long', 'medium', 'short')),
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'uploader': network.get('short_title'),
- 'uploader_id': network.get('id'),
+ 'id': video_id,
+ **traverse_obj(preload_state, ('entities', 'videos', asset_id, {
+ 'title': ('title', {str}),
+ 'description': (('synopsisLong', 'synopsisMedium', 'synopsisShort'), {str}, any),
+ 'thumbnail': (0, {image_url}),
+ 'duration': ('duration', {int_or_none}),
+ })),
'formats': formats,
'subtitles': subtitles,
+ 'timestamp': traverse_obj(article, ('displayDate', {parse_iso8601})),
}
+ else:
+ return self.url_result(
+ f'https://www.bbc.co.uk/programmes/{asset_id}', BBCCoUkIE,
+ asset_id, playlist_title, display_id=playlist_id,
+ description=playlist_description)
bbc3_config = self._parse_json(
self._search_regex(
clip_title = clip.get('title')
if clip_vpid and clip_title:
formats, subtitles = self._download_media_selector(clip_vpid)
- self._sort_formats(formats)
return {
'id': clip_vpid,
'title': clip_title,
if not programme_id:
continue
formats, subtitles = self._download_media_selector(programme_id)
- self._sort_formats(formats)
entries.append({
'id': programme_id,
'title': playlist_title,
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
# Local helper that builds one video info dict from a ``model`` mapping.
# The video id is read from ``versions[0].versionId``; when that lookup
# yields nothing the helper returns ``None`` without contacting the media
# selector.
+ def parse_model(model):
+ """Extract single video from model structure"""
+ item_id = traverse_obj(model, ('versions', 0, 'versionId', {str}))
+ if not item_id:
+ return
+ formats, subtitles = self._download_media_selector(item_id)
+ return {
+ 'id': item_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
# Remaining fields are read from ``model`` through ``traverse_obj``.
+ **traverse_obj(model, {
+ 'title': ('title', {str}),
# '$recipe' in ``imageUrl`` is replaced by 'raw', then the value is joined
# onto ``url`` from the enclosing scope.
+ 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
# Synopsis keys are listed as long, medium, short; ``x or None`` turns an
# empty string into ``None``.
+ 'description': ('synopses', ('long', 'medium', 'short'), {str}, {lambda x: x or None}, any),
+ 'duration': ('versions', 0, 'duration', {int}),
# ``scale=1000`` — presumably ``availableFrom`` is in milliseconds; TODO confirm.
+ 'timestamp': ('versions', 0, 'availableFrom', {functools.partial(int_or_none, scale=1000)}),
+ })
+ }
+
# Local helper: build a two-argument predicate that ignores its first
# argument and is true when ``v['type']`` is one of ``types``.
# Used in this file as a filter step inside ``traverse_obj`` paths.
+ def is_type(*types):
+ return lambda _, v: v['type'] in types
+
initial_data = self._search_regex(
r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
'quoted preload state', default=None)
if initial_data is None:
initial_data = self._search_regex(
r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
- 'preload state', default={})
+ 'preload state', default='{}')
else:
initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
if initial_data:
+ for video_data in traverse_obj(initial_data, (
+ 'stores', 'article', 'articleBodyContent', is_type('video'))):
+ model = traverse_obj(video_data, (
+ 'model', 'blocks', is_type('aresMedia'),
+ 'model', 'blocks', is_type('aresMediaMetadata'),
+ 'model', {dict}, any))
+ entry = parse_model(model)
+ if entry:
+ entries.append(entry)
+ if entries:
+ return self.playlist_result(
+ entries, playlist_id, playlist_title, playlist_description)
+
def parse_media(media):
if not media:
return
if not (item_id and item_title):
continue
formats, subtitles = self._download_media_selector(item_id)
- self._sort_formats(formats)
item_desc = None
blocks = try_get(media, lambda x: x['summary']['blocks'], list)
if blocks:
'subtitles': subtitles,
'timestamp': item_time,
'description': strip_or_none(item_desc),
+ 'duration': int_or_none(item.get('duration')),
})
- for resp in (initial_data.get('data') or {}).values():
- name = resp.get('name')
+
+ for resp in traverse_obj(initial_data, ('data', lambda _, v: v['name'])):
+ name = resp['name']
if name == 'media-experience':
parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
elif name == 'article':
- for block in (try_get(resp,
- (lambda x: x['data']['blocks'],
- lambda x: x['data']['content']['model']['blocks'],),
- list) or []):
- if block.get('type') != 'media':
- continue
- parse_media(block.get('model'))
+ for block in traverse_obj(resp, (
+ 'data', (None, ('content', 'model')), 'blocks',
+ is_type('media', 'video'), 'model', {dict})):
+ parse_media(block)
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
+ # extract from SIMORGH_DATA hydration JSON
+ simorgh_data = self._search_json(
+ r'window\s*\.\s*SIMORGH_DATA\s*=', webpage,
+ 'simorgh data', playlist_id, default={})
+ if simorgh_data:
+ done = False
+ for video_data in traverse_obj(simorgh_data, (
+ 'pageData', 'content', 'model', 'blocks', is_type('video', 'legacyMedia'))):
+ model = traverse_obj(video_data, (
+ 'model', 'blocks', is_type('aresMedia'),
+ 'model', 'blocks', is_type('aresMediaMetadata'),
+ 'model', {dict}, any))
+ if video_data['type'] == 'video':
+ entry = parse_model(model)
+ else: # legacyMedia: no duration, subtitles
+ block_id, entry = traverse_obj(model, ('blockId', {str})), None
+ media_data = traverse_obj(simorgh_data, (
+ 'pageData', 'promo', 'media',
+ {lambda x: x if x['id'] == block_id else None}))
+ formats = traverse_obj(media_data, ('playlist', lambda _, v: url_or_none(v['url']), {
+ 'url': ('url', {url_or_none}),
+ 'ext': ('format', {str}),
+ 'tbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}),
+ }))
+ if formats:
+ entry = {
+ 'id': block_id,
+ 'display_id': playlist_id,
+ 'formats': formats,
+ 'description': traverse_obj(simorgh_data, ('pageData', 'promo', 'summary', {str})),
+ **traverse_obj(model, {
+ 'title': ('title', {str}),
+ 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
+ 'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
+ 'timestamp': ('firstPublished', {functools.partial(int_or_none, scale=1000)}),
+ }),
+ }
+ done = True
+ if entry:
+ entries.append(entry)
+ if done:
+ break
+ if entries:
+ return self.playlist_result(
+ entries, playlist_id, playlist_title, playlist_description)
+
# Local helper: find every match of ``pattern`` in ``webpage``, parse each
# match with ``self._parse_json(..., fatal=False)``, and return a list of the
# results with falsy entries filtered out.
def extract_all(pattern):
return list(filter(None, map(
lambda s: self._parse_json(s, playlist_id, fatal=False),
re.findall(pattern, webpage))))
+ # US accessed article with single embedded video (e.g.
+ # https://www.bbc.com/news/uk-68546268)
+ next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}),
+ ('props', 'pageProps', 'page'))
+ model = traverse_obj(next_data, (
+ ..., 'contents', is_type('video'),
+ 'model', 'blocks', is_type('media'),
+ 'model', 'blocks', is_type('mediaMetadata'),
+ 'model', {dict}, any))
+ if model and (entry := parse_model(model)):
+ if not entry.get('timestamp'):
+ entry['timestamp'] = traverse_obj(next_data, (
+ ..., 'contents', is_type('timestamp'), 'model',
+ 'timestamp', {functools.partial(int_or_none, scale=1000)}, any))
+ entries.append(entry)
+ return self.playlist_result(
+ entries, playlist_id, playlist_title, playlist_description)
+
# Multiple video article (e.g.
# http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
if not formats and not self.get_param('ignore_no_formats'):
continue
- self._sort_formats(formats)
video_id = media_meta.get('externalId')
if not video_id: