yt_dlp/extractor/bbc.py

   1 import functools
   2 import itertools
   3 import json
   4 import re
   5 import urllib.error
   6 import xml.etree.ElementTree
   7
   8 from .common import InfoExtractor
   9 from ..compat import compat_HTTPError, compat_str, compat_urlparse
  10 from ..utils import (
  11     ExtractorError,
  12     OnDemandPagedList,
  13     clean_html,
  14     dict_get,
  15     float_or_none,
  16     get_element_by_class,
  17     int_or_none,
  18     js_to_json,
  19     parse_duration,
  20     parse_iso8601,
  21     parse_qs,
  22     strip_or_none,
  23     try_get,
  24     unescapeHTML,
  25     unified_timestamp,
  26     url_or_none,
  27     urlencode_postdata,
  28     urljoin,
  29 )
  30
  31
class BBCCoUkIE(InfoExtractor):
    # Extractor for bbc.co.uk programme, iPlayer, music, radio player, Sounds
    # and events pages (see _VALID_URL for the exact paths that are accepted).
    IE_NAME = 'bbc.co.uk'
    IE_DESC = 'BBC iPlayer'
    # An id is one of p, b, m or l followed by 7 digits/lowercase letters,
    # or w followed by 7 to 14 of them.
    _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?bbc\.co\.uk/
                        (?:
                            programmes/(?!articles/)|
                            iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
                            music/(?:clips|audiovideo/popular)[/#]|
                            radio/player/|
                            sounds/play/|
                            events/[^/]+/play/[^/]+/
                        )
                        (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
                    ''' % _ID_REGEX
    # Captures the iPlayer URL given to a setPlaylist("...") call.
    _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']

    # Sign-in page; downloaded and sent as the Referer by _perform_login().
    _LOGIN_URL = 'https://account.bbc.com/signin'
    _NETRC_MACHINE = 'bbc'

    # Media selector endpoint; formatted with (media set, programme id)
    # by _download_media_selector().
    _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
    # Media sets are tried in this order by _download_media_selector().
    _MEDIA_SETS = [
        # Provides HQ HLS streams with even better quality than the pc mediaset but fails
        # with geolocation in some cases even when it's not geo restricted at all (e.g.
        # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
        'iptv-all',
        'pc',
    ]

    # XML namespace used when querying elements of the legacy playlist.
    _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
  64
    # Test cases for this extractor. Entries with 'only_matching' carry just
    # a URL; the other entries also list the expected info_dict fields.
    _TESTS = [
        {
            'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
            'info_dict': {
                'id': 'b039d07m',
                'ext': 'flv',
                'title': 'Kaleidoscope, Leonard Cohen',
                'description': 'The Canadian poet and songwriter reflects on his musical career.',
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
            'info_dict': {
                'id': 'b00yng1d',
                'ext': 'flv',
                'title': 'The Man in Black: Series 3: The Printed Name',
                'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
                'duration': 1800,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Episode is no longer available on BBC iPlayer Radio',
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
            'info_dict': {
                # NOTE(review): same id as the previous test case although the
                # URL differs - looks copy-pasted; confirm the expected id.
                'id': 'b00yng1d',
                'ext': 'flv',
                'title': 'The Voice UK: Series 3: Blind Auditions 5',
                'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
                'duration': 5100,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
            'info_dict': {
                'id': 'b03k3pb7',
                'ext': 'flv',
                'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
                'description': '2. Invasion',
                'duration': 3600,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
        }, {
            'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
            'info_dict': {
                'id': 'b04v209v',
                'ext': 'flv',
                'title': 'Pete Tong, The Essential New Tune Special',
                'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
                'duration': 10800,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Episode is no longer available on BBC iPlayer Radio',
        }, {
            'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
            'note': 'Audio',
            'info_dict': {
                'id': 'p022h44j',
                'ext': 'flv',
                'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
                'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
                'duration': 227,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        }, {
            'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
            'note': 'Video',
            'info_dict': {
                'id': 'p025c103',
                'ext': 'flv',
                'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
                'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
                'duration': 226,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
            'info_dict': {
                'id': 'p02n76xf',
                'ext': 'flv',
                'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
                'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
                'duration': 3540,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'geolocation',
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
            'info_dict': {
                'id': 'b05zmgw1',
                'ext': 'flv',
                'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
                'title': 'Royal Academy Summer Exhibition',
                'duration': 3540,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'geolocation',
        }, {
            # iptv-all mediaset fails with geolocation however there is no geo restriction
            # for this programme at all
            'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
            'info_dict': {
                'id': 'b06rkms3',
                'ext': 'flv',
                'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
                'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Now it\'s really geo-restricted',
        }, {
            # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
            'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
            'info_dict': {
                'id': 'p028bfkj',
                'ext': 'flv',
                'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
                'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
        }, {
            'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
            'note': 'Audio',
            'info_dict': {
                'id': 'm0007jz9',
                'ext': 'mp4',
                'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
                'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
                'duration': 9840,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
            'only_matching': True,
        }, {
            'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
            'only_matching': True,
        }, {
            'url': 'https://www.bbc.co.uk/programmes/m00005xn',
            'only_matching': True,
        }, {
            'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
            'only_matching': True,
        }]
 260
 261     def _perform_login(self, username, password):
 262         login_page = self._download_webpage(
 263             self._LOGIN_URL, None, 'Downloading signin page')
 264
 265         login_form = self._hidden_inputs(login_page)
 266
 267         login_form.update({
 268             'username': username,
 269             'password': password,
 270         })
 271
 272         post_url = urljoin(self._LOGIN_URL, self._search_regex(
 273             r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
 274             'post url', default=self._LOGIN_URL, group='url'))
 275
 276         response, urlh = self._download_webpage_handle(
 277             post_url, None, 'Logging in', data=urlencode_postdata(login_form),
 278             headers={'Referer': self._LOGIN_URL})
 279
 280         if self._LOGIN_URL in urlh.geturl():
 281             error = clean_html(get_element_by_class('form-message', response))
 282             if error:
 283                 raise ExtractorError(
 284                     'Unable to login: %s' % error, expected=True)
 285             raise ExtractorError('Unable to log in')
 286
 287     class MediaSelectionError(Exception):
 288         def __init__(self, id):
 289             self.id = id
 290
 291     def _extract_asx_playlist(self, connection, programme_id):
 292         asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
 293         return [ref.get('href') for ref in asx.findall('./Entry/ref')]
 294
 295     def _extract_items(self, playlist):
 296         return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
 297
 298     def _extract_medias(self, media_selection):
 299         error = media_selection.get('result')
 300         if error:
 301             raise BBCCoUkIE.MediaSelectionError(error)
 302         return media_selection.get('media') or []
 303
 304     def _extract_connections(self, media):
 305         return media.get('connection') or []
 306
 307     def _get_subtitles(self, media, programme_id):
 308         subtitles = {}
 309         for connection in self._extract_connections(media):
 310             cc_url = url_or_none(connection.get('href'))
 311             if not cc_url:
 312                 continue
 313             captions = self._download_xml(
 314                 cc_url, programme_id, 'Downloading captions', fatal=False)
 315             if not isinstance(captions, xml.etree.ElementTree.Element):
 316                 continue
 317             subtitles['en'] = [
 318                 {
 319                     'url': connection.get('href'),
 320                     'ext': 'ttml',
 321                 },
 322             ]
 323             break
 324         return subtitles
 325
 326     def _raise_extractor_error(self, media_selection_error):
 327         raise ExtractorError(
 328             '%s returned error: %s' % (self.IE_NAME, media_selection_error.id),
 329             expected=True)
 330
 331     def _download_media_selector(self, programme_id):
 332         last_exception = None
 333         for media_set in self._MEDIA_SETS:
 334             try:
 335                 return self._download_media_selector_url(
 336                     self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
 337             except BBCCoUkIE.MediaSelectionError as e:
 338                 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
 339                     last_exception = e
 340                     continue
 341                 self._raise_extractor_error(e)
 342         self._raise_extractor_error(last_exception)
 343
 344     def _download_media_selector_url(self, url, programme_id=None):
 345         media_selection = self._download_json(
 346             url, programme_id, 'Downloading media selection JSON',
 347             expected_status=(403, 404))
 348         return self._process_media_selector(media_selection, programme_id)
 349
 350     def _process_media_selector(self, media_selection, programme_id):
 351         formats = []
 352         subtitles = None
 353         urls = []
 354
 355         for media in self._extract_medias(media_selection):
 356             kind = media.get('kind')
 357             if kind in ('video', 'audio'):
 358                 bitrate = int_or_none(media.get('bitrate'))
 359                 encoding = media.get('encoding')
 360                 width = int_or_none(media.get('width'))
 361                 height = int_or_none(media.get('height'))
 362                 file_size = int_or_none(media.get('media_file_size'))
 363                 for connection in self._extract_connections(media):
 364                     href = connection.get('href')
 365                     if href in urls:
 366                         continue
 367                     if href:
 368                         urls.append(href)
 369                     conn_kind = connection.get('kind')
 370                     protocol = connection.get('protocol')
 371                     supplier = connection.get('supplier')
 372                     transfer_format = connection.get('transferFormat')
 373                     format_id = supplier or conn_kind or protocol
 374                     # ASX playlist
 375                     if supplier == 'asx':
 376                         for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
 377                             formats.append({
 378                                 'url': ref,
 379                                 'format_id': 'ref%s_%s' % (i, format_id),
 380                             })
 381                     elif transfer_format == 'dash':
 382                         formats.extend(self._extract_mpd_formats(
 383                             href, programme_id, mpd_id=format_id, fatal=False))
 384                     elif transfer_format == 'hls':
 385                         # TODO: let expected_status be passed into _extract_xxx_formats() instead
 386                         try:
 387                             fmts = self._extract_m3u8_formats(
 388                                 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
 389                                 m3u8_id=format_id, fatal=False)
 390                         except ExtractorError as e:
 391                             if not (isinstance(e.exc_info[1], urllib.error.HTTPError)
 392                                     and e.exc_info[1].code in (403, 404)):
 393                                 raise
 394                             fmts = []
 395                         formats.extend(fmts)
 396                     elif transfer_format == 'hds':
 397                         formats.extend(self._extract_f4m_formats(
 398                             href, programme_id, f4m_id=format_id, fatal=False))
 399                     else:
 400                         if not supplier and bitrate:
 401                             format_id += '-%d' % bitrate
 402                         fmt = {
 403                             'format_id': format_id,
 404                             'filesize': file_size,
 405                         }
 406                         if kind == 'video':
 407                             fmt.update({
 408                                 'width': width,
 409                                 'height': height,
 410                                 'tbr': bitrate,
 411                                 'vcodec': encoding,
 412                             })
 413                         else:
 414                             fmt.update({
 415                                 'abr': bitrate,
 416                                 'acodec': encoding,
 417                                 'vcodec': 'none',
 418                             })
 419                         if protocol in ('http', 'https'):
 420                             # Direct link
 421                             fmt.update({
 422                                 'url': href,
 423                             })
 424                         elif protocol == 'rtmp':
 425                             application = connection.get('application', 'ondemand')
 426                             auth_string = connection.get('authString')
 427                             identifier = connection.get('identifier')
 428                             server = connection.get('server')
 429                             fmt.update({
 430                                 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
 431                                 'play_path': identifier,
 432                                 'app': '%s?%s' % (application, auth_string),
 433                                 'page_url': 'http://www.bbc.co.uk',
 434                                 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
 435                                 'rtmp_live': False,
 436                                 'ext': 'flv',
 437                             })
 438                         else:
 439                             continue
 440                         formats.append(fmt)
 441             elif kind == 'captions':
 442                 subtitles = self.extract_subtitles(media, programme_id)
 443         return formats, subtitles
 444
 445     def _download_playlist(self, playlist_id):
 446         try:
 447             playlist = self._download_json(
 448                 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
 449                 playlist_id, 'Downloading playlist JSON')
 450             formats = []
 451             subtitles = {}
 452
 453             for version in playlist.get('allAvailableVersions', []):
 454                 smp_config = version['smpConfig']
 455                 title = smp_config['title']
 456                 description = smp_config['summary']
 457                 for item in smp_config['items']:
 458                     kind = item['kind']
 459                     if kind not in ('programme', 'radioProgramme'):
 460                         continue
 461                     programme_id = item.get('vpid')
 462                     duration = int_or_none(item.get('duration'))
 463                     version_formats, version_subtitles = self._download_media_selector(programme_id)
 464                     types = version['types']
 465                     for f in version_formats:
 466                         f['format_note'] = ', '.join(types)
 467                         if any('AudioDescribed' in x for x in types):
 468                             f['language_preference'] = -10
 469                     formats += version_formats
 470                     for tag, subformats in (version_subtitles or {}).items():
 471                         subtitles.setdefault(tag, []).extend(subformats)
 472
 473             return programme_id, title, description, duration, formats, subtitles
 474         except ExtractorError as ee:
 475             if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
 476                 raise
 477
 478         # fallback to legacy playlist
 479         return self._process_legacy_playlist(playlist_id)
 480
 481     def _process_legacy_playlist_url(self, url, display_id):
 482         playlist = self._download_legacy_playlist_url(url, display_id)
 483         return self._extract_from_legacy_playlist(playlist, display_id)
 484
 485     def _process_legacy_playlist(self, playlist_id):
 486         return self._process_legacy_playlist_url(
 487             'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
 488
 489     def _download_legacy_playlist_url(self, url, playlist_id=None):
 490         return self._download_xml(
 491             url, playlist_id, 'Downloading legacy playlist XML')
 492
 493     def _extract_from_legacy_playlist(self, playlist, playlist_id):
 494         no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
 495         if no_items is not None:
 496             reason = no_items.get('reason')
 497             if reason == 'preAvailability':
 498                 msg = 'Episode %s is not yet available' % playlist_id
 499             elif reason == 'postAvailability':
 500                 msg = 'Episode %s is no longer available' % playlist_id
 501             elif reason == 'noMedia':
 502                 msg = 'Episode %s is not currently available' % playlist_id
 503             else:
 504                 msg = 'Episode %s is not available: %s' % (playlist_id, reason)
 505             raise ExtractorError(msg, expected=True)
 506
 507         for item in self._extract_items(playlist):
 508             kind = item.get('kind')
 509             if kind not in ('programme', 'radioProgramme'):
 510                 continue
 511             title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
 512             description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
 513             description = description_el.text if description_el is not None else None
 514
 515             def get_programme_id(item):
 516                 def get_from_attributes(item):
 517                     for p in ('identifier', 'group'):
 518                         value = item.get(p)
 519                         if value and re.match(r'^[pb][\da-z]{7}$', value):
 520                             return value
 521                 get_from_attributes(item)
 522                 mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
 523                 if mediator is not None:
 524                     return get_from_attributes(mediator)
 525
 526             programme_id = get_programme_id(item)
 527             duration = int_or_none(item.get('duration'))
 528
 529             if programme_id:
 530                 formats, subtitles = self._download_media_selector(programme_id)
 531             else:
 532                 formats, subtitles = self._process_media_selector(item, playlist_id)
 533                 programme_id = playlist_id
 534
 535         return programme_id, title, description, duration, formats, subtitles
 536
 537     def _real_extract(self, url):
 538         group_id = self._match_id(url)
 539
 540         webpage = self._download_webpage(url, group_id, 'Downloading video page')
 541
 542         error = self._search_regex(
 543             r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
 544             webpage, 'error', default=None)
 545         if error:
 546             raise ExtractorError(error, expected=True)
 547
 548         programme_id = None
 549         duration = None
 550
 551         tviplayer = self._search_regex(
 552             r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
 553             webpage, 'player', default=None)
 554
 555         if tviplayer:
 556             player = self._parse_json(tviplayer, group_id).get('player', {})
 557             duration = int_or_none(player.get('duration'))
 558             programme_id = player.get('vpid')
 559
 560         if not programme_id:
 561             programme_id = self._search_regex(
 562                 r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
 563
 564         if programme_id:
 565             formats, subtitles = self._download_media_selector(programme_id)
 566             title = self._og_search_title(webpage, default=None) or self._html_search_regex(
 567                 (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
 568                  r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
 569             description = self._search_regex(
 570                 (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
 571                  r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
 572                 webpage, 'description', default=None)
 573             if not description:
 574                 description = self._html_search_meta('description', webpage)
 575         else:
 576             programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
 577
 578         self._sort_formats(formats)
 579
 580         return {
 581             'id': programme_id,
 582             'title': title,
 583             'description': description,
 584             'thumbnail': self._og_search_thumbnail(webpage, default=None),
 585             'duration': duration,
 586             'formats': formats,
 587             'subtitles': subtitles,
 588         }
 589
 590
 591 class BBCIE(BBCCoUkIE):
 592     IE_NAME = 'bbc'
 593     IE_DESC = 'BBC'
 594     _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
 595
 596     _MEDIA_SETS = [
 597         'pc',
 598         'mobile-tablet-main',
 599     ]
 600
 601     _TESTS = [{
 602         # article with multiple videos embedded with data-playable containing vpids
 603         'url': 'http://www.bbc.com/news/world-europe-32668511',
 604         'info_dict': {
 605             'id': 'world-europe-32668511',
 606             'title': 'Russia stages massive WW2 parade',
 607             'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
 608         },
 609         'playlist_count': 2,
 610     }, {
 611         # article with multiple videos embedded with data-playable (more videos)
 612         'url': 'http://www.bbc.com/news/business-28299555',
 613         'info_dict': {
 614             'id': 'business-28299555',
 615             'title': 'Farnborough Airshow: Video highlights',
 616             'description': 'BBC reports and video highlights at the Farnborough Airshow.',
 617         },
 618         'playlist_count': 9,
 619         'skip': 'Save time',
 620     }, {
 621         # article with multiple videos embedded with `new SMP()`
 622         # broken
 623         'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
 624         'info_dict': {
 625             'id': '3662a707-0af9-3149-963f-47bea720b460',
 626             'title': 'BUGGER',
 627         },
 628         'playlist_count': 18,
 629     }, {
 630         # single video embedded with data-playable containing vpid
 631         'url': 'http://www.bbc.com/news/world-europe-32041533',
 632         'info_dict': {
 633             'id': 'p02mprgb',
 634             'ext': 'mp4',
 635             'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
 636             'description': 'md5:2868290467291b37feda7863f7a83f54',
 637             'duration': 47,
 638             'timestamp': 1427219242,
 639             'upload_date': '20150324',
 640         },
 641         'params': {
 642             # rtmp download
 643             'skip_download': True,
 644         }
 645     }, {
 646         # article with single video embedded with data-playable containing XML playlist
 647         # with direct video links as progressiveDownloadUrl (for now these are extracted)
 648         # and playlist with f4m and m3u8 as streamingUrl
 649         'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
 650         'info_dict': {
 651             'id': '150615_telabyad_kentin_cogu',
 652             'ext': 'mp4',
 653             'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
 654             'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
 655             'timestamp': 1434397334,
 656             'upload_date': '20150615',
 657         },
 658         'params': {
 659             'skip_download': True,
 660         }
 661     }, {
 662         # single video embedded with data-playable containing XML playlists (regional section)
 663         'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
 664         'info_dict': {
 665             'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
 666             'ext': 'mp4',
 667             'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
 668             'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
 669             'timestamp': 1434713142,
 670             'upload_date': '20150619',
 671         },
 672         'params': {
 673             'skip_download': True,
 674         }
 675     }, {
 676         # single video from video playlist embedded with vxp-playlist-data JSON
 677         'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
 678         'info_dict': {
 679             'id': 'p02w6qjc',
 680             'ext': 'mp4',
 681             'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
 682             'duration': 56,
 683             'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
 684         },
 685         'params': {
 686             'skip_download': True,
 687         }
 688     }, {
 689         # single video story with digitalData
 690         'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
 691         'info_dict': {
 692             'id': 'p02q6gc4',
 693             'ext': 'flv',
 694             'title': 'Sri Lanka’s spicy secret',
 695             'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
 696             'timestamp': 1437674293,
 697             'upload_date': '20150723',
 698         },
 699         'params': {
 700             # rtmp download
 701             'skip_download': True,
 702         }
 703     }, {
 704         # single video story without digitalData
 705         'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
 706         'info_dict': {
 707             'id': 'p018zqqg',
 708             'ext': 'mp4',
 709             'title': 'Hyundai Santa Fe Sport: Rock star',
 710             'description': 'md5:b042a26142c4154a6e472933cf20793d',
 711             'timestamp': 1415867444,
 712             'upload_date': '20141113',
 713         },
 714         'params': {
 715             # rtmp download
 716             'skip_download': True,
 717         }
 718     }, {
 719         # single video embedded with Morph
 720         'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
 721         'info_dict': {
 722             'id': 'p041vhd0',
 723             'ext': 'mp4',
 724             'title': "Nigeria v Japan - Men's First Round",
 725             'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
 726             'duration': 7980,
 727             'uploader': 'BBC Sport',
 728             'uploader_id': 'bbc_sport',
 729         },
 730         'params': {
 731             # m3u8 download
 732             'skip_download': True,
 733         },
 734         'skip': 'Georestricted to UK',
 735     }, {
 736         # single video with playlist.sxml URL in playlist param
 737         'url': 'http://www.bbc.com/sport/0/football/33653409',
 738         'info_dict': {
 739             'id': 'p02xycnp',
 740             'ext': 'mp4',
 741             'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
 742             'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
 743             'duration': 140,
 744         },
 745         'params': {
 746             # rtmp download
 747             'skip_download': True,
 748         }
 749     }, {
 750         # article with multiple videos embedded with playlist.sxml in playlist param
 751         'url': 'http://www.bbc.com/sport/0/football/34475836',
 752         'info_dict': {
 753             'id': '34475836',
 754             'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
 755             'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
 756         },
 757         'playlist_count': 3,
 758     }, {
 759         # school report article with single video
 760         'url': 'http://www.bbc.co.uk/schoolreport/35744779',
 761         'info_dict': {
 762             'id': '35744779',
 763             'title': 'School which breaks down barriers in Jerusalem',
 764         },
 765         'playlist_count': 1,
 766     }, {
 767         # single video with playlist URL from weather section
 768         'url': 'http://www.bbc.com/weather/features/33601775',
 769         'only_matching': True,
 770     }, {
 771         # custom redirection to www.bbc.com
 772         # also, video with window.__INITIAL_DATA__
 773         'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
 774         'info_dict': {
 775             'id': 'p02xzws1',
 776             'ext': 'mp4',
 777             'title': "Pluto may have 'nitrogen glaciers'",
 778             'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
 779             'thumbnail': r're:https?://.+/.+\.jpg',
 780             'timestamp': 1437785037,
 781             'upload_date': '20150725',
 782         },
 783     }, {
 784         # video with window.__INITIAL_DATA__ and value as JSON string
 785         'url': 'https://www.bbc.com/news/av/world-europe-59468682',
 786         'info_dict': {
 787             'id': 'p0b71qth',
 788             'ext': 'mp4',
 789             'title': 'Why France is making this woman a national hero',
 790             'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
 791             'thumbnail': r're:https?://.+/.+\.jpg',
 792             'timestamp': 1638230731,
 793             'upload_date': '20211130',
 794         },
 795     }, {
 796         # single video article embedded with data-media-vpid
 797         'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
 798         'only_matching': True,
 799     }, {
 800         # bbcthreeConfig
 801         'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
 802         'info_dict': {
 803             'id': 'p06556y7',
 804             'ext': 'mp4',
 805             'title': 'Things Not To Say to people that live on council estates',
 806             'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
 807             'duration': 360,
 808             'thumbnail': r're:https?://.+/.+\.jpg',
 809         },
 810     }, {
 811         # window.__PRELOADED_STATE__
 812         'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
 813         'info_dict': {
 814             'id': 'b0b9z4vz',
 815             'ext': 'mp4',
 816             'title': 'Prom 6: An American in Paris and Turangalila',
 817             'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
 818             'uploader': 'Radio 3',
 819             'uploader_id': 'bbc_radio_three',
 820         },
 821     }, {
 822         'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
 823         'info_dict': {
 824             'id': 'p06w9tws',
 825             'ext': 'mp4',
 826             'title': 'md5:2fabf12a726603193a2879a055f72514',
 827             'description': 'Learn English words and phrases from this story',
 828         },
 829         'add_ie': [BBCCoUkIE.ie_key()],
 830     }, {
 831         # BBC Reel
 832         'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
 833         'info_dict': {
 834             'id': 'p07c6sb9',
 835             'ext': 'mp4',
 836             'title': 'How positive thinking is harming your happiness',
 837             'alt_title': 'The downsides of positive thinking',
 838             'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
 839             'duration': 235,
 840             'thumbnail': r're:https?://.+/p07c9dsr.jpg',
 841             'upload_date': '20190604',
 842             'categories': ['Psychology'],
 843         },
 844     }]
 845
 846     @classmethod
 847     def suitable(cls, url):
 848         EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
 849         return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
 850                 else super(BBCIE, cls).suitable(url))
 851
 852     def _extract_from_media_meta(self, media_meta, video_id):
 853         # Direct links to media in media metadata (e.g.
 854         # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
 855         # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
 856         source_files = media_meta.get('sourceFiles')
 857         if source_files:
 858             return [{
 859                 'url': f['url'],
 860                 'format_id': format_id,
 861                 'ext': f.get('encoding'),
 862                 'tbr': float_or_none(f.get('bitrate'), 1000),
 863                 'filesize': int_or_none(f.get('filesize')),
 864             } for format_id, f in source_files.items() if f.get('url')], []
 865
 866         programme_id = media_meta.get('externalId')
 867         if programme_id:
 868             return self._download_media_selector(programme_id)
 869
 870         # Process playlist.sxml as legacy playlist
 871         href = media_meta.get('href')
 872         if href:
 873             playlist = self._download_legacy_playlist_url(href)
 874             _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
 875             return formats, subtitles
 876
 877         return [], []
 878
 879     def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
 880         programme_id, title, description, duration, formats, subtitles = \
 881             self._process_legacy_playlist_url(url, playlist_id)
 882         self._sort_formats(formats)
 883         return {
 884             'id': programme_id,
 885             'title': title,
 886             'description': description,
 887             'duration': duration,
 888             'timestamp': timestamp,
 889             'formats': formats,
 890             'subtitles': subtitles,
 891         }
 892
 893     def _real_extract(self, url):
 894         playlist_id = self._match_id(url)
 895
 896         webpage = self._download_webpage(url, playlist_id)
 897
 898         json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
 899         timestamp = json_ld_info.get('timestamp')
 900
 901         playlist_title = json_ld_info.get('title')
 902         if not playlist_title:
 903             playlist_title = (self._og_search_title(webpage, default=None)
 904                               or self._html_extract_title(webpage, 'playlist title', default=None))
 905             if playlist_title:
 906                 playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
 907
 908         playlist_description = json_ld_info.get(
 909             'description') or self._og_search_description(webpage, default=None)
 910
 911         if not timestamp:
 912             timestamp = parse_iso8601(self._search_regex(
 913                 [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
 914                  r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
 915                  r'"datePublished":\s*"([^"]+)'],
 916                 webpage, 'date', default=None))
 917
 918         entries = []
 919
 920         # article with multiple videos embedded with playlist.sxml (e.g.
 921         # http://www.bbc.com/sport/0/football/34475836)
 922         playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
 923         playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
 924         if playlists:
 925             entries = [
 926                 self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
 927                 for playlist_url in playlists]
 928
 929         # news article with multiple videos embedded with data-playable
 930         data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
 931         if data_playables:
 932             for _, data_playable_json in data_playables:
 933                 data_playable = self._parse_json(
 934                     unescapeHTML(data_playable_json), playlist_id, fatal=False)
 935                 if not data_playable:
 936                     continue
 937                 settings = data_playable.get('settings', {})
 938                 if settings:
 939                     # data-playable with video vpid in settings.playlistObject.items (e.g.
 940                     # http://www.bbc.com/news/world-us-canada-34473351)
 941                     playlist_object = settings.get('playlistObject', {})
 942                     if playlist_object:
 943                         items = playlist_object.get('items')
 944                         if items and isinstance(items, list):
 945                             title = playlist_object['title']
 946                             description = playlist_object.get('summary')
 947                             duration = int_or_none(items[0].get('duration'))
 948                             programme_id = items[0].get('vpid')
 949                             formats, subtitles = self._download_media_selector(programme_id)
 950                             self._sort_formats(formats)
 951                             entries.append({
 952                                 'id': programme_id,
 953                                 'title': title,
 954                                 'description': description,
 955                                 'timestamp': timestamp,
 956                                 'duration': duration,
 957                                 'formats': formats,
 958                                 'subtitles': subtitles,
 959                             })
 960                     else:
 961                         # data-playable without vpid but with a playlist.sxml URLs
 962                         # in otherSettings.playlist (e.g.
 963                         # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
 964                         playlist = data_playable.get('otherSettings', {}).get('playlist', {})
 965                         if playlist:
 966                             entry = None
 967                             for key in ('streaming', 'progressiveDownload'):
 968                                 playlist_url = playlist.get('%sUrl' % key)
 969                                 if not playlist_url:
 970                                     continue
 971                                 try:
 972                                     info = self._extract_from_playlist_sxml(
 973                                         playlist_url, playlist_id, timestamp)
 974                                     if not entry:
 975                                         entry = info
 976                                     else:
 977                                         entry['title'] = info['title']
 978                                         entry['formats'].extend(info['formats'])
 979                                 except ExtractorError as e:
 980                                     # Some playlist URL may fail with 500, at the same time
 981                                     # the other one may work fine (e.g.
 982                                     # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
 983                                     if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
 984                                         continue
 985                                     raise
 986                             if entry:
 987                                 self._sort_formats(entry['formats'])
 988                                 entries.append(entry)
 989
 990         if entries:
 991             return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
 992
 993         # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
 994         group_id = self._search_regex(
 995             r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
 996             webpage, 'group id', default=None)
 997         if group_id:
 998             return self.url_result(
 999                 'https://www.bbc.co.uk/programmes/%s' % group_id,
1000                 ie=BBCCoUkIE.ie_key())
1001
1002         # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
1003         programme_id = self._search_regex(
1004             [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
1005              r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
1006              r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
1007             webpage, 'vpid', default=None)
1008
1009         if programme_id:
1010             formats, subtitles = self._download_media_selector(programme_id)
1011             self._sort_formats(formats)
1012             # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
1013             digital_data = self._parse_json(
1014                 self._search_regex(
1015                     r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
1016                 programme_id, fatal=False)
1017             page_info = digital_data.get('page', {}).get('pageInfo', {})
1018             title = page_info.get('pageName') or self._og_search_title(webpage)
1019             description = page_info.get('description') or self._og_search_description(webpage)
1020             timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
1021             return {
1022                 'id': programme_id,
1023                 'title': title,
1024                 'description': description,
1025                 'timestamp': timestamp,
1026                 'formats': formats,
1027                 'subtitles': subtitles,
1028             }
1029
1030         # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
1031         initial_data = self._parse_json(self._html_search_regex(
1032             r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
1033             webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
1034         if initial_data:
1035             init_data = try_get(
1036                 initial_data, lambda x: x['initData']['items'][0], dict) or {}
1037             smp_data = init_data.get('smpData') or {}
1038             clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
1039             version_id = clip_data.get('versionID')
1040             if version_id:
1041                 title = smp_data['title']
1042                 formats, subtitles = self._download_media_selector(version_id)
1043                 self._sort_formats(formats)
1044                 image_url = smp_data.get('holdingImageURL')
1045                 display_date = init_data.get('displayDate')
1046                 topic_title = init_data.get('topicTitle')
1047
1048                 return {
1049                     'id': version_id,
1050                     'title': title,
1051                     'formats': formats,
1052                     'alt_title': init_data.get('shortTitle'),
1053                     'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
1054                     'description': smp_data.get('summary') or init_data.get('shortSummary'),
1055                     'upload_date': display_date.replace('-', '') if display_date else None,
1056                     'subtitles': subtitles,
1057                     'duration': int_or_none(clip_data.get('duration')),
1058                     'categories': [topic_title] if topic_title else None,
1059                 }
1060
1061         # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
1062         # There are several setPayload calls may be present but the video
1063         # seems to be always related to the first one
1064         morph_payload = self._parse_json(
1065             self._search_regex(
1066                 r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
1067                 webpage, 'morph payload', default='{}'),
1068             playlist_id, fatal=False)
1069         if morph_payload:
1070             components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
1071             for component in components:
1072                 if not isinstance(component, dict):
1073                     continue
1074                 lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
1075                 if not lead_media:
1076                     continue
1077                 identifiers = lead_media.get('identifiers')
1078                 if not identifiers or not isinstance(identifiers, dict):
1079                     continue
1080                 programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
1081                 if not programme_id:
1082                     continue
1083                 title = lead_media.get('title') or self._og_search_title(webpage)
1084                 formats, subtitles = self._download_media_selector(programme_id)
1085                 self._sort_formats(formats)
1086                 description = lead_media.get('summary')
1087                 uploader = lead_media.get('masterBrand')
1088                 uploader_id = lead_media.get('mid')
1089                 duration = None
1090                 duration_d = lead_media.get('duration')
1091                 if isinstance(duration_d, dict):
1092                     duration = parse_duration(dict_get(
1093                         duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
1094                 return {
1095                     'id': programme_id,
1096                     'title': title,
1097                     'description': description,
1098                     'duration': duration,
1099                     'uploader': uploader,
1100                     'uploader_id': uploader_id,
1101                     'formats': formats,
1102                     'subtitles': subtitles,
1103                 }
1104
1105         preload_state = self._parse_json(self._search_regex(
1106             r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
1107             'preload state', default='{}'), playlist_id, fatal=False)
1108         if preload_state:
1109             current_programme = preload_state.get('programmes', {}).get('current') or {}
1110             programme_id = current_programme.get('id')
1111             if current_programme and programme_id and current_programme.get('type') == 'playable_item':
1112                 title = current_programme.get('titles', {}).get('tertiary') or playlist_title
1113                 formats, subtitles = self._download_media_selector(programme_id)
1114                 self._sort_formats(formats)
1115                 synopses = current_programme.get('synopses') or {}
1116                 network = current_programme.get('network') or {}
1117                 duration = int_or_none(
1118                     current_programme.get('duration', {}).get('value'))
1119                 thumbnail = None
1120                 image_url = current_programme.get('image_url')
1121                 if image_url:
1122                     thumbnail = image_url.replace('{recipe}', 'raw')
1123                 return {
1124                     'id': programme_id,
1125                     'title': title,
1126                     'description': dict_get(synopses, ('long', 'medium', 'short')),
1127                     'thumbnail': thumbnail,
1128                     'duration': duration,
1129                     'uploader': network.get('short_title'),
1130                     'uploader_id': network.get('id'),
1131                     'formats': formats,
1132                     'subtitles': subtitles,
1133                 }
1134
1135         bbc3_config = self._parse_json(
1136             self._search_regex(
1137                 r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
1138                 'bbcthree config', default='{}'),
1139             playlist_id, transform_source=js_to_json, fatal=False) or {}
1140         payload = bbc3_config.get('payload') or {}
1141         if payload:
1142             clip = payload.get('currentClip') or {}
1143             clip_vpid = clip.get('vpid')
1144             clip_title = clip.get('title')
1145             if clip_vpid and clip_title:
1146                 formats, subtitles = self._download_media_selector(clip_vpid)
1147                 self._sort_formats(formats)
1148                 return {
1149                     'id': clip_vpid,
1150                     'title': clip_title,
1151                     'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
1152                     'description': clip.get('description'),
1153                     'duration': parse_duration(clip.get('duration')),
1154                     'formats': formats,
1155                     'subtitles': subtitles,
1156                 }
1157             bbc3_playlist = try_get(
1158                 payload, lambda x: x['content']['bbcMedia']['playlist'],
1159                 dict)
1160             if bbc3_playlist:
1161                 playlist_title = bbc3_playlist.get('title') or playlist_title
1162                 thumbnail = bbc3_playlist.get('holdingImageURL')
1163                 entries = []
1164                 for bbc3_item in bbc3_playlist['items']:
1165                     programme_id = bbc3_item.get('versionID')
1166                     if not programme_id:
1167                         continue
1168                     formats, subtitles = self._download_media_selector(programme_id)
1169                     self._sort_formats(formats)
1170                     entries.append({
1171                         'id': programme_id,
1172                         'title': playlist_title,
1173                         'thumbnail': thumbnail,
1174                         'timestamp': timestamp,
1175                         'formats': formats,
1176                         'subtitles': subtitles,
1177                     })
1178                 return self.playlist_result(
1179                     entries, playlist_id, playlist_title, playlist_description)
1180
1181         initial_data = self._search_regex(
1182             r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
1183             'quoted preload state', default=None)
1184         if initial_data is None:
1185             initial_data = self._search_regex(
1186                 r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
1187                 'preload state', default={})
1188         else:
1189             initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
1190         initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
1191         if initial_data:
1192             def parse_media(media):
1193                 if not media:
1194                     return
1195                 for item in (try_get(media, lambda x: x['media']['items'], list) or []):
1196                     item_id = item.get('id')
1197                     item_title = item.get('title')
1198                     if not (item_id and item_title):
1199                         continue
1200                     formats, subtitles = self._download_media_selector(item_id)
1201                     self._sort_formats(formats)
1202                     item_desc = None
1203                     blocks = try_get(media, lambda x: x['summary']['blocks'], list)
1204                     if blocks:
1205                         summary = []
1206                         for block in blocks:
1207                             text = try_get(block, lambda x: x['model']['text'], compat_str)
1208                             if text:
1209                                 summary.append(text)
1210                         if summary:
1211                             item_desc = '\n\n'.join(summary)
1212                     item_time = None
1213                     for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
1214                         if try_get(meta, lambda x: x['label']) == 'Published':
1215                             item_time = unified_timestamp(meta.get('timestamp'))
1216                             break
1217                     entries.append({
1218                         'id': item_id,
1219                         'title': item_title,
1220                         'thumbnail': item.get('holdingImageUrl'),
1221                         'formats': formats,
1222                         'subtitles': subtitles,
1223                         'timestamp': item_time,
1224                         'description': strip_or_none(item_desc),
1225                     })
1226             for resp in (initial_data.get('data') or {}).values():
1227                 name = resp.get('name')
1228                 if name == 'media-experience':
1229                     parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
1230                 elif name == 'article':
1231                     for block in (try_get(resp,
1232                                           (lambda x: x['data']['blocks'],
1233                                            lambda x: x['data']['content']['model']['blocks'],),
1234                                           list) or []):
1235                         if block.get('type') not in ['media', 'video']:
1236                             continue
1237                         parse_media(block.get('model'))
1238             return self.playlist_result(
1239                 entries, playlist_id, playlist_title, playlist_description)
1240
1241         def extract_all(pattern):
1242             return list(filter(None, map(
1243                 lambda s: self._parse_json(s, playlist_id, fatal=False),
1244                 re.findall(pattern, webpage))))
1245
1246         # Multiple video article (e.g.
1247         # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
1248         EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
1249         entries = []
1250         for match in extract_all(r'new\s+SMP\(({.+?})\)'):
1251             embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
1252             if embed_url and re.match(EMBED_URL, embed_url):
1253                 entries.append(embed_url)
1254         entries.extend(re.findall(
1255             r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
1256         if entries:
1257             return self.playlist_result(
1258                 [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
1259                 playlist_id, playlist_title, playlist_description)
1260
1261         # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
1262         medias = extract_all(r"data-media-meta='({[^']+})'")
1263
1264         if not medias:
1265             # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
1266             media_asset = self._search_regex(
1267                 r'mediaAssetPage\.init\(\s*({.+?}), "/',
1268                 webpage, 'media asset', default=None)
1269             if media_asset:
1270                 media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
1271                 medias = []
1272                 for video in media_asset_page.get('videos', {}).values():
1273                     medias.extend(video.values())
1274
1275         if not medias:
1276             # Multiple video playlist with single `now playing` entry (e.g.
1277             # http://www.bbc.com/news/video_and_audio/must_see/33767813)
1278             vxp_playlist = self._parse_json(
1279                 self._search_regex(
1280                     r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
1281                     webpage, 'playlist data'),
1282                 playlist_id)
1283             playlist_medias = []
1284             for item in vxp_playlist:
1285                 media = item.get('media')
1286                 if not media:
1287                     continue
1288                 playlist_medias.append(media)
1289                 # Download single video if found media with asset id matching the video id from URL
1290                 if item.get('advert', {}).get('assetId') == playlist_id:
1291                     medias = [media]
1292                     break
1293             # Fallback to the whole playlist
1294             if not medias:
1295                 medias = playlist_medias
1296
1297         entries = []
1298         for num, media_meta in enumerate(medias, start=1):
1299             formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
1300             if not formats and not self.get_param('ignore_no_formats'):
1301                 continue
1302             self._sort_formats(formats)
1303
1304             video_id = media_meta.get('externalId')
1305             if not video_id:
1306                 video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
1307
1308             title = media_meta.get('caption')
1309             if not title:
1310                 title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
1311
1312             duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
1313
1314             images = []
1315             for image in media_meta.get('images', {}).values():
1316                 images.extend(image.values())
1317             if 'image' in media_meta:
1318                 images.append(media_meta['image'])
1319
1320             thumbnails = [{
1321                 'url': image.get('href'),
1322                 'width': int_or_none(image.get('width')),
1323                 'height': int_or_none(image.get('height')),
1324             } for image in images]
1325
1326             entries.append({
1327                 'id': video_id,
1328                 'title': title,
1329                 'thumbnails': thumbnails,
1330                 'duration': duration,
1331                 'timestamp': timestamp,
1332                 'formats': formats,
1333                 'subtitles': subtitles,
1334             })
1335
1336         return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
1337
1338
class BBCCoUkArticleIE(InfoExtractor):
    """Extractor for bbc.co.uk programme article pages.

    An article page is returned as a playlist with one entry per clip
    referenced in the page markup.
    """
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Build a playlist from the clips referenced by the article at *url*.

        Returns the result of ``playlist_result`` with the article id, the
        Open Graph title and the (whitespace-stripped) Open Graph description.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # The description lookup is non-fatal and may yield None when the page
        # has no og:description; strip_or_none avoids calling .strip() on None.
        description = strip_or_none(self._og_search_description(webpage))

        # Each clip is announced by a <div typeof="Clip" resource="..."> element
        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
1367
1368
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared logic for paginated bbc.co.uk playlist pages.

    The methods below read ``_URL_TEMPLATE`` and ``_VIDEO_ID_TEMPLATE`` and
    call ``_extract_title_and_description()``; none of these are defined
    here, so concrete subclasses have to supply them.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a url result for every programme id found on the playlist.

        Starts with the already downloaded *webpage* and then follows the
        "next" pagination link until there is none.  When *url* carries an
        explicit ``page`` query parameter only that single page is processed.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # The first page is handed in by the caller, so downloads start at 2
        for page_num in itertools.count(2):
            for video_id in re.findall(
                    self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # Pass note/errnote by keyword: the fourth positional parameter of
            # _download_webpage is the error note, which must be a message
            # string rather than the bare page number.
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                note='Downloading page %d' % page_num,
                errnote='Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Return the playlist (lazily generated entries) for *url*."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
1399
1400
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
    """Shared logic for API-backed BBC iPlayer playlists.

    The methods below rely on ``_PAGE_SIZE``, ``_DESCRIPTION_KEY``,
    ``_call_api``, ``_get_elements``, ``_get_episode``, ``_get_episode_image``,
    ``_get_episode_field``, ``_get_playlist_data`` and ``_get_playlist_title``,
    none of which are defined here; concrete subclasses have to supply them.
    """
    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    @staticmethod
    def _get_default(episode, key, default_key='default'):
        """Return ``episode[key][default_key]``, or None if the lookup fails."""
        return try_get(episode, lambda x: x[key][default_key])

    def _get_description(self, data):
        """Return the longest available synopsis ('large', 'medium', 'small')."""
        synopsis = data.get(self._DESCRIPTION_KEY) or {}
        return dict_get(synopsis, ('large', 'medium', 'small'))

    def _fetch_page(self, programme_id, per_page, series_id, page):
        """Yield url-type entries for one page of episodes.

        *page* is a zero-based index; the API is queried with ``page + 1``.
        Elements without an episode id are skipped.
        """
        elements = self._get_elements(self._call_api(
            programme_id, per_page, page + 1, series_id))
        for element in elements:
            episode = self._get_episode(element)
            episode_id = episode.get('id')
            if not episode_id:
                continue
            thumbnail = None
            image = self._get_episode_image(episode)
            if image:
                # The image URL contains a '{recipe}' placeholder;
                # presumably 'raw' selects the unscaled image (TODO confirm)
                thumbnail = image.replace('{recipe}', 'raw')
            category = self._get_default(episode, 'labels', 'category')
            yield {
                '_type': 'url',
                'id': episode_id,
                'title': self._get_episode_field(episode, 'subtitle'),
                'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                'thumbnail': thumbnail,
                'description': self._get_description(episode),
                'categories': [category] if category else None,
                'series': self._get_episode_field(episode, 'title'),
                'ie_key': BBCCoUkIE.ie_key(),
            }

    def _real_extract(self, url):
        """Return the playlist for *url*.

        With a valid ``page`` query parameter only that page is returned;
        otherwise all pages are fetched on demand.
        """
        pid = self._match_id(url)
        qs = parse_qs(url)
        series_id = qs.get('seriesId', [None])[0]
        # Parse defensively: a malformed or non-positive `page` value must not
        # raise an uncaught ValueError, so it is handled like a missing one.
        page = int_or_none(qs.get('page', [None])[0])
        if page is not None and page < 1:
            page = None
        # NOTE(review): 36 presumably mirrors the website's own page size - confirm
        per_page = 36 if page else self._PAGE_SIZE
        fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id)
        entries = fetch_page(page - 1) if page else OnDemandPagedList(fetch_page, self._PAGE_SIZE)
        # A one-item request is enough to obtain the playlist level metadata
        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
        return self.playlist_result(
            entries, pid, self._get_playlist_title(playlist_data),
            self._get_description(playlist_data))
1449
1450
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer "episodes" playlists, served by the graph.ibl.api.bbc.co.uk endpoint."""
    IE_NAME = 'bbc.co.uk:iplayer:episodes'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
    _PAGE_SIZE = 100
    _DESCRIPTION_KEY = 'synopsis'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
        },
        'playlist_mincount': 8,
    }, {
        # all seasons
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 10,
    }, {
        # explicit season
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 5,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 37,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 1,
    }]

    def _get_episode_image(self, episode):
        """Return the 'default' variant of the episode image, if present."""
        return self._get_default(episode, 'image', 'default')

    def _get_episode_field(self, episode, field):
        """Return the 'default' variant of *field* on the episode, if present."""
        return self._get_default(episode, field, 'default')

    @staticmethod
    def _get_elements(data):
        """Return the list of result elements from a programme payload."""
        entities = data['entities']
        return entities['results']

    @staticmethod
    def _get_episode(element):
        """Return the episode object nested in *element* (empty dict if none)."""
        episode = element.get('episode')
        return episode if episode else {}

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """POST the persisted query for *pid* and return the programme object."""
        variables = {'id': pid, 'page': page, 'perPage': per_page}
        if series_id:
            # Restrict the listing to a single series ("slice")
            variables.update(sliceId=series_id)
        payload = json.dumps({
            'id': '5692d93d5aac8d796a0305e895e61551',
            'variables': variables,
        }).encode('utf-8')
        response = self._download_json(
            'https://graph.ibl.api.bbc.co.uk/', pid,
            headers={'Content-Type': 'application/json'}, data=payload)
        return response['data']['programme']

    @staticmethod
    def _get_playlist_data(data):
        """The programme object itself carries the playlist metadata."""
        return data

    def _get_playlist_title(self, data):
        """Return the 'default' variant of the programme title, if present."""
        return self._get_default(data, 'title', 'default')
1538
1539
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer "group" playlists, served by the ibl.api.bbc.co.uk v1 endpoint."""
    IE_NAME = 'bbc.co.uk:iplayer:group'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
    _PAGE_SIZE = 200
    _DESCRIPTION_KEY = 'synopses'
    _TESTS = [{
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 47,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 11,
    }]

    def _get_episode_image(self, episode):
        """Return the 'standard' entry of the episode's images, if present."""
        return self._get_default(episode, key='images', default_key='standard')

    def _get_episode_field(self, episode, field):
        """Return *field* directly from the episode object."""
        return episode.get(field)

    @staticmethod
    def _get_elements(data):
        """Return the list of elements from a group payload."""
        return data['elements']

    @staticmethod
    def _get_episode(element):
        """Group elements are used as the episode objects unchanged."""
        return element

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Fetch one page of the group's episodes; *series_id* is not used."""
        api_url = 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid
        paging = {
            'page': page,
            'per_page': per_page,
        }
        response = self._download_json(api_url, pid, query=paging)
        return response['group_episodes']

    @staticmethod
    def _get_playlist_data(data):
        """Return the 'group' object holding the playlist metadata."""
        return data['group']

    def _get_playlist_title(self, data):
        """Return the group's title, if present."""
        return data.get('title')
1602
1603
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlists for the episodes/broadcasts/clips listings of a bbc.co.uk programme."""
    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Pattern locating programme ids in the listing markup, and the URL each id maps to
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return the ``(title, description)`` pair from the page's Open Graph data."""
        # The title lookup is explicitly non-fatal
        og_title = self._og_search_title(webpage, fatal=False)
        og_description = self._og_search_description(webpage)
        return og_title, og_description