yt_dlp/extractor/bbc.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import functools
   5 import itertools
   6 import json
   7 import re
   8
   9 from .common import InfoExtractor
  10 from ..compat import (
  11     compat_etree_Element,
  12     compat_HTTPError,
  13     compat_str,
  14     compat_urllib_error,
  15     compat_urlparse,
  16 )
  17 from ..utils import (
  18     ExtractorError,
  19     OnDemandPagedList,
  20     clean_html,
  21     dict_get,
  22     float_or_none,
  23     get_element_by_class,
  24     int_or_none,
  25     js_to_json,
  26     parse_duration,
  27     parse_iso8601,
  28     parse_qs,
  29     strip_or_none,
  30     try_get,
  31     unescapeHTML,
  32     unified_timestamp,
  33     url_or_none,
  34     urlencode_postdata,
  35     urljoin,
  36 )
  37
  38
  39 class BBCCoUkIE(InfoExtractor):
    # Name and human-readable description of this extractor.
    IE_NAME = 'bbc.co.uk'
    IE_DESC = 'BBC iPlayer'
    # Programme id pattern: one of p/b/m/l followed by exactly 7 characters
    # from [0-9a-z], or w followed by 7 to 14 such characters.
    _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
    # Verbose-mode URL pattern: a bbc.co.uk page under one of the listed path
    # prefixes followed by an id (captured as the ``id`` group). The negative
    # lookaheads exclude ``programmes/articles/`` paths and ids that are
    # followed by ``/episodes``, ``/broadcasts`` or ``/clips``.
    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?bbc\.co\.uk/
                        (?:
                            programmes/(?!articles/)|
                            iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
                            music/(?:clips|audiovideo/popular)[/#]|
                            radio/player/|
                            sounds/play/|
                            events/[^/]+/play/[^/]+/
                        )
                        (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
                    ''' % _ID_REGEX

    # Sign-in page that _login() downloads; also the default POST target and
    # the Referer sent with the login request.
    _LOGIN_URL = 'https://account.bbc.com/signin'
    # NOTE(review): presumably the .netrc machine name consulted by
    # _get_login_info() - confirm against InfoExtractor.
    _NETRC_MACHINE = 'bbc'

    # Media selector endpoint; _download_media_selector() fills the two
    # placeholders with (media set, programme id).
    _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
    # Media sets tried in order by _download_media_selector().
    _MEDIA_SETS = [
        # Provides HQ HLS streams with even better quality than the pc mediaset but
        # fails with geolocation in some cases even when the programme is not geo
        # restricted at all (e.g. http://www.bbc.co.uk/programmes/b06bp7lf).
        # It may also fail with selectionunavailable.
        'iptv-all',
        'pc',
    ]

    # XML namespace of the legacy playlist elements looked up by
    # _extract_items() and _extract_from_legacy_playlist().
    _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
  70
  71     _TESTS = [
  72         {
  73             'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
  74             'info_dict': {
  75                 'id': 'b039d07m',
  76                 'ext': 'flv',
  77                 'title': 'Kaleidoscope, Leonard Cohen',
  78                 'description': 'The Canadian poet and songwriter reflects on his musical career.',
  79             },
  80             'params': {
  81                 # rtmp download
  82                 'skip_download': True,
  83             }
  84         },
  85         {
  86             'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
  87             'info_dict': {
  88                 'id': 'b00yng1d',
  89                 'ext': 'flv',
  90                 'title': 'The Man in Black: Series 3: The Printed Name',
  91                 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
  92                 'duration': 1800,
  93             },
  94             'params': {
  95                 # rtmp download
  96                 'skip_download': True,
  97             },
  98             'skip': 'Episode is no longer available on BBC iPlayer Radio',
  99         },
 100         {
 101             'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
 102             'info_dict': {
 103                 'id': 'b00yng1d',
 104                 'ext': 'flv',
 105                 'title': 'The Voice UK: Series 3: Blind Auditions 5',
 106                 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
 107                 'duration': 5100,
 108             },
 109             'params': {
 110                 # rtmp download
 111                 'skip_download': True,
 112             },
 113             'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
 114         },
 115         {
 116             'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
 117             'info_dict': {
 118                 'id': 'b03k3pb7',
 119                 'ext': 'flv',
 120                 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
 121                 'description': '2. Invasion',
 122                 'duration': 3600,
 123             },
 124             'params': {
 125                 # rtmp download
 126                 'skip_download': True,
 127             },
 128             'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
 129         }, {
 130             'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
 131             'info_dict': {
 132                 'id': 'b04v209v',
 133                 'ext': 'flv',
 134                 'title': 'Pete Tong, The Essential New Tune Special',
 135                 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
 136                 'duration': 10800,
 137             },
 138             'params': {
 139                 # rtmp download
 140                 'skip_download': True,
 141             },
 142             'skip': 'Episode is no longer available on BBC iPlayer Radio',
 143         }, {
 144             'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
 145             'note': 'Audio',
 146             'info_dict': {
 147                 'id': 'p022h44j',
 148                 'ext': 'flv',
 149                 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
 150                 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
 151                 'duration': 227,
 152             },
 153             'params': {
 154                 # rtmp download
 155                 'skip_download': True,
 156             }
 157         }, {
 158             'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
 159             'note': 'Video',
 160             'info_dict': {
 161                 'id': 'p025c103',
 162                 'ext': 'flv',
 163                 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
 164                 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
 165                 'duration': 226,
 166             },
 167             'params': {
 168                 # rtmp download
 169                 'skip_download': True,
 170             }
 171         }, {
 172             'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
 173             'info_dict': {
 174                 'id': 'p02n76xf',
 175                 'ext': 'flv',
 176                 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
 177                 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
 178                 'duration': 3540,
 179             },
 180             'params': {
 181                 # rtmp download
 182                 'skip_download': True,
 183             },
 184             'skip': 'geolocation',
 185         }, {
 186             'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
 187             'info_dict': {
 188                 'id': 'b05zmgw1',
 189                 'ext': 'flv',
 190                 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
 191                 'title': 'Royal Academy Summer Exhibition',
 192                 'duration': 3540,
 193             },
 194             'params': {
 195                 # rtmp download
 196                 'skip_download': True,
 197             },
 198             'skip': 'geolocation',
 199         }, {
 200             # iptv-all mediaset fails with geolocation however there is no geo restriction
 201             # for this programme at all
 202             'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
 203             'info_dict': {
 204                 'id': 'b06rkms3',
 205                 'ext': 'flv',
 206                 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
 207                 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
 208             },
 209             'params': {
 210                 # rtmp download
 211                 'skip_download': True,
 212             },
 213             'skip': 'Now it\'s really geo-restricted',
 214         }, {
 215             # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
 216             'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
 217             'info_dict': {
 218                 'id': 'p028bfkj',
 219                 'ext': 'flv',
 220                 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
 221                 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
 222             },
 223             'params': {
 224                 # rtmp download
 225                 'skip_download': True,
 226             },
 227         }, {
 228             'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
 229             'note': 'Audio',
 230             'info_dict': {
 231                 'id': 'm0007jz9',
 232                 'ext': 'mp4',
 233                 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
 234                 'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
 235                 'duration': 9840,
 236             },
 237             'params': {
 238                 # rtmp download
 239                 'skip_download': True,
 240             }
 241         }, {
 242             'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
 243             'only_matching': True,
 244         }, {
 245             'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
 246             'only_matching': True,
 247         }, {
 248             'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
 249             'only_matching': True,
 250         }, {
 251             'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
 252             'only_matching': True,
 253         }, {
 254             'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
 255             'only_matching': True,
 256         }, {
 257             'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
 258             'only_matching': True,
 259         }, {
 260             'url': 'https://www.bbc.co.uk/programmes/m00005xn',
 261             'only_matching': True,
 262         }, {
 263             'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
 264             'only_matching': True,
 265         }]
 266
 267     def _login(self):
 268         username, password = self._get_login_info()
 269         if username is None:
 270             return
 271
 272         login_page = self._download_webpage(
 273             self._LOGIN_URL, None, 'Downloading signin page')
 274
 275         login_form = self._hidden_inputs(login_page)
 276
 277         login_form.update({
 278             'username': username,
 279             'password': password,
 280         })
 281
 282         post_url = urljoin(self._LOGIN_URL, self._search_regex(
 283             r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
 284             'post url', default=self._LOGIN_URL, group='url'))
 285
 286         response, urlh = self._download_webpage_handle(
 287             post_url, None, 'Logging in', data=urlencode_postdata(login_form),
 288             headers={'Referer': self._LOGIN_URL})
 289
 290         if self._LOGIN_URL in urlh.geturl():
 291             error = clean_html(get_element_by_class('form-message', response))
 292             if error:
 293                 raise ExtractorError(
 294                     'Unable to login: %s' % error, expected=True)
 295             raise ExtractorError('Unable to log in')
 296
    def _real_initialize(self):
        """Perform the optional account login by delegating to ``_login()``."""
        self._login()
 299
 300     class MediaSelectionError(Exception):
 301         def __init__(self, id):
 302             self.id = id
 303
 304     def _extract_asx_playlist(self, connection, programme_id):
 305         asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
 306         return [ref.get('href') for ref in asx.findall('./Entry/ref')]
 307
 308     def _extract_items(self, playlist):
 309         return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
 310
 311     def _extract_medias(self, media_selection):
 312         error = media_selection.get('result')
 313         if error:
 314             raise BBCCoUkIE.MediaSelectionError(error)
 315         return media_selection.get('media') or []
 316
 317     def _extract_connections(self, media):
 318         return media.get('connection') or []
 319
 320     def _get_subtitles(self, media, programme_id):
 321         subtitles = {}
 322         for connection in self._extract_connections(media):
 323             cc_url = url_or_none(connection.get('href'))
 324             if not cc_url:
 325                 continue
 326             captions = self._download_xml(
 327                 cc_url, programme_id, 'Downloading captions', fatal=False)
 328             if not isinstance(captions, compat_etree_Element):
 329                 continue
 330             subtitles['en'] = [
 331                 {
 332                     'url': connection.get('href'),
 333                     'ext': 'ttml',
 334                 },
 335             ]
 336             break
 337         return subtitles
 338
 339     def _raise_extractor_error(self, media_selection_error):
 340         raise ExtractorError(
 341             '%s returned error: %s' % (self.IE_NAME, media_selection_error.id),
 342             expected=True)
 343
 344     def _download_media_selector(self, programme_id):
 345         last_exception = None
 346         for media_set in self._MEDIA_SETS:
 347             try:
 348                 return self._download_media_selector_url(
 349                     self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
 350             except BBCCoUkIE.MediaSelectionError as e:
 351                 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
 352                     last_exception = e
 353                     continue
 354                 self._raise_extractor_error(e)
 355         self._raise_extractor_error(last_exception)
 356
 357     def _download_media_selector_url(self, url, programme_id=None):
 358         media_selection = self._download_json(
 359             url, programme_id, 'Downloading media selection JSON',
 360             expected_status=(403, 404))
 361         return self._process_media_selector(media_selection, programme_id)
 362
 363     def _process_media_selector(self, media_selection, programme_id):
 364         formats = []
 365         subtitles = None
 366         urls = []
 367
 368         for media in self._extract_medias(media_selection):
 369             kind = media.get('kind')
 370             if kind in ('video', 'audio'):
 371                 bitrate = int_or_none(media.get('bitrate'))
 372                 encoding = media.get('encoding')
 373                 width = int_or_none(media.get('width'))
 374                 height = int_or_none(media.get('height'))
 375                 file_size = int_or_none(media.get('media_file_size'))
 376                 for connection in self._extract_connections(media):
 377                     href = connection.get('href')
 378                     if href in urls:
 379                         continue
 380                     if href:
 381                         urls.append(href)
 382                     conn_kind = connection.get('kind')
 383                     protocol = connection.get('protocol')
 384                     supplier = connection.get('supplier')
 385                     transfer_format = connection.get('transferFormat')
 386                     format_id = supplier or conn_kind or protocol
 387                     # ASX playlist
 388                     if supplier == 'asx':
 389                         for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
 390                             formats.append({
 391                                 'url': ref,
 392                                 'format_id': 'ref%s_%s' % (i, format_id),
 393                             })
 394                     elif transfer_format == 'dash':
 395                         formats.extend(self._extract_mpd_formats(
 396                             href, programme_id, mpd_id=format_id, fatal=False))
 397                     elif transfer_format == 'hls':
 398                         # TODO: let expected_status be passed into _extract_xxx_formats() instead
 399                         try:
 400                             fmts = self._extract_m3u8_formats(
 401                                 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
 402                                 m3u8_id=format_id, fatal=False)
 403                         except ExtractorError as e:
 404                             if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError)
 405                                     and e.exc_info[1].code in (403, 404)):
 406                                 raise
 407                             fmts = []
 408                         formats.extend(fmts)
 409                     elif transfer_format == 'hds':
 410                         formats.extend(self._extract_f4m_formats(
 411                             href, programme_id, f4m_id=format_id, fatal=False))
 412                     else:
 413                         if not supplier and bitrate:
 414                             format_id += '-%d' % bitrate
 415                         fmt = {
 416                             'format_id': format_id,
 417                             'filesize': file_size,
 418                         }
 419                         if kind == 'video':
 420                             fmt.update({
 421                                 'width': width,
 422                                 'height': height,
 423                                 'tbr': bitrate,
 424                                 'vcodec': encoding,
 425                             })
 426                         else:
 427                             fmt.update({
 428                                 'abr': bitrate,
 429                                 'acodec': encoding,
 430                                 'vcodec': 'none',
 431                             })
 432                         if protocol in ('http', 'https'):
 433                             # Direct link
 434                             fmt.update({
 435                                 'url': href,
 436                             })
 437                         elif protocol == 'rtmp':
 438                             application = connection.get('application', 'ondemand')
 439                             auth_string = connection.get('authString')
 440                             identifier = connection.get('identifier')
 441                             server = connection.get('server')
 442                             fmt.update({
 443                                 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
 444                                 'play_path': identifier,
 445                                 'app': '%s?%s' % (application, auth_string),
 446                                 'page_url': 'http://www.bbc.co.uk',
 447                                 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
 448                                 'rtmp_live': False,
 449                                 'ext': 'flv',
 450                             })
 451                         else:
 452                             continue
 453                         formats.append(fmt)
 454             elif kind == 'captions':
 455                 subtitles = self.extract_subtitles(media, programme_id)
 456         return formats, subtitles
 457
 458     def _download_playlist(self, playlist_id):
 459         try:
 460             playlist = self._download_json(
 461                 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
 462                 playlist_id, 'Downloading playlist JSON')
 463             formats = []
 464             subtitles = {}
 465
 466             for version in playlist.get('allAvailableVersions', []):
 467                 smp_config = version['smpConfig']
 468                 title = smp_config['title']
 469                 description = smp_config['summary']
 470                 for item in smp_config['items']:
 471                     kind = item['kind']
 472                     if kind not in ('programme', 'radioProgramme'):
 473                         continue
 474                     programme_id = item.get('vpid')
 475                     duration = int_or_none(item.get('duration'))
 476                     version_formats, version_subtitles = self._download_media_selector(programme_id)
 477                     types = version['types']
 478                     for f in version_formats:
 479                         f['format_note'] = ', '.join(types)
 480                         if any('AudioDescribed' in x for x in types):
 481                             f['language_preference'] = -10
 482                     formats += version_formats
 483                     for tag, subformats in (version_subtitles or {}).items():
 484                         subtitles.setdefault(tag, []).extend(subformats)
 485
 486             return programme_id, title, description, duration, formats, subtitles
 487         except ExtractorError as ee:
 488             if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
 489                 raise
 490
 491         # fallback to legacy playlist
 492         return self._process_legacy_playlist(playlist_id)
 493
 494     def _process_legacy_playlist_url(self, url, display_id):
 495         playlist = self._download_legacy_playlist_url(url, display_id)
 496         return self._extract_from_legacy_playlist(playlist, display_id)
 497
 498     def _process_legacy_playlist(self, playlist_id):
 499         return self._process_legacy_playlist_url(
 500             'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
 501
 502     def _download_legacy_playlist_url(self, url, playlist_id=None):
 503         return self._download_xml(
 504             url, playlist_id, 'Downloading legacy playlist XML')
 505
 506     def _extract_from_legacy_playlist(self, playlist, playlist_id):
 507         no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
 508         if no_items is not None:
 509             reason = no_items.get('reason')
 510             if reason == 'preAvailability':
 511                 msg = 'Episode %s is not yet available' % playlist_id
 512             elif reason == 'postAvailability':
 513                 msg = 'Episode %s is no longer available' % playlist_id
 514             elif reason == 'noMedia':
 515                 msg = 'Episode %s is not currently available' % playlist_id
 516             else:
 517                 msg = 'Episode %s is not available: %s' % (playlist_id, reason)
 518             raise ExtractorError(msg, expected=True)
 519
 520         for item in self._extract_items(playlist):
 521             kind = item.get('kind')
 522             if kind not in ('programme', 'radioProgramme'):
 523                 continue
 524             title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
 525             description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
 526             description = description_el.text if description_el is not None else None
 527
 528             def get_programme_id(item):
 529                 def get_from_attributes(item):
 530                     for p in ('identifier', 'group'):
 531                         value = item.get(p)
 532                         if value and re.match(r'^[pb][\da-z]{7}$', value):
 533                             return value
 534                 get_from_attributes(item)
 535                 mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
 536                 if mediator is not None:
 537                     return get_from_attributes(mediator)
 538
 539             programme_id = get_programme_id(item)
 540             duration = int_or_none(item.get('duration'))
 541
 542             if programme_id:
 543                 formats, subtitles = self._download_media_selector(programme_id)
 544             else:
 545                 formats, subtitles = self._process_media_selector(item, playlist_id)
 546                 programme_id = playlist_id
 547
 548         return programme_id, title, description, duration, formats, subtitles
 549
 550     def _real_extract(self, url):
 551         group_id = self._match_id(url)
 552
 553         webpage = self._download_webpage(url, group_id, 'Downloading video page')
 554
 555         error = self._search_regex(
 556             r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
 557             webpage, 'error', default=None)
 558         if error:
 559             raise ExtractorError(error, expected=True)
 560
 561         programme_id = None
 562         duration = None
 563
 564         tviplayer = self._search_regex(
 565             r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
 566             webpage, 'player', default=None)
 567
 568         if tviplayer:
 569             player = self._parse_json(tviplayer, group_id).get('player', {})
 570             duration = int_or_none(player.get('duration'))
 571             programme_id = player.get('vpid')
 572
 573         if not programme_id:
 574             programme_id = self._search_regex(
 575                 r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
 576
 577         if programme_id:
 578             formats, subtitles = self._download_media_selector(programme_id)
 579             title = self._og_search_title(webpage, default=None) or self._html_search_regex(
 580                 (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
 581                  r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
 582             description = self._search_regex(
 583                 (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
 584                  r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
 585                 webpage, 'description', default=None)
 586             if not description:
 587                 description = self._html_search_meta('description', webpage)
 588         else:
 589             programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
 590
 591         self._sort_formats(formats)
 592
 593         return {
 594             'id': programme_id,
 595             'title': title,
 596             'description': description,
 597             'thumbnail': self._og_search_thumbnail(webpage, default=None),
 598             'duration': duration,
 599             'formats': formats,
 600             'subtitles': subtitles,
 601         }
 602
 603
 604 class BBCIE(BBCCoUkIE):
 605     IE_NAME = 'bbc'
 606     IE_DESC = 'BBC'
 607     _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
 608
 609     _MEDIA_SETS = [
 610         'pc',
 611         'mobile-tablet-main',
 612     ]
 613
 614     _TESTS = [{
 615         # article with multiple videos embedded with data-playable containing vpids
 616         'url': 'http://www.bbc.com/news/world-europe-32668511',
 617         'info_dict': {
 618             'id': 'world-europe-32668511',
 619             'title': 'Russia stages massive WW2 parade',
 620             'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
 621         },
 622         'playlist_count': 2,
 623     }, {
 624         # article with multiple videos embedded with data-playable (more videos)
 625         'url': 'http://www.bbc.com/news/business-28299555',
 626         'info_dict': {
 627             'id': 'business-28299555',
 628             'title': 'Farnborough Airshow: Video highlights',
 629             'description': 'BBC reports and video highlights at the Farnborough Airshow.',
 630         },
 631         'playlist_count': 9,
 632         'skip': 'Save time',
 633     }, {
 634         # article with multiple videos embedded with `new SMP()`
 635         # broken
 636         'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
 637         'info_dict': {
 638             'id': '3662a707-0af9-3149-963f-47bea720b460',
 639             'title': 'BUGGER',
 640         },
 641         'playlist_count': 18,
 642     }, {
 643         # single video embedded with data-playable containing vpid
 644         'url': 'http://www.bbc.com/news/world-europe-32041533',
 645         'info_dict': {
 646             'id': 'p02mprgb',
 647             'ext': 'mp4',
 648             'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
 649             'description': 'md5:2868290467291b37feda7863f7a83f54',
 650             'duration': 47,
 651             'timestamp': 1427219242,
 652             'upload_date': '20150324',
 653         },
 654         'params': {
 655             # rtmp download
 656             'skip_download': True,
 657         }
 658     }, {
 659         # article with single video embedded with data-playable containing XML playlist
 660         # with direct video links as progressiveDownloadUrl (for now these are extracted)
 661         # and playlist with f4m and m3u8 as streamingUrl
 662         'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
 663         'info_dict': {
 664             'id': '150615_telabyad_kentin_cogu',
 665             'ext': 'mp4',
 666             'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
 667             'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
 668             'timestamp': 1434397334,
 669             'upload_date': '20150615',
 670         },
 671         'params': {
 672             'skip_download': True,
 673         }
 674     }, {
 675         # single video embedded with data-playable containing XML playlists (regional section)
 676         'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
 677         'info_dict': {
 678             'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
 679             'ext': 'mp4',
 680             'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
 681             'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
 682             'timestamp': 1434713142,
 683             'upload_date': '20150619',
 684         },
 685         'params': {
 686             'skip_download': True,
 687         }
 688     }, {
 689         # single video from video playlist embedded with vxp-playlist-data JSON
 690         'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
 691         'info_dict': {
 692             'id': 'p02w6qjc',
 693             'ext': 'mp4',
 694             'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
 695             'duration': 56,
 696             'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
 697         },
 698         'params': {
 699             'skip_download': True,
 700         }
 701     }, {
 702         # single video story with digitalData
 703         'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
 704         'info_dict': {
 705             'id': 'p02q6gc4',
 706             'ext': 'flv',
 707             'title': 'Sri Lanka’s spicy secret',
 708             'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
 709             'timestamp': 1437674293,
 710             'upload_date': '20150723',
 711         },
 712         'params': {
 713             # rtmp download
 714             'skip_download': True,
 715         }
 716     }, {
 717         # single video story without digitalData
 718         'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
 719         'info_dict': {
 720             'id': 'p018zqqg',
 721             'ext': 'mp4',
 722             'title': 'Hyundai Santa Fe Sport: Rock star',
 723             'description': 'md5:b042a26142c4154a6e472933cf20793d',
 724             'timestamp': 1415867444,
 725             'upload_date': '20141113',
 726         },
 727         'params': {
 728             # rtmp download
 729             'skip_download': True,
 730         }
 731     }, {
 732         # single video embedded with Morph
 733         'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
 734         'info_dict': {
 735             'id': 'p041vhd0',
 736             'ext': 'mp4',
 737             'title': "Nigeria v Japan - Men's First Round",
 738             'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
 739             'duration': 7980,
 740             'uploader': 'BBC Sport',
 741             'uploader_id': 'bbc_sport',
 742         },
 743         'params': {
 744             # m3u8 download
 745             'skip_download': True,
 746         },
 747         'skip': 'Georestricted to UK',
 748     }, {
 749         # single video with playlist.sxml URL in playlist param
 750         'url': 'http://www.bbc.com/sport/0/football/33653409',
 751         'info_dict': {
 752             'id': 'p02xycnp',
 753             'ext': 'mp4',
 754             'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
 755             'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
 756             'duration': 140,
 757         },
 758         'params': {
 759             # rtmp download
 760             'skip_download': True,
 761         }
 762     }, {
 763         # article with multiple videos embedded with playlist.sxml in playlist param
 764         'url': 'http://www.bbc.com/sport/0/football/34475836',
 765         'info_dict': {
 766             'id': '34475836',
 767             'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
 768             'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
 769         },
 770         'playlist_count': 3,
 771     }, {
 772         # school report article with single video
 773         'url': 'http://www.bbc.co.uk/schoolreport/35744779',
 774         'info_dict': {
 775             'id': '35744779',
 776             'title': 'School which breaks down barriers in Jerusalem',
 777         },
 778         'playlist_count': 1,
 779     }, {
 780         # single video with playlist URL from weather section
 781         'url': 'http://www.bbc.com/weather/features/33601775',
 782         'only_matching': True,
 783     }, {
 784         # custom redirection to www.bbc.com
 785         # also, video with window.__INITIAL_DATA__
 786         'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
 787         'info_dict': {
 788             'id': 'p02xzws1',
 789             'ext': 'mp4',
 790             'title': "Pluto may have 'nitrogen glaciers'",
 791             'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
 792             'thumbnail': r're:https?://.+/.+\.jpg',
 793             'timestamp': 1437785037,
 794             'upload_date': '20150725',
 795         },
 796     }, {
 797         # video with window.__INITIAL_DATA__ and value as JSON string
 798         'url': 'https://www.bbc.com/news/av/world-europe-59468682',
 799         'info_dict': {
 800             'id': 'p0b71qth',
 801             'ext': 'mp4',
 802             'title': 'Why France is making this woman a national hero',
 803             'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
 804             'thumbnail': r're:https?://.+/.+\.jpg',
 805             'timestamp': 1638230731,
 806             'upload_date': '20211130',
 807         },
 808     }, {
 809         # single video article embedded with data-media-vpid
 810         'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
 811         'only_matching': True,
 812     }, {
 813         # bbcthreeConfig
 814         'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
 815         'info_dict': {
 816             'id': 'p06556y7',
 817             'ext': 'mp4',
 818             'title': 'Things Not To Say to people that live on council estates',
 819             'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
 820             'duration': 360,
 821             'thumbnail': r're:https?://.+/.+\.jpg',
 822         },
 823     }, {
 824         # window.__PRELOADED_STATE__
 825         'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
 826         'info_dict': {
 827             'id': 'b0b9z4vz',
 828             'ext': 'mp4',
 829             'title': 'Prom 6: An American in Paris and Turangalila',
 830             'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
 831             'uploader': 'Radio 3',
 832             'uploader_id': 'bbc_radio_three',
 833         },
 834     }, {
 835         'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
 836         'info_dict': {
 837             'id': 'p06w9tws',
 838             'ext': 'mp4',
 839             'title': 'md5:2fabf12a726603193a2879a055f72514',
 840             'description': 'Learn English words and phrases from this story',
 841         },
 842         'add_ie': [BBCCoUkIE.ie_key()],
 843     }, {
 844         # BBC Reel
 845         'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
 846         'info_dict': {
 847             'id': 'p07c6sb9',
 848             'ext': 'mp4',
 849             'title': 'How positive thinking is harming your happiness',
 850             'alt_title': 'The downsides of positive thinking',
 851             'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
 852             'duration': 235,
 853             'thumbnail': r're:https?://.+/p07c9dsr.jpg',
 854             'upload_date': '20190604',
 855             'categories': ['Psychology'],
 856         },
 857     }]
 858
 859     @classmethod
 860     def suitable(cls, url):
 861         EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
 862         return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
 863                 else super(BBCIE, cls).suitable(url))
 864
 865     def _extract_from_media_meta(self, media_meta, video_id):
 866         # Direct links to media in media metadata (e.g.
 867         # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
 868         # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
 869         source_files = media_meta.get('sourceFiles')
 870         if source_files:
 871             return [{
 872                 'url': f['url'],
 873                 'format_id': format_id,
 874                 'ext': f.get('encoding'),
 875                 'tbr': float_or_none(f.get('bitrate'), 1000),
 876                 'filesize': int_or_none(f.get('filesize')),
 877             } for format_id, f in source_files.items() if f.get('url')], []
 878
 879         programme_id = media_meta.get('externalId')
 880         if programme_id:
 881             return self._download_media_selector(programme_id)
 882
 883         # Process playlist.sxml as legacy playlist
 884         href = media_meta.get('href')
 885         if href:
 886             playlist = self._download_legacy_playlist_url(href)
 887             _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
 888             return formats, subtitles
 889
 890         return [], []
 891
 892     def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
 893         programme_id, title, description, duration, formats, subtitles = \
 894             self._process_legacy_playlist_url(url, playlist_id)
 895         self._sort_formats(formats)
 896         return {
 897             'id': programme_id,
 898             'title': title,
 899             'description': description,
 900             'duration': duration,
 901             'timestamp': timestamp,
 902             'formats': formats,
 903             'subtitles': subtitles,
 904         }
 905
 906     def _real_extract(self, url):
 907         playlist_id = self._match_id(url)
 908
 909         webpage = self._download_webpage(url, playlist_id)
 910
 911         json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
 912         timestamp = json_ld_info.get('timestamp')
 913
 914         playlist_title = json_ld_info.get('title')
 915         if not playlist_title:
 916             playlist_title = self._og_search_title(
 917                 webpage, default=None) or self._html_search_regex(
 918                 r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
 919             if playlist_title:
 920                 playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
 921
 922         playlist_description = json_ld_info.get(
 923             'description') or self._og_search_description(webpage, default=None)
 924
 925         if not timestamp:
 926             timestamp = parse_iso8601(self._search_regex(
 927                 [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
 928                  r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
 929                  r'"datePublished":\s*"([^"]+)'],
 930                 webpage, 'date', default=None))
 931
 932         entries = []
 933
 934         # article with multiple videos embedded with playlist.sxml (e.g.
 935         # http://www.bbc.com/sport/0/football/34475836)
 936         playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
 937         playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
 938         if playlists:
 939             entries = [
 940                 self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
 941                 for playlist_url in playlists]
 942
 943         # news article with multiple videos embedded with data-playable
 944         data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
 945         if data_playables:
 946             for _, data_playable_json in data_playables:
 947                 data_playable = self._parse_json(
 948                     unescapeHTML(data_playable_json), playlist_id, fatal=False)
 949                 if not data_playable:
 950                     continue
 951                 settings = data_playable.get('settings', {})
 952                 if settings:
 953                     # data-playable with video vpid in settings.playlistObject.items (e.g.
 954                     # http://www.bbc.com/news/world-us-canada-34473351)
 955                     playlist_object = settings.get('playlistObject', {})
 956                     if playlist_object:
 957                         items = playlist_object.get('items')
 958                         if items and isinstance(items, list):
 959                             title = playlist_object['title']
 960                             description = playlist_object.get('summary')
 961                             duration = int_or_none(items[0].get('duration'))
 962                             programme_id = items[0].get('vpid')
 963                             formats, subtitles = self._download_media_selector(programme_id)
 964                             self._sort_formats(formats)
 965                             entries.append({
 966                                 'id': programme_id,
 967                                 'title': title,
 968                                 'description': description,
 969                                 'timestamp': timestamp,
 970                                 'duration': duration,
 971                                 'formats': formats,
 972                                 'subtitles': subtitles,
 973                             })
 974                     else:
 975                         # data-playable without vpid but with a playlist.sxml URLs
 976                         # in otherSettings.playlist (e.g.
 977                         # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
 978                         playlist = data_playable.get('otherSettings', {}).get('playlist', {})
 979                         if playlist:
 980                             entry = None
 981                             for key in ('streaming', 'progressiveDownload'):
 982                                 playlist_url = playlist.get('%sUrl' % key)
 983                                 if not playlist_url:
 984                                     continue
 985                                 try:
 986                                     info = self._extract_from_playlist_sxml(
 987                                         playlist_url, playlist_id, timestamp)
 988                                     if not entry:
 989                                         entry = info
 990                                     else:
 991                                         entry['title'] = info['title']
 992                                         entry['formats'].extend(info['formats'])
 993                                 except ExtractorError as e:
 994                                     # Some playlist URL may fail with 500, at the same time
 995                                     # the other one may work fine (e.g.
 996                                     # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
 997                                     if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
 998                                         continue
 999                                     raise
1000                             if entry:
1001                                 self._sort_formats(entry['formats'])
1002                                 entries.append(entry)
1003
1004         if entries:
1005             return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
1006
1007         # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
1008         group_id = self._search_regex(
1009             r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
1010             webpage, 'group id', default=None)
1011         if group_id:
1012             return self.url_result(
1013                 'https://www.bbc.co.uk/programmes/%s' % group_id,
1014                 ie=BBCCoUkIE.ie_key())
1015
1016         # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
1017         programme_id = self._search_regex(
1018             [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
1019              r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
1020              r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
1021             webpage, 'vpid', default=None)
1022
1023         if programme_id:
1024             formats, subtitles = self._download_media_selector(programme_id)
1025             self._sort_formats(formats)
1026             # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
1027             digital_data = self._parse_json(
1028                 self._search_regex(
1029                     r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
1030                 programme_id, fatal=False)
1031             page_info = digital_data.get('page', {}).get('pageInfo', {})
1032             title = page_info.get('pageName') or self._og_search_title(webpage)
1033             description = page_info.get('description') or self._og_search_description(webpage)
1034             timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
1035             return {
1036                 'id': programme_id,
1037                 'title': title,
1038                 'description': description,
1039                 'timestamp': timestamp,
1040                 'formats': formats,
1041                 'subtitles': subtitles,
1042             }
1043
1044         # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
1045         initial_data = self._parse_json(self._html_search_regex(
1046             r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
1047             webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
1048         if initial_data:
1049             init_data = try_get(
1050                 initial_data, lambda x: x['initData']['items'][0], dict) or {}
1051             smp_data = init_data.get('smpData') or {}
1052             clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
1053             version_id = clip_data.get('versionID')
1054             if version_id:
1055                 title = smp_data['title']
1056                 formats, subtitles = self._download_media_selector(version_id)
1057                 self._sort_formats(formats)
1058                 image_url = smp_data.get('holdingImageURL')
1059                 display_date = init_data.get('displayDate')
1060                 topic_title = init_data.get('topicTitle')
1061
1062                 return {
1063                     'id': version_id,
1064                     'title': title,
1065                     'formats': formats,
1066                     'alt_title': init_data.get('shortTitle'),
1067                     'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
1068                     'description': smp_data.get('summary') or init_data.get('shortSummary'),
1069                     'upload_date': display_date.replace('-', '') if display_date else None,
1070                     'subtitles': subtitles,
1071                     'duration': int_or_none(clip_data.get('duration')),
1072                     'categories': [topic_title] if topic_title else None,
1073                 }
1074
1075         # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
1076         # There are several setPayload calls may be present but the video
1077         # seems to be always related to the first one
1078         morph_payload = self._parse_json(
1079             self._search_regex(
1080                 r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
1081                 webpage, 'morph payload', default='{}'),
1082             playlist_id, fatal=False)
1083         if morph_payload:
1084             components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
1085             for component in components:
1086                 if not isinstance(component, dict):
1087                     continue
1088                 lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
1089                 if not lead_media:
1090                     continue
1091                 identifiers = lead_media.get('identifiers')
1092                 if not identifiers or not isinstance(identifiers, dict):
1093                     continue
1094                 programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
1095                 if not programme_id:
1096                     continue
1097                 title = lead_media.get('title') or self._og_search_title(webpage)
1098                 formats, subtitles = self._download_media_selector(programme_id)
1099                 self._sort_formats(formats)
1100                 description = lead_media.get('summary')
1101                 uploader = lead_media.get('masterBrand')
1102                 uploader_id = lead_media.get('mid')
1103                 duration = None
1104                 duration_d = lead_media.get('duration')
1105                 if isinstance(duration_d, dict):
1106                     duration = parse_duration(dict_get(
1107                         duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
1108                 return {
1109                     'id': programme_id,
1110                     'title': title,
1111                     'description': description,
1112                     'duration': duration,
1113                     'uploader': uploader,
1114                     'uploader_id': uploader_id,
1115                     'formats': formats,
1116                     'subtitles': subtitles,
1117                 }
1118
1119         preload_state = self._parse_json(self._search_regex(
1120             r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
1121             'preload state', default='{}'), playlist_id, fatal=False)
1122         if preload_state:
1123             current_programme = preload_state.get('programmes', {}).get('current') or {}
1124             programme_id = current_programme.get('id')
1125             if current_programme and programme_id and current_programme.get('type') == 'playable_item':
1126                 title = current_programme.get('titles', {}).get('tertiary') or playlist_title
1127                 formats, subtitles = self._download_media_selector(programme_id)
1128                 self._sort_formats(formats)
1129                 synopses = current_programme.get('synopses') or {}
1130                 network = current_programme.get('network') or {}
1131                 duration = int_or_none(
1132                     current_programme.get('duration', {}).get('value'))
1133                 thumbnail = None
1134                 image_url = current_programme.get('image_url')
1135                 if image_url:
1136                     thumbnail = image_url.replace('{recipe}', 'raw')
1137                 return {
1138                     'id': programme_id,
1139                     'title': title,
1140                     'description': dict_get(synopses, ('long', 'medium', 'short')),
1141                     'thumbnail': thumbnail,
1142                     'duration': duration,
1143                     'uploader': network.get('short_title'),
1144                     'uploader_id': network.get('id'),
1145                     'formats': formats,
1146                     'subtitles': subtitles,
1147                 }
1148
1149         bbc3_config = self._parse_json(
1150             self._search_regex(
1151                 r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
1152                 'bbcthree config', default='{}'),
1153             playlist_id, transform_source=js_to_json, fatal=False) or {}
1154         payload = bbc3_config.get('payload') or {}
1155         if payload:
1156             clip = payload.get('currentClip') or {}
1157             clip_vpid = clip.get('vpid')
1158             clip_title = clip.get('title')
1159             if clip_vpid and clip_title:
1160                 formats, subtitles = self._download_media_selector(clip_vpid)
1161                 self._sort_formats(formats)
1162                 return {
1163                     'id': clip_vpid,
1164                     'title': clip_title,
1165                     'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
1166                     'description': clip.get('description'),
1167                     'duration': parse_duration(clip.get('duration')),
1168                     'formats': formats,
1169                     'subtitles': subtitles,
1170                 }
1171             bbc3_playlist = try_get(
1172                 payload, lambda x: x['content']['bbcMedia']['playlist'],
1173                 dict)
1174             if bbc3_playlist:
1175                 playlist_title = bbc3_playlist.get('title') or playlist_title
1176                 thumbnail = bbc3_playlist.get('holdingImageURL')
1177                 entries = []
1178                 for bbc3_item in bbc3_playlist['items']:
1179                     programme_id = bbc3_item.get('versionID')
1180                     if not programme_id:
1181                         continue
1182                     formats, subtitles = self._download_media_selector(programme_id)
1183                     self._sort_formats(formats)
1184                     entries.append({
1185                         'id': programme_id,
1186                         'title': playlist_title,
1187                         'thumbnail': thumbnail,
1188                         'timestamp': timestamp,
1189                         'formats': formats,
1190                         'subtitles': subtitles,
1191                     })
1192                 return self.playlist_result(
1193                     entries, playlist_id, playlist_title, playlist_description)
1194
1195         initial_data = self._search_regex(
1196             r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
1197             'quoted preload state', default=None)
1198         if initial_data is None:
1199             initial_data = self._search_regex(
1200                 r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
1201                 'preload state', default={})
1202         else:
1203             initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
1204         initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
1205         if initial_data:
1206             def parse_media(media):
1207                 if not media:
1208                     return
1209                 for item in (try_get(media, lambda x: x['media']['items'], list) or []):
1210                     item_id = item.get('id')
1211                     item_title = item.get('title')
1212                     if not (item_id and item_title):
1213                         continue
1214                     formats, subtitles = self._download_media_selector(item_id)
1215                     self._sort_formats(formats)
1216                     item_desc = None
1217                     blocks = try_get(media, lambda x: x['summary']['blocks'], list)
1218                     if blocks:
1219                         summary = []
1220                         for block in blocks:
1221                             text = try_get(block, lambda x: x['model']['text'], compat_str)
1222                             if text:
1223                                 summary.append(text)
1224                         if summary:
1225                             item_desc = '\n\n'.join(summary)
1226                     item_time = None
1227                     for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
1228                         if try_get(meta, lambda x: x['label']) == 'Published':
1229                             item_time = unified_timestamp(meta.get('timestamp'))
1230                             break
1231                     entries.append({
1232                         'id': item_id,
1233                         'title': item_title,
1234                         'thumbnail': item.get('holdingImageUrl'),
1235                         'formats': formats,
1236                         'subtitles': subtitles,
1237                         'timestamp': item_time,
1238                         'description': strip_or_none(item_desc),
1239                     })
1240             for resp in (initial_data.get('data') or {}).values():
1241                 name = resp.get('name')
1242                 if name == 'media-experience':
1243                     parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
1244                 elif name == 'article':
1245                     for block in (try_get(resp,
1246                                           (lambda x: x['data']['blocks'],
1247                                            lambda x: x['data']['content']['model']['blocks'],),
1248                                           list) or []):
1249                         if block.get('type') != 'media':
1250                             continue
1251                         parse_media(block.get('model'))
1252             return self.playlist_result(
1253                 entries, playlist_id, playlist_title, playlist_description)
1254
1255         def extract_all(pattern):
1256             return list(filter(None, map(
1257                 lambda s: self._parse_json(s, playlist_id, fatal=False),
1258                 re.findall(pattern, webpage))))
1259
1260         # Multiple video article (e.g.
1261         # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
1262         EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
1263         entries = []
1264         for match in extract_all(r'new\s+SMP\(({.+?})\)'):
1265             embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
1266             if embed_url and re.match(EMBED_URL, embed_url):
1267                 entries.append(embed_url)
1268         entries.extend(re.findall(
1269             r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
1270         if entries:
1271             return self.playlist_result(
1272                 [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
1273                 playlist_id, playlist_title, playlist_description)
1274
1275         # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
1276         medias = extract_all(r"data-media-meta='({[^']+})'")
1277
1278         if not medias:
1279             # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
1280             media_asset = self._search_regex(
1281                 r'mediaAssetPage\.init\(\s*({.+?}), "/',
1282                 webpage, 'media asset', default=None)
1283             if media_asset:
1284                 media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
1285                 medias = []
1286                 for video in media_asset_page.get('videos', {}).values():
1287                     medias.extend(video.values())
1288
1289         if not medias:
1290             # Multiple video playlist with single `now playing` entry (e.g.
1291             # http://www.bbc.com/news/video_and_audio/must_see/33767813)
1292             vxp_playlist = self._parse_json(
1293                 self._search_regex(
1294                     r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
1295                     webpage, 'playlist data'),
1296                 playlist_id)
1297             playlist_medias = []
1298             for item in vxp_playlist:
1299                 media = item.get('media')
1300                 if not media:
1301                     continue
1302                 playlist_medias.append(media)
1303                 # Download single video if found media with asset id matching the video id from URL
1304                 if item.get('advert', {}).get('assetId') == playlist_id:
1305                     medias = [media]
1306                     break
1307             # Fallback to the whole playlist
1308             if not medias:
1309                 medias = playlist_medias
1310
1311         entries = []
1312         for num, media_meta in enumerate(medias, start=1):
1313             formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
1314             if not formats and not self.get_param('ignore_no_formats'):
1315                 continue
1316             self._sort_formats(formats)
1317
1318             video_id = media_meta.get('externalId')
1319             if not video_id:
1320                 video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
1321
1322             title = media_meta.get('caption')
1323             if not title:
1324                 title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
1325
1326             duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
1327
1328             images = []
1329             for image in media_meta.get('images', {}).values():
1330                 images.extend(image.values())
1331             if 'image' in media_meta:
1332                 images.append(media_meta['image'])
1333
1334             thumbnails = [{
1335                 'url': image.get('href'),
1336                 'width': int_or_none(image.get('width')),
1337                 'height': int_or_none(image.get('height')),
1338             } for image in images]
1339
1340             entries.append({
1341                 'id': video_id,
1342                 'title': title,
1343                 'thumbnails': thumbnails,
1344                 'duration': duration,
1345                 'timestamp': timestamp,
1346                 'formats': formats,
1347                 'subtitles': subtitles,
1348             })
1349
1350         return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
1351
1352
class BBCCoUkArticleIE(InfoExtractor):
    """Playlist extractor for bbc.co.uk/programmes/articles/<id> pages.

    Every ``typeof="Clip"`` element found on the article page becomes one
    url-type playlist entry pointing at the clip's ``resource`` URL.
    """

    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Download the article page and return its clips as a playlist.

        The playlist id is the article id matched from ``url``; title and
        description come from the page's Open Graph metadata.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # og:description is optional metadata and the lookup can come back as
        # None; strip_or_none avoids calling .strip() on a missing value.
        description = strip_or_none(self._og_search_description(webpage))

        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
1381
1382
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared logic for bbc.co.uk listings that are scraped from paginated HTML.

    Subclasses are expected to provide:
      * ``_VIDEO_ID_TEMPLATE`` - regex template, filled with ``BBCCoUkIE._ID_REGEX``
      * ``_URL_TEMPLATE`` - URL template, filled with each video id found
      * ``_extract_title_and_description(webpage)`` - returns ``(title, description)``
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield one url result per video id, following "next" pagination links.

        ``webpage`` is the already-downloaded first page. If ``url`` carries a
        ``page`` query parameter, only that page is processed.
        """
        single_page = 'page' in parse_qs(url)
        # The first page was downloaded by the caller, so numbering of the
        # pages fetched here starts at 2.
        for page_num in itertools.count(2):
            for video_id in re.findall(
                    self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # Pass the notes by keyword: the fourth positional parameter of
            # _download_webpage is the error note, so a bare page number
            # there would end up as the (meaningless) error message.
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                note='Downloading page %d' % page_num,
                errnote='Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Download the first listing page and return a lazily built playlist."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
1413
1414
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
    """Common machinery for API-backed iPlayer playlists.

    Subclasses are expected to define ``_PAGE_SIZE``, ``_DESCRIPTION_KEY`` and
    the hooks ``_call_api``, ``_get_elements``, ``_get_episode``,
    ``_get_episode_image``, ``_get_episode_field``, ``_get_playlist_data``
    and ``_get_playlist_title``.
    """

    # The doubled ``%%s`` survives this formatting step as ``%s`` so that a
    # subclass can fill in the URL path segment afterwards.
    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    @staticmethod
    def _get_default(episode, key, default_key='default'):
        """Look up ``episode[key][default_key]`` through ``try_get``."""
        def nested_value(obj):
            return obj[key][default_key]

        return try_get(episode, nested_value)

    def _get_description(self, data):
        """Pick the 'large', 'medium' or 'small' synopsis, tried in that order."""
        synopsis = data.get(self._DESCRIPTION_KEY)
        if not synopsis:
            synopsis = {}
        return dict_get(synopsis, ('large', 'medium', 'small'))

    def _fetch_page(self, programme_id, per_page, series_id, page):
        """Yield url-type entries for the episodes of one API page.

        ``page`` is zero-based here; the API is called with ``page + 1``.
        Elements whose episode has no ``id`` are skipped.
        """
        api_response = self._call_api(
            programme_id, per_page, page + 1, series_id)
        for element in self._get_elements(api_response):
            episode = self._get_episode(element)
            episode_id = episode.get('id')
            if episode_id:
                image = self._get_episode_image(episode)
                # The image URL carries a '{recipe}' placeholder, replaced
                # here with 'raw'.
                thumbnail = image.replace('{recipe}', 'raw') if image else None
                category = self._get_default(episode, 'labels', 'category')
                yield {
                    '_type': 'url',
                    'id': episode_id,
                    'title': self._get_episode_field(episode, 'subtitle'),
                    'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                    'thumbnail': thumbnail,
                    'description': self._get_description(episode),
                    'categories': [category] if category else None,
                    'series': self._get_episode_field(episode, 'title'),
                    'ie_key': BBCCoUkIE.ie_key(),
                }

    def _real_extract(self, url):
        """Build the playlist for a programme or group id.

        With an explicit ``page`` query parameter only that page is fetched;
        otherwise the pages are loaded on demand.
        """
        pid = self._match_id(url)
        query = parse_qs(url)
        series_id = query.get('seriesId', [None])[0]
        page = query.get('page', [None])[0]
        if page:
            # NOTE(review): 36 is presumably the website's own page size - confirm.
            fetch_page = functools.partial(self._fetch_page, pid, 36, series_id)
            entries = fetch_page(int(page) - 1)
        else:
            fetch_page = functools.partial(
                self._fetch_page, pid, self._PAGE_SIZE, series_id)
            entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)
        # A one-item request is enough to obtain the playlist-level metadata.
        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
        playlist_title = self._get_playlist_title(playlist_data)
        playlist_description = self._get_description(playlist_data)
        return self.playlist_result(
            entries, pid, playlist_title, playlist_description)
1463
1464
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
    """Playlist extractor for bbc.co.uk/iplayer/episodes/<id> pages."""

    IE_NAME = 'bbc.co.uk:iplayer:episodes'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
        },
        'playlist_mincount': 8,
    }, {
        # all seasons
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 10,
    }, {
        # explicit season
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 5,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 37,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 1,
    }]
    _PAGE_SIZE = 100
    _DESCRIPTION_KEY = 'synopsis'

    def _get_episode_image(self, episode):
        """Return the episode's default image, i.e. ``episode['image']['default']``."""
        return self._get_default(episode, key='image')

    def _get_episode_field(self, episode, field):
        """Return the ``'default'`` variant of ``episode[field]``."""
        return self._get_default(episode, key=field)

    @staticmethod
    def _get_elements(data):
        """Return the result list stored under ``data['entities']['results']``."""
        entities = data['entities']
        return entities['results']

    @staticmethod
    def _get_episode(element):
        """Return the element's ``'episode'`` mapping, or an empty dict if falsy."""
        episode = element.get('episode')
        return episode if episode else {}

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """POST the query to the iPlayer graph API and return ``data.programme``.

        ``series_id``, when given, is sent as the ``sliceId`` variable.
        """
        variables = {'id': pid, 'page': page, 'perPage': per_page}
        if series_id:
            variables['sliceId'] = series_id
        # NOTE(review): the hard-coded id presumably names a stored query on
        # the server side - confirm.
        payload = json.dumps({
            'id': '5692d93d5aac8d796a0305e895e61551',
            'variables': variables,
        }).encode('utf-8')
        response = self._download_json(
            'https://graph.ibl.api.bbc.co.uk/', pid,
            headers={'Content-Type': 'application/json'},
            data=payload)
        return response['data']['programme']

    @staticmethod
    def _get_playlist_data(data):
        """Return ``data`` unchanged; the programme object is used as playlist data."""
        return data

    def _get_playlist_title(self, data):
        """Return the default title variant, i.e. ``data['title']['default']``."""
        return self._get_default(data, key='title')
1552
1553
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
    """Playlist extractor for bbc.co.uk/iplayer/group/<id> pages."""

    IE_NAME = 'bbc.co.uk:iplayer:group'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
    _TESTS = [{
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 47,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 11,
    }]
    _PAGE_SIZE = 200
    _DESCRIPTION_KEY = 'synopses'

    def _get_episode_image(self, episode):
        """Return the standard image, i.e. ``episode['images']['standard']``."""
        return self._get_default(episode, key='images', default_key='standard')

    def _get_episode_field(self, episode, field):
        """Return ``episode[field]`` directly; None when the key is missing."""
        return episode.get(field)

    @staticmethod
    def _get_elements(data):
        """Return the list stored under ``data['elements']``."""
        return data['elements']

    @staticmethod
    def _get_episode(element):
        """Return ``element`` itself; group elements are used as episodes as-is."""
        return element

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Fetch one page of a group's episodes and return ``group_episodes``.

        ``series_id`` is accepted for signature parity with the sibling
        extractor but is not used by this request.
        """
        api_url = 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid
        paging = {
            'page': page,
            'per_page': per_page,
        }
        response = self._download_json(api_url, pid, query=paging)
        return response['group_episodes']

    @staticmethod
    def _get_playlist_data(data):
        """Return the ``'group'`` object of the API response."""
        return data['group']

    def _get_playlist_title(self, data):
        """Return the group's plain ``'title'`` value, or None if absent."""
        return data.get('title')
1616
1617
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for the episodes/broadcasts/clips listings of a programme."""

    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` read from the page's Open Graph tags.

        The title lookup is explicitly non-fatal.
        """
        return (
            self._og_search_title(webpage, fatal=False),
            self._og_search_description(webpage),
        )