yt_dlp/extractor/bbc.py

   1 import functools
   2 import itertools
   3 import json
   4 import re
   5 import urllib.error
   6 import xml.etree.ElementTree
   7
   8 from .common import InfoExtractor
   9 from ..compat import compat_HTTPError, compat_str, compat_urlparse
  10 from ..utils import (
  11     ExtractorError,
  12     OnDemandPagedList,
  13     clean_html,
  14     dict_get,
  15     float_or_none,
  16     get_element_by_class,
  17     int_or_none,
  18     js_to_json,
  19     parse_duration,
  20     parse_iso8601,
  21     parse_qs,
  22     strip_or_none,
  23     try_get,
  24     unescapeHTML,
  25     unified_timestamp,
  26     url_or_none,
  27     urlencode_postdata,
  28     urljoin,
  29 )
  30
  31
  32 class BBCCoUkIE(InfoExtractor):
    # Name and human-readable description of this extractor.
    IE_NAME = 'bbc.co.uk'
    IE_DESC = 'BBC iPlayer'
    # Programme identifier: one of p/b/m/l followed by 7 digits or lowercase
    # letters, or "w" followed by 7 to 14 of them.
    _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
    # bbc.co.uk URLs handled here: /programmes/ (but not /programmes/articles/),
    # /iplayer/[section/]episode|playlist/, /music/clips, /music/audiovideo/popular,
    # /radio/player/, /sounds/play/ and /events/<x>/play/<y>/, each followed by an
    # identifier. The trailing negative lookahead rejects identifiers that are
    # followed by /episodes, /broadcasts or /clips.
    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?bbc\.co\.uk/
                        (?:
                            programmes/(?!articles/)|
                            iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
                            music/(?:clips|audiovideo/popular)[/#]|
                            radio/player/|
                            sounds/play/|
                            events/[^/]+/play/[^/]+/
                        )
                        (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
                    ''' % _ID_REGEX
    # Matches a `setPlaylist("<iPlayer URL>")` call and captures the URL in the
    # `url` group.
    _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']

    # Sign-in page fetched (and used as Referer) by _perform_login().
    _LOGIN_URL = 'https://account.bbc.com/signin'
    # NOTE(review): presumably the .netrc machine name used for credential
    # lookup by the base class - not visible in this file, confirm.
    _NETRC_MACHINE = 'bbc'

    # Media selector endpoint; filled with (media set, vpid) by
    # _download_media_selector().
    _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
    # Media sets queried in this order by _download_media_selector().
    _MEDIA_SETS = [
        # Provides HQ HLS streams with even better quality than the pc mediaset but
        # fails with geolocation in some cases when it's not even geo restricted at
        # all (e.g. http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with
        # selectionunavailable.
        'iptv-all',
        'pc',
    ]

    # XML namespace of the elements looked up in legacy (EMP) playlists.
    _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
  64
  65     _TESTS = [
  66         {
  67             'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
  68             'info_dict': {
  69                 'id': 'b039d07m',
  70                 'ext': 'flv',
  71                 'title': 'Kaleidoscope, Leonard Cohen',
  72                 'description': 'The Canadian poet and songwriter reflects on his musical career.',
  73             },
  74             'params': {
  75                 # rtmp download
  76                 'skip_download': True,
  77             }
  78         },
  79         {
  80             'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
  81             'info_dict': {
  82                 'id': 'b00yng1d',
  83                 'ext': 'flv',
  84                 'title': 'The Man in Black: Series 3: The Printed Name',
  85                 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
  86                 'duration': 1800,
  87             },
  88             'params': {
  89                 # rtmp download
  90                 'skip_download': True,
  91             },
  92             'skip': 'Episode is no longer available on BBC iPlayer Radio',
  93         },
  94         {
  95             'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
  96             'info_dict': {
  97                 'id': 'b00yng1d',
  98                 'ext': 'flv',
  99                 'title': 'The Voice UK: Series 3: Blind Auditions 5',
 100                 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
 101                 'duration': 5100,
 102             },
 103             'params': {
 104                 # rtmp download
 105                 'skip_download': True,
 106             },
 107             'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
 108         },
 109         {
 110             'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
 111             'info_dict': {
 112                 'id': 'b03k3pb7',
 113                 'ext': 'flv',
 114                 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
 115                 'description': '2. Invasion',
 116                 'duration': 3600,
 117             },
 118             'params': {
 119                 # rtmp download
 120                 'skip_download': True,
 121             },
 122             'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
 123         }, {
 124             'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
 125             'info_dict': {
 126                 'id': 'b04v209v',
 127                 'ext': 'flv',
 128                 'title': 'Pete Tong, The Essential New Tune Special',
 129                 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
 130                 'duration': 10800,
 131             },
 132             'params': {
 133                 # rtmp download
 134                 'skip_download': True,
 135             },
 136             'skip': 'Episode is no longer available on BBC iPlayer Radio',
 137         }, {
 138             'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
 139             'note': 'Audio',
 140             'info_dict': {
 141                 'id': 'p022h44j',
 142                 'ext': 'flv',
 143                 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
 144                 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
 145                 'duration': 227,
 146             },
 147             'params': {
 148                 # rtmp download
 149                 'skip_download': True,
 150             }
 151         }, {
 152             'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
 153             'note': 'Video',
 154             'info_dict': {
 155                 'id': 'p025c103',
 156                 'ext': 'flv',
 157                 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
 158                 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
 159                 'duration': 226,
 160             },
 161             'params': {
 162                 # rtmp download
 163                 'skip_download': True,
 164             }
 165         }, {
 166             'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
 167             'info_dict': {
 168                 'id': 'p02n76xf',
 169                 'ext': 'flv',
 170                 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
 171                 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
 172                 'duration': 3540,
 173             },
 174             'params': {
 175                 # rtmp download
 176                 'skip_download': True,
 177             },
 178             'skip': 'geolocation',
 179         }, {
 180             'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
 181             'info_dict': {
 182                 'id': 'b05zmgw1',
 183                 'ext': 'flv',
 184                 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
 185                 'title': 'Royal Academy Summer Exhibition',
 186                 'duration': 3540,
 187             },
 188             'params': {
 189                 # rtmp download
 190                 'skip_download': True,
 191             },
 192             'skip': 'geolocation',
 193         }, {
 194             # iptv-all mediaset fails with geolocation however there is no geo restriction
 195             # for this programme at all
 196             'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
 197             'info_dict': {
 198                 'id': 'b06rkms3',
 199                 'ext': 'flv',
 200                 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
 201                 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
 202             },
 203             'params': {
 204                 # rtmp download
 205                 'skip_download': True,
 206             },
 207             'skip': 'Now it\'s really geo-restricted',
 208         }, {
 209             # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
 210             'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
 211             'info_dict': {
 212                 'id': 'p028bfkj',
 213                 'ext': 'flv',
 214                 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
 215                 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
 216             },
 217             'params': {
 218                 # rtmp download
 219                 'skip_download': True,
 220             },
 221         }, {
 222             'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
 223             'note': 'Audio',
 224             'info_dict': {
 225                 'id': 'm0007jz9',
 226                 'ext': 'mp4',
 227                 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
 228                 'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
 229                 'duration': 9840,
 230             },
 231             'params': {
 232                 # rtmp download
 233                 'skip_download': True,
 234             }
 235         }, {
 236             'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
 237             'only_matching': True,
 238         }, {
 239             'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
 240             'only_matching': True,
 241         }, {
 242             'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
 243             'only_matching': True,
 244         }, {
 245             'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
 246             'only_matching': True,
 247         }, {
 248             'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
 249             'only_matching': True,
 250         }, {
 251             'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
 252             'only_matching': True,
 253         }, {
 254             'url': 'https://www.bbc.co.uk/programmes/m00005xn',
 255             'only_matching': True,
 256         }, {
 257             'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
 258             'only_matching': True,
 259         }]
 260
 261     def _perform_login(self, username, password):
 262         login_page = self._download_webpage(
 263             self._LOGIN_URL, None, 'Downloading signin page')
 264
 265         login_form = self._hidden_inputs(login_page)
 266
 267         login_form.update({
 268             'username': username,
 269             'password': password,
 270         })
 271
 272         post_url = urljoin(self._LOGIN_URL, self._search_regex(
 273             r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
 274             'post url', default=self._LOGIN_URL, group='url'))
 275
 276         response, urlh = self._download_webpage_handle(
 277             post_url, None, 'Logging in', data=urlencode_postdata(login_form),
 278             headers={'Referer': self._LOGIN_URL})
 279
 280         if self._LOGIN_URL in urlh.geturl():
 281             error = clean_html(get_element_by_class('form-message', response))
 282             if error:
 283                 raise ExtractorError(
 284                     'Unable to login: %s' % error, expected=True)
 285             raise ExtractorError('Unable to log in')
 286
 287     class MediaSelectionError(Exception):
 288         def __init__(self, id):
 289             self.id = id
 290
 291     def _extract_asx_playlist(self, connection, programme_id):
 292         asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
 293         return [ref.get('href') for ref in asx.findall('./Entry/ref')]
 294
 295     def _extract_items(self, playlist):
 296         return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
 297
 298     def _extract_medias(self, media_selection):
 299         error = media_selection.get('result')
 300         if error:
 301             raise BBCCoUkIE.MediaSelectionError(error)
 302         return media_selection.get('media') or []
 303
 304     def _extract_connections(self, media):
 305         return media.get('connection') or []
 306
 307     def _get_subtitles(self, media, programme_id):
 308         subtitles = {}
 309         for connection in self._extract_connections(media):
 310             cc_url = url_or_none(connection.get('href'))
 311             if not cc_url:
 312                 continue
 313             captions = self._download_xml(
 314                 cc_url, programme_id, 'Downloading captions', fatal=False)
 315             if not isinstance(captions, xml.etree.ElementTree.Element):
 316                 continue
 317             subtitles['en'] = [
 318                 {
 319                     'url': connection.get('href'),
 320                     'ext': 'ttml',
 321                 },
 322             ]
 323             break
 324         return subtitles
 325
 326     def _raise_extractor_error(self, media_selection_error):
 327         raise ExtractorError(
 328             '%s returned error: %s' % (self.IE_NAME, media_selection_error.id),
 329             expected=True)
 330
 331     def _download_media_selector(self, programme_id):
 332         last_exception = None
 333         for media_set in self._MEDIA_SETS:
 334             try:
 335                 return self._download_media_selector_url(
 336                     self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
 337             except BBCCoUkIE.MediaSelectionError as e:
 338                 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
 339                     last_exception = e
 340                     continue
 341                 self._raise_extractor_error(e)
 342         self._raise_extractor_error(last_exception)
 343
 344     def _download_media_selector_url(self, url, programme_id=None):
 345         media_selection = self._download_json(
 346             url, programme_id, 'Downloading media selection JSON',
 347             expected_status=(403, 404))
 348         return self._process_media_selector(media_selection, programme_id)
 349
    def _process_media_selector(self, media_selection, programme_id):
        """Turn a media selection document into formats and subtitles.

        Every ``video``/``audio`` media entry contributes one or more formats
        per connection, depending on the connection's supplier, transfer
        format and protocol. A ``captions`` entry is handed to
        ``extract_subtitles``.

        Returns:
            A ``(formats, subtitles)`` tuple. ``subtitles`` is ``None`` when
            the selection contains no ``captions`` entry; with several such
            entries the last one wins.

        Raises:
            BBCCoUkIE.MediaSelectionError: propagated from ``_extract_medias``
                when the selection carries an error result.
        """
        formats = []
        subtitles = None
        # hrefs already handled, so a connection repeating a URL is skipped
        urls = []

        for media in self._extract_medias(media_selection):
            kind = media.get('kind')
            if kind in ('video', 'audio'):
                # Media-level properties shared by all of its connections
                bitrate = int_or_none(media.get('bitrate'))
                encoding = media.get('encoding')
                width = int_or_none(media.get('width'))
                height = int_or_none(media.get('height'))
                file_size = int_or_none(media.get('media_file_size'))
                for connection in self._extract_connections(media):
                    href = connection.get('href')
                    # A connection without href is never treated as a
                    # duplicate: None is not stored in ``urls``.
                    if href in urls:
                        continue
                    if href:
                        urls.append(href)
                    conn_kind = connection.get('kind')
                    protocol = connection.get('protocol')
                    supplier = connection.get('supplier')
                    transfer_format = connection.get('transferFormat')
                    # First available of supplier / connection kind / protocol
                    format_id = supplier or conn_kind or protocol
                    # ASX playlist
                    if supplier == 'asx':
                        # One format per playlist reference, numbered by position
                        for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
                            formats.append({
                                'url': ref,
                                'format_id': 'ref%s_%s' % (i, format_id),
                            })
                    elif transfer_format == 'dash':
                        # NOTE(review): href is passed on unchecked here and in the
                        # hls/hds branches - confirm the helpers tolerate None.
                        formats.extend(self._extract_mpd_formats(
                            href, programme_id, mpd_id=format_id, fatal=False))
                    elif transfer_format == 'hls':
                        # TODO: let expected_status be passed into _extract_xxx_formats() instead
                        # An ExtractorError wrapping an HTTP 403/404 yields no
                        # formats for this connection; any other one propagates.
                        try:
                            fmts = self._extract_m3u8_formats(
                                href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                                m3u8_id=format_id, fatal=False)
                        except ExtractorError as e:
                            if not (isinstance(e.exc_info[1], urllib.error.HTTPError)
                                    and e.exc_info[1].code in (403, 404)):
                                raise
                            fmts = []
                        formats.extend(fmts)
                    elif transfer_format == 'hds':
                        formats.extend(self._extract_f4m_formats(
                            href, programme_id, f4m_id=format_id, fatal=False))
                    else:
                        # Single-format connection: direct HTTP(S) link or RTMP stream
                        # NOTE(review): format_id is None when supplier, kind and
                        # protocol are all absent; the += below would then raise
                        # TypeError if a bitrate is present.
                        if not supplier and bitrate:
                            format_id += '-%d' % bitrate
                        fmt = {
                            'format_id': format_id,
                            'filesize': file_size,
                        }
                        if kind == 'video':
                            fmt.update({
                                'width': width,
                                'height': height,
                                'tbr': bitrate,
                                'vcodec': encoding,
                            })
                        else:
                            # Audio-only media: no video codec
                            fmt.update({
                                'abr': bitrate,
                                'acodec': encoding,
                                'vcodec': 'none',
                            })
                        if protocol in ('http', 'https'):
                            # Direct link
                            fmt.update({
                                'url': href,
                            })
                        elif protocol == 'rtmp':
                            # RTMP URL and app are assembled from the connection's
                            # server, application and auth string
                            application = connection.get('application', 'ondemand')
                            auth_string = connection.get('authString')
                            identifier = connection.get('identifier')
                            server = connection.get('server')
                            fmt.update({
                                'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                                'play_path': identifier,
                                'app': '%s?%s' % (application, auth_string),
                                'page_url': 'http://www.bbc.co.uk',
                                'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                                'rtmp_live': False,
                                'ext': 'flv',
                            })
                        else:
                            # Any other protocol: drop this connection
                            continue
                        formats.append(fmt)
            elif kind == 'captions':
                subtitles = self.extract_subtitles(media, programme_id)
        return formats, subtitles
 444
 445     def _download_playlist(self, playlist_id):
 446         try:
 447             playlist = self._download_json(
 448                 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
 449                 playlist_id, 'Downloading playlist JSON')
 450             formats = []
 451             subtitles = {}
 452
 453             for version in playlist.get('allAvailableVersions', []):
 454                 smp_config = version['smpConfig']
 455                 title = smp_config['title']
 456                 description = smp_config['summary']
 457                 for item in smp_config['items']:
 458                     kind = item['kind']
 459                     if kind not in ('programme', 'radioProgramme'):
 460                         continue
 461                     programme_id = item.get('vpid')
 462                     duration = int_or_none(item.get('duration'))
 463                     version_formats, version_subtitles = self._download_media_selector(programme_id)
 464                     types = version['types']
 465                     for f in version_formats:
 466                         f['format_note'] = ', '.join(types)
 467                         if any('AudioDescribed' in x for x in types):
 468                             f['language_preference'] = -10
 469                     formats += version_formats
 470                     for tag, subformats in (version_subtitles or {}).items():
 471                         subtitles.setdefault(tag, []).extend(subformats)
 472
 473             return programme_id, title, description, duration, formats, subtitles
 474         except ExtractorError as ee:
 475             if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
 476                 raise
 477
 478         # fallback to legacy playlist
 479         return self._process_legacy_playlist(playlist_id)
 480
 481     def _process_legacy_playlist_url(self, url, display_id):
 482         playlist = self._download_legacy_playlist_url(url, display_id)
 483         return self._extract_from_legacy_playlist(playlist, display_id)
 484
 485     def _process_legacy_playlist(self, playlist_id):
 486         return self._process_legacy_playlist_url(
 487             'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
 488
 489     def _download_legacy_playlist_url(self, url, playlist_id=None):
 490         return self._download_xml(
 491             url, playlist_id, 'Downloading legacy playlist XML')
 492
 493     def _extract_from_legacy_playlist(self, playlist, playlist_id):
 494         no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
 495         if no_items is not None:
 496             reason = no_items.get('reason')
 497             if reason == 'preAvailability':
 498                 msg = 'Episode %s is not yet available' % playlist_id
 499             elif reason == 'postAvailability':
 500                 msg = 'Episode %s is no longer available' % playlist_id
 501             elif reason == 'noMedia':
 502                 msg = 'Episode %s is not currently available' % playlist_id
 503             else:
 504                 msg = 'Episode %s is not available: %s' % (playlist_id, reason)
 505             raise ExtractorError(msg, expected=True)
 506
 507         for item in self._extract_items(playlist):
 508             kind = item.get('kind')
 509             if kind not in ('programme', 'radioProgramme'):
 510                 continue
 511             title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
 512             description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
 513             description = description_el.text if description_el is not None else None
 514
 515             def get_programme_id(item):
 516                 def get_from_attributes(item):
 517                     for p in ('identifier', 'group'):
 518                         value = item.get(p)
 519                         if value and re.match(r'^[pb][\da-z]{7}$', value):
 520                             return value
 521                 get_from_attributes(item)
 522                 mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
 523                 if mediator is not None:
 524                     return get_from_attributes(mediator)
 525
 526             programme_id = get_programme_id(item)
 527             duration = int_or_none(item.get('duration'))
 528
 529             if programme_id:
 530                 formats, subtitles = self._download_media_selector(programme_id)
 531             else:
 532                 formats, subtitles = self._process_media_selector(item, playlist_id)
 533                 programme_id = playlist_id
 534
 535         return programme_id, title, description, duration, formats, subtitles
 536
 537     def _real_extract(self, url):
 538         group_id = self._match_id(url)
 539
 540         webpage = self._download_webpage(url, group_id, 'Downloading video page')
 541
 542         error = self._search_regex(
 543             r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
 544             webpage, 'error', default=None)
 545         if error:
 546             raise ExtractorError(error, expected=True)
 547
 548         programme_id = None
 549         duration = None
 550
 551         tviplayer = self._search_regex(
 552             r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
 553             webpage, 'player', default=None)
 554
 555         if tviplayer:
 556             player = self._parse_json(tviplayer, group_id).get('player', {})
 557             duration = int_or_none(player.get('duration'))
 558             programme_id = player.get('vpid')
 559
 560         if not programme_id:
 561             programme_id = self._search_regex(
 562                 r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
 563
 564         if programme_id:
 565             formats, subtitles = self._download_media_selector(programme_id)
 566             title = self._og_search_title(webpage, default=None) or self._html_search_regex(
 567                 (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
 568                  r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
 569             description = self._search_regex(
 570                 (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
 571                  r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
 572                 webpage, 'description', default=None)
 573             if not description:
 574                 description = self._html_search_meta('description', webpage)
 575         else:
 576             programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
 577
 578         return {
 579             'id': programme_id,
 580             'title': title,
 581             'description': description,
 582             'thumbnail': self._og_search_thumbnail(webpage, default=None),
 583             'duration': duration,
 584             'formats': formats,
 585             'subtitles': subtitles,
 586         }
 587
 588
 589 class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
 590     IE_NAME = 'bbc'
 591     IE_DESC = 'BBC'
 592     _VALID_URL = r'''(?x)
 593         https?://(?:www\.)?(?:
 594             bbc\.(?:com|co\.uk)|
 595             bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd\.onion|
 596             bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad\.onion
 597         )/(?:[^/]+/)+(?P<id>[^/#?]+)'''
 598
 599     _MEDIA_SETS = [
 600         'pc',
 601         'mobile-tablet-main',
 602     ]
 603
 604     _TESTS = [{
 605         # article with multiple videos embedded with data-playable containing vpids
 606         'url': 'http://www.bbc.com/news/world-europe-32668511',
 607         'info_dict': {
 608             'id': 'world-europe-32668511',
 609             'title': 'Russia stages massive WW2 parade',
 610             'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
 611         },
 612         'playlist_count': 2,
 613     }, {
 614         # article with multiple videos embedded with data-playable (more videos)
 615         'url': 'http://www.bbc.com/news/business-28299555',
 616         'info_dict': {
 617             'id': 'business-28299555',
 618             'title': 'Farnborough Airshow: Video highlights',
 619             'description': 'BBC reports and video highlights at the Farnborough Airshow.',
 620         },
 621         'playlist_count': 9,
 622         'skip': 'Save time',
 623     }, {
 624         # article with multiple videos embedded with `new SMP()`
 625         # broken
 626         'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
 627         'info_dict': {
 628             'id': '3662a707-0af9-3149-963f-47bea720b460',
 629             'title': 'BUGGER',
 630         },
 631         'playlist_count': 18,
 632     }, {
 633         # single video embedded with data-playable containing vpid
 634         'url': 'http://www.bbc.com/news/world-europe-32041533',
 635         'info_dict': {
 636             'id': 'p02mprgb',
 637             'ext': 'mp4',
 638             'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
 639             'description': 'md5:2868290467291b37feda7863f7a83f54',
 640             'duration': 47,
 641             'timestamp': 1427219242,
 642             'upload_date': '20150324',
 643         },
 644         'params': {
 645             # rtmp download
 646             'skip_download': True,
 647         }
 648     }, {
 649         # article with single video embedded with data-playable containing XML playlist
 650         # with direct video links as progressiveDownloadUrl (for now these are extracted)
 651         # and playlist with f4m and m3u8 as streamingUrl
 652         'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
 653         'info_dict': {
 654             'id': '150615_telabyad_kentin_cogu',
 655             'ext': 'mp4',
 656             'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
 657             'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
 658             'timestamp': 1434397334,
 659             'upload_date': '20150615',
 660         },
 661         'params': {
 662             'skip_download': True,
 663         }
 664     }, {
 665         # single video embedded with data-playable containing XML playlists (regional section)
 666         'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
 667         'info_dict': {
 668             'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
 669             'ext': 'mp4',
 670             'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
 671             'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
 672             'timestamp': 1434713142,
 673             'upload_date': '20150619',
 674         },
 675         'params': {
 676             'skip_download': True,
 677         }
 678     }, {
 679         # single video from video playlist embedded with vxp-playlist-data JSON
 680         'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
 681         'info_dict': {
 682             'id': 'p02w6qjc',
 683             'ext': 'mp4',
 684             'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
 685             'duration': 56,
 686             'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
 687         },
 688         'params': {
 689             'skip_download': True,
 690         }
 691     }, {
 692         # single video story with digitalData
 693         'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
 694         'info_dict': {
 695             'id': 'p02q6gc4',
 696             'ext': 'flv',
 697             'title': 'Sri Lanka’s spicy secret',
 698             'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
 699             'timestamp': 1437674293,
 700             'upload_date': '20150723',
 701         },
 702         'params': {
 703             # rtmp download
 704             'skip_download': True,
 705         }
 706     }, {
 707         # single video story without digitalData
 708         'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
 709         'info_dict': {
 710             'id': 'p018zqqg',
 711             'ext': 'mp4',
 712             'title': 'Hyundai Santa Fe Sport: Rock star',
 713             'description': 'md5:b042a26142c4154a6e472933cf20793d',
 714             'timestamp': 1415867444,
 715             'upload_date': '20141113',
 716         },
 717         'params': {
 718             # rtmp download
 719             'skip_download': True,
 720         }
 721     }, {
 722         # single video embedded with Morph
 723         'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
 724         'info_dict': {
 725             'id': 'p041vhd0',
 726             'ext': 'mp4',
 727             'title': "Nigeria v Japan - Men's First Round",
 728             'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
 729             'duration': 7980,
 730             'uploader': 'BBC Sport',
 731             'uploader_id': 'bbc_sport',
 732         },
 733         'params': {
 734             # m3u8 download
 735             'skip_download': True,
 736         },
 737         'skip': 'Georestricted to UK',
 738     }, {
 739         # single video with playlist.sxml URL in playlist param
 740         'url': 'http://www.bbc.com/sport/0/football/33653409',
 741         'info_dict': {
 742             'id': 'p02xycnp',
 743             'ext': 'mp4',
 744             'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
 745             'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
 746             'duration': 140,
 747         },
 748         'params': {
 749             # rtmp download
 750             'skip_download': True,
 751         }
 752     }, {
 753         # article with multiple videos embedded with playlist.sxml in playlist param
 754         'url': 'http://www.bbc.com/sport/0/football/34475836',
 755         'info_dict': {
 756             'id': '34475836',
 757             'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
 758             'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
 759         },
 760         'playlist_count': 3,
 761     }, {
 762         # school report article with single video
 763         'url': 'http://www.bbc.co.uk/schoolreport/35744779',
 764         'info_dict': {
 765             'id': '35744779',
 766             'title': 'School which breaks down barriers in Jerusalem',
 767         },
 768         'playlist_count': 1,
 769     }, {
 770         # single video with playlist URL from weather section
 771         'url': 'http://www.bbc.com/weather/features/33601775',
 772         'only_matching': True,
 773     }, {
 774         # custom redirection to www.bbc.com
 775         # also, video with window.__INITIAL_DATA__
 776         'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
 777         'info_dict': {
 778             'id': 'p02xzws1',
 779             'ext': 'mp4',
 780             'title': "Pluto may have 'nitrogen glaciers'",
 781             'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
 782             'thumbnail': r're:https?://.+/.+\.jpg',
 783             'timestamp': 1437785037,
 784             'upload_date': '20150725',
 785         },
 786     }, {
 787         # video with window.__INITIAL_DATA__ and value as JSON string
 788         'url': 'https://www.bbc.com/news/av/world-europe-59468682',
 789         'info_dict': {
 790             'id': 'p0b71qth',
 791             'ext': 'mp4',
 792             'title': 'Why France is making this woman a national hero',
 793             'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
 794             'thumbnail': r're:https?://.+/.+\.jpg',
 795             'timestamp': 1638230731,
 796             'upload_date': '20211130',
 797         },
 798     }, {
 799         # single video article embedded with data-media-vpid
 800         'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
 801         'only_matching': True,
 802     }, {
 803         # bbcthreeConfig
 804         'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
 805         'info_dict': {
 806             'id': 'p06556y7',
 807             'ext': 'mp4',
 808             'title': 'Things Not To Say to people that live on council estates',
 809             'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
 810             'duration': 360,
 811             'thumbnail': r're:https?://.+/.+\.jpg',
 812         },
 813     }, {
 814         # window.__PRELOADED_STATE__
 815         'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
 816         'info_dict': {
 817             'id': 'b0b9z4vz',
 818             'ext': 'mp4',
 819             'title': 'Prom 6: An American in Paris and Turangalila',
 820             'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
 821             'uploader': 'Radio 3',
 822             'uploader_id': 'bbc_radio_three',
 823         },
 824     }, {
 825         'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
 826         'info_dict': {
 827             'id': 'p06w9tws',
 828             'ext': 'mp4',
 829             'title': 'md5:2fabf12a726603193a2879a055f72514',
 830             'description': 'Learn English words and phrases from this story',
 831         },
 832         'add_ie': [BBCCoUkIE.ie_key()],
 833     }, {
 834         # BBC Reel
 835         'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
 836         'info_dict': {
 837             'id': 'p07c6sb9',
 838             'ext': 'mp4',
 839             'title': 'How positive thinking is harming your happiness',
 840             'alt_title': 'The downsides of positive thinking',
 841             'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
 842             'duration': 235,
 843             'thumbnail': r're:https?://.+/p07c9dsr.jpg',
 844             'upload_date': '20190604',
 845             'categories': ['Psychology'],
 846         },
 847     }, {  # onion routes
 848         'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
 849         'only_matching': True,
 850     }, {
 851         'url': 'https://www.bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad.onion/sport/av/football/63195681',
 852         'only_matching': True,
 853     }]
 854
 855     @classmethod
 856     def suitable(cls, url):
 857         EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
 858         return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
 859                 else super(BBCIE, cls).suitable(url))
 860
 861     def _extract_from_media_meta(self, media_meta, video_id):
 862         # Direct links to media in media metadata (e.g.
 863         # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
 864         # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
 865         source_files = media_meta.get('sourceFiles')
 866         if source_files:
 867             return [{
 868                 'url': f['url'],
 869                 'format_id': format_id,
 870                 'ext': f.get('encoding'),
 871                 'tbr': float_or_none(f.get('bitrate'), 1000),
 872                 'filesize': int_or_none(f.get('filesize')),
 873             } for format_id, f in source_files.items() if f.get('url')], []
 874
 875         programme_id = media_meta.get('externalId')
 876         if programme_id:
 877             return self._download_media_selector(programme_id)
 878
 879         # Process playlist.sxml as legacy playlist
 880         href = media_meta.get('href')
 881         if href:
 882             playlist = self._download_legacy_playlist_url(href)
 883             _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
 884             return formats, subtitles
 885
 886         return [], []
 887
 888     def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
 889         programme_id, title, description, duration, formats, subtitles = \
 890             self._process_legacy_playlist_url(url, playlist_id)
 891         return {
 892             'id': programme_id,
 893             'title': title,
 894             'description': description,
 895             'duration': duration,
 896             'timestamp': timestamp,
 897             'formats': formats,
 898             'subtitles': subtitles,
 899         }
 900
 901     def _real_extract(self, url):
 902         playlist_id = self._match_id(url)
 903
 904         webpage = self._download_webpage(url, playlist_id)
 905
 906         json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
 907         timestamp = json_ld_info.get('timestamp')
 908
 909         playlist_title = json_ld_info.get('title') or re.sub(
 910             r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None
 911
 912         playlist_description = json_ld_info.get(
 913             'description') or self._og_search_description(webpage, default=None)
 914
 915         if not timestamp:
 916             timestamp = parse_iso8601(self._search_regex(
 917                 [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
 918                  r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
 919                  r'"datePublished":\s*"([^"]+)'],
 920                 webpage, 'date', default=None))
 921
 922         entries = []
 923
 924         # article with multiple videos embedded with playlist.sxml (e.g.
 925         # http://www.bbc.com/sport/0/football/34475836)
 926         playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
 927         playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
 928         if playlists:
 929             entries = [
 930                 self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
 931                 for playlist_url in playlists]
 932
 933         # news article with multiple videos embedded with data-playable
 934         data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
 935         if data_playables:
 936             for _, data_playable_json in data_playables:
 937                 data_playable = self._parse_json(
 938                     unescapeHTML(data_playable_json), playlist_id, fatal=False)
 939                 if not data_playable:
 940                     continue
 941                 settings = data_playable.get('settings', {})
 942                 if settings:
 943                     # data-playable with video vpid in settings.playlistObject.items (e.g.
 944                     # http://www.bbc.com/news/world-us-canada-34473351)
 945                     playlist_object = settings.get('playlistObject', {})
 946                     if playlist_object:
 947                         items = playlist_object.get('items')
 948                         if items and isinstance(items, list):
 949                             title = playlist_object['title']
 950                             description = playlist_object.get('summary')
 951                             duration = int_or_none(items[0].get('duration'))
 952                             programme_id = items[0].get('vpid')
 953                             formats, subtitles = self._download_media_selector(programme_id)
 954                             entries.append({
 955                                 'id': programme_id,
 956                                 'title': title,
 957                                 'description': description,
 958                                 'timestamp': timestamp,
 959                                 'duration': duration,
 960                                 'formats': formats,
 961                                 'subtitles': subtitles,
 962                             })
 963                     else:
 964                         # data-playable without vpid but with a playlist.sxml URLs
 965                         # in otherSettings.playlist (e.g.
 966                         # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
 967                         playlist = data_playable.get('otherSettings', {}).get('playlist', {})
 968                         if playlist:
 969                             entry = None
 970                             for key in ('streaming', 'progressiveDownload'):
 971                                 playlist_url = playlist.get('%sUrl' % key)
 972                                 if not playlist_url:
 973                                     continue
 974                                 try:
 975                                     info = self._extract_from_playlist_sxml(
 976                                         playlist_url, playlist_id, timestamp)
 977                                     if not entry:
 978                                         entry = info
 979                                     else:
 980                                         entry['title'] = info['title']
 981                                         entry['formats'].extend(info['formats'])
 982                                 except ExtractorError as e:
 983                                     # Some playlist URL may fail with 500, at the same time
 984                                     # the other one may work fine (e.g.
 985                                     # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
 986                                     if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
 987                                         continue
 988                                     raise
 989                             if entry:
 990                                 entries.append(entry)
 991
 992         if entries:
 993             return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
 994
 995         # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
 996         group_id = self._search_regex(
 997             r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
 998             webpage, 'group id', default=None)
 999         if group_id:
1000             return self.url_result(
1001                 'https://www.bbc.co.uk/programmes/%s' % group_id,
1002                 ie=BBCCoUkIE.ie_key())
1003
1004         # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
1005         programme_id = self._search_regex(
1006             [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
1007              r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
1008              r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
1009             webpage, 'vpid', default=None)
1010
1011         if programme_id:
1012             formats, subtitles = self._download_media_selector(programme_id)
1013             # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
1014             digital_data = self._parse_json(
1015                 self._search_regex(
1016                     r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
1017                 programme_id, fatal=False)
1018             page_info = digital_data.get('page', {}).get('pageInfo', {})
1019             title = page_info.get('pageName') or self._og_search_title(webpage)
1020             description = page_info.get('description') or self._og_search_description(webpage)
1021             timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
1022             return {
1023                 'id': programme_id,
1024                 'title': title,
1025                 'description': description,
1026                 'timestamp': timestamp,
1027                 'formats': formats,
1028                 'subtitles': subtitles,
1029             }
1030
1031         # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
1032         initial_data = self._parse_json(self._html_search_regex(
1033             r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
1034             webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
1035         if initial_data:
1036             init_data = try_get(
1037                 initial_data, lambda x: x['initData']['items'][0], dict) or {}
1038             smp_data = init_data.get('smpData') or {}
1039             clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
1040             version_id = clip_data.get('versionID')
1041             if version_id:
1042                 title = smp_data['title']
1043                 formats, subtitles = self._download_media_selector(version_id)
1044                 image_url = smp_data.get('holdingImageURL')
1045                 display_date = init_data.get('displayDate')
1046                 topic_title = init_data.get('topicTitle')
1047
1048                 return {
1049                     'id': version_id,
1050                     'title': title,
1051                     'formats': formats,
1052                     'alt_title': init_data.get('shortTitle'),
1053                     'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
1054                     'description': smp_data.get('summary') or init_data.get('shortSummary'),
1055                     'upload_date': display_date.replace('-', '') if display_date else None,
1056                     'subtitles': subtitles,
1057                     'duration': int_or_none(clip_data.get('duration')),
1058                     'categories': [topic_title] if topic_title else None,
1059                 }
1060
1061         # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
1062         # There are several setPayload calls may be present but the video
1063         # seems to be always related to the first one
1064         morph_payload = self._parse_json(
1065             self._search_regex(
1066                 r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
1067                 webpage, 'morph payload', default='{}'),
1068             playlist_id, fatal=False)
1069         if morph_payload:
1070             components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
1071             for component in components:
1072                 if not isinstance(component, dict):
1073                     continue
1074                 lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
1075                 if not lead_media:
1076                     continue
1077                 identifiers = lead_media.get('identifiers')
1078                 if not identifiers or not isinstance(identifiers, dict):
1079                     continue
1080                 programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
1081                 if not programme_id:
1082                     continue
1083                 title = lead_media.get('title') or self._og_search_title(webpage)
1084                 formats, subtitles = self._download_media_selector(programme_id)
1085                 description = lead_media.get('summary')
1086                 uploader = lead_media.get('masterBrand')
1087                 uploader_id = lead_media.get('mid')
1088                 duration = None
1089                 duration_d = lead_media.get('duration')
1090                 if isinstance(duration_d, dict):
1091                     duration = parse_duration(dict_get(
1092                         duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
1093                 return {
1094                     'id': programme_id,
1095                     'title': title,
1096                     'description': description,
1097                     'duration': duration,
1098                     'uploader': uploader,
1099                     'uploader_id': uploader_id,
1100                     'formats': formats,
1101                     'subtitles': subtitles,
1102                 }
1103
1104         preload_state = self._parse_json(self._search_regex(
1105             r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
1106             'preload state', default='{}'), playlist_id, fatal=False)
1107         if preload_state:
1108             current_programme = preload_state.get('programmes', {}).get('current') or {}
1109             programme_id = current_programme.get('id')
1110             if current_programme and programme_id and current_programme.get('type') == 'playable_item':
1111                 title = current_programme.get('titles', {}).get('tertiary') or playlist_title
1112                 formats, subtitles = self._download_media_selector(programme_id)
1113                 synopses = current_programme.get('synopses') or {}
1114                 network = current_programme.get('network') or {}
1115                 duration = int_or_none(
1116                     current_programme.get('duration', {}).get('value'))
1117                 thumbnail = None
1118                 image_url = current_programme.get('image_url')
1119                 if image_url:
1120                     thumbnail = image_url.replace('{recipe}', 'raw')
1121                 return {
1122                     'id': programme_id,
1123                     'title': title,
1124                     'description': dict_get(synopses, ('long', 'medium', 'short')),
1125                     'thumbnail': thumbnail,
1126                     'duration': duration,
1127                     'uploader': network.get('short_title'),
1128                     'uploader_id': network.get('id'),
1129                     'formats': formats,
1130                     'subtitles': subtitles,
1131                 }
1132
1133         bbc3_config = self._parse_json(
1134             self._search_regex(
1135                 r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
1136                 'bbcthree config', default='{}'),
1137             playlist_id, transform_source=js_to_json, fatal=False) or {}
1138         payload = bbc3_config.get('payload') or {}
1139         if payload:
1140             clip = payload.get('currentClip') or {}
1141             clip_vpid = clip.get('vpid')
1142             clip_title = clip.get('title')
1143             if clip_vpid and clip_title:
1144                 formats, subtitles = self._download_media_selector(clip_vpid)
1145                 return {
1146                     'id': clip_vpid,
1147                     'title': clip_title,
1148                     'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
1149                     'description': clip.get('description'),
1150                     'duration': parse_duration(clip.get('duration')),
1151                     'formats': formats,
1152                     'subtitles': subtitles,
1153                 }
1154             bbc3_playlist = try_get(
1155                 payload, lambda x: x['content']['bbcMedia']['playlist'],
1156                 dict)
1157             if bbc3_playlist:
1158                 playlist_title = bbc3_playlist.get('title') or playlist_title
1159                 thumbnail = bbc3_playlist.get('holdingImageURL')
1160                 entries = []
1161                 for bbc3_item in bbc3_playlist['items']:
1162                     programme_id = bbc3_item.get('versionID')
1163                     if not programme_id:
1164                         continue
1165                     formats, subtitles = self._download_media_selector(programme_id)
1166                     entries.append({
1167                         'id': programme_id,
1168                         'title': playlist_title,
1169                         'thumbnail': thumbnail,
1170                         'timestamp': timestamp,
1171                         'formats': formats,
1172                         'subtitles': subtitles,
1173                     })
1174                 return self.playlist_result(
1175                     entries, playlist_id, playlist_title, playlist_description)
1176
1177         initial_data = self._search_regex(
1178             r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
1179             'quoted preload state', default=None)
1180         if initial_data is None:
1181             initial_data = self._search_regex(
1182                 r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
1183                 'preload state', default={})
1184         else:
1185             initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
1186         initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
1187         if initial_data:
1188             def parse_media(media):
1189                 if not media:
1190                     return
1191                 for item in (try_get(media, lambda x: x['media']['items'], list) or []):
1192                     item_id = item.get('id')
1193                     item_title = item.get('title')
1194                     if not (item_id and item_title):
1195                         continue
1196                     formats, subtitles = self._download_media_selector(item_id)
1197                     item_desc = None
1198                     blocks = try_get(media, lambda x: x['summary']['blocks'], list)
1199                     if blocks:
1200                         summary = []
1201                         for block in blocks:
1202                             text = try_get(block, lambda x: x['model']['text'], compat_str)
1203                             if text:
1204                                 summary.append(text)
1205                         if summary:
1206                             item_desc = '\n\n'.join(summary)
1207                     item_time = None
1208                     for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
1209                         if try_get(meta, lambda x: x['label']) == 'Published':
1210                             item_time = unified_timestamp(meta.get('timestamp'))
1211                             break
1212                     entries.append({
1213                         'id': item_id,
1214                         'title': item_title,
1215                         'thumbnail': item.get('holdingImageUrl'),
1216                         'formats': formats,
1217                         'subtitles': subtitles,
1218                         'timestamp': item_time,
1219                         'description': strip_or_none(item_desc),
1220                     })
1221             for resp in (initial_data.get('data') or {}).values():
1222                 name = resp.get('name')
1223                 if name == 'media-experience':
1224                     parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
1225                 elif name == 'article':
1226                     for block in (try_get(resp,
1227                                           (lambda x: x['data']['blocks'],
1228                                            lambda x: x['data']['content']['model']['blocks'],),
1229                                           list) or []):
1230                         if block.get('type') not in ['media', 'video']:
1231                             continue
1232                         parse_media(block.get('model'))
1233             return self.playlist_result(
1234                 entries, playlist_id, playlist_title, playlist_description)
1235
1236         def extract_all(pattern):
1237             return list(filter(None, map(
1238                 lambda s: self._parse_json(s, playlist_id, fatal=False),
1239                 re.findall(pattern, webpage))))
1240
1241         # Multiple video article (e.g.
1242         # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
1243         EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
1244         entries = []
1245         for match in extract_all(r'new\s+SMP\(({.+?})\)'):
1246             embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
1247             if embed_url and re.match(EMBED_URL, embed_url):
1248                 entries.append(embed_url)
1249         entries.extend(re.findall(
1250             r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
1251         if entries:
1252             return self.playlist_result(
1253                 [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
1254                 playlist_id, playlist_title, playlist_description)
1255
1256         # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
1257         medias = extract_all(r"data-media-meta='({[^']+})'")
1258
1259         if not medias:
1260             # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
1261             media_asset = self._search_regex(
1262                 r'mediaAssetPage\.init\(\s*({.+?}), "/',
1263                 webpage, 'media asset', default=None)
1264             if media_asset:
1265                 media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
1266                 medias = []
1267                 for video in media_asset_page.get('videos', {}).values():
1268                     medias.extend(video.values())
1269
1270         if not medias:
1271             # Multiple video playlist with single `now playing` entry (e.g.
1272             # http://www.bbc.com/news/video_and_audio/must_see/33767813)
1273             vxp_playlist = self._parse_json(
1274                 self._search_regex(
1275                     r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
1276                     webpage, 'playlist data'),
1277                 playlist_id)
1278             playlist_medias = []
1279             for item in vxp_playlist:
1280                 media = item.get('media')
1281                 if not media:
1282                     continue
1283                 playlist_medias.append(media)
1284                 # Download single video if found media with asset id matching the video id from URL
1285                 if item.get('advert', {}).get('assetId') == playlist_id:
1286                     medias = [media]
1287                     break
1288             # Fallback to the whole playlist
1289             if not medias:
1290                 medias = playlist_medias
1291
1292         entries = []
1293         for num, media_meta in enumerate(medias, start=1):
1294             formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
1295             if not formats and not self.get_param('ignore_no_formats'):
1296                 continue
1297
1298             video_id = media_meta.get('externalId')
1299             if not video_id:
1300                 video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
1301
1302             title = media_meta.get('caption')
1303             if not title:
1304                 title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
1305
1306             duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
1307
1308             images = []
1309             for image in media_meta.get('images', {}).values():
1310                 images.extend(image.values())
1311             if 'image' in media_meta:
1312                 images.append(media_meta['image'])
1313
1314             thumbnails = [{
1315                 'url': image.get('href'),
1316                 'width': int_or_none(image.get('width')),
1317                 'height': int_or_none(image.get('height')),
1318             } for image in images]
1319
1320             entries.append({
1321                 'id': video_id,
1322                 'title': title,
1323                 'thumbnails': thumbnails,
1324                 'duration': duration,
1325                 'timestamp': timestamp,
1326                 'formats': formats,
1327                 'subtitles': subtitles,
1328             })
1329
1330         return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
1331
1332
class BBCCoUkArticleIE(InfoExtractor):
    """Extractor for bbc.co.uk programme article pages.

    An article page is turned into a playlist whose entries are the clip
    resources embedded in the page.
    """
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Build a playlist of the clips referenced by the article at *url*.

        The playlist id is the article id matched from the URL; title and
        description come from the page's Open Graph metadata.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # The description lookup may yield None when the page carries no
        # og:description; strip_or_none avoids calling .strip() on None.
        description = strip_or_none(self._og_search_description(webpage))

        # Every <div typeof="Clip" resource="..."> becomes one playlist entry.
        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
1361
1362
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared logic for paginated bbc.co.uk playlist pages.

    Subclasses are expected to provide ``_VIDEO_ID_TEMPLATE``,
    ``_URL_TEMPLATE`` and ``_extract_title_and_description()``, all of which
    are referenced here but not defined on this class.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a url result for every video id found on the playlist pages.

        webpage     -- HTML of the first page (already downloaded)
        url         -- URL of the playlist; used to resolve "next page" links
        playlist_id -- id used when reporting further page downloads

        If *url* carries an explicit ``page`` query parameter only that page
        is processed; otherwise "next" pagination links are followed until
        none is found.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # The first page is passed in, so downloaded pages are numbered from 2.
        for page_num in itertools.count(2):
            for video_id in re.findall(
                    self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # Pass the notes by keyword: the bare page number used to land in
            # the errnote slot, producing error messages such as "2: ...".
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                note='Downloading page %d' % page_num,
                errnote='Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Return a playlist result for the playlist page at *url*."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
1393
1394
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
    """Common machinery for iPlayer playlist extractors.

    Subclasses provide ``_PAGE_SIZE``, ``_DESCRIPTION_KEY`` and the
    ``_call_api`` / ``_get_elements`` / ``_get_episode`` /
    ``_get_episode_image`` / ``_get_episode_field`` /
    ``_get_playlist_data`` / ``_get_playlist_title`` hooks used below.
    """
    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    @staticmethod
    def _get_default(episode, key, default_key='default'):
        """Return ``episode[key][default_key]`` via try_get."""
        def pick(container):
            return container[key][default_key]

        return try_get(episode, pick)

    def _get_description(self, data):
        """Pick the largest available synopsis stored under _DESCRIPTION_KEY."""
        return dict_get(
            data.get(self._DESCRIPTION_KEY) or {},
            ('large', 'medium', 'small'))

    def _fetch_page(self, programme_id, per_page, series_id, page):
        """Yield url-type entries for one (zero-based) page of episodes."""
        # The API call uses page + 1, i.e. the zero-based index shifted by one.
        api_response = self._call_api(
            programme_id, per_page, page + 1, series_id)
        for item in self._get_elements(api_response):
            episode = self._get_episode(item)
            episode_id = episode.get('id')
            if episode_id:
                image_url = self._get_episode_image(episode)
                thumbnail = image_url.replace('{recipe}', 'raw') if image_url else None
                category = self._get_default(episode, 'labels', 'category')
                yield {
                    '_type': 'url',
                    'id': episode_id,
                    'title': self._get_episode_field(episode, 'subtitle'),
                    'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                    'thumbnail': thumbnail,
                    'description': self._get_description(episode),
                    'categories': [category] if category else None,
                    'series': self._get_episode_field(episode, 'title'),
                    'ie_key': BBCCoUkIE.ie_key(),
                }

    def _real_extract(self, url):
        """Return a playlist result for the programme/group at *url*.

        With an explicit ``page`` query parameter only that page is listed;
        otherwise all pages are exposed lazily through OnDemandPagedList.
        """
        pid = self._match_id(url)
        query = parse_qs(url)
        series_id = query.get('seriesId', [None])[0]
        page = query.get('page', [None])[0]
        if page:
            # NOTE(review): 36 is presumably the page size used by the
            # website itself - confirm.
            fetch_page = functools.partial(self._fetch_page, pid, 36, series_id)
            entries = fetch_page(int(page) - 1)
        else:
            fetch_page = functools.partial(
                self._fetch_page, pid, self._PAGE_SIZE, series_id)
            entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)
        # A separate single-item request supplies the playlist metadata.
        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
        playlist_title = self._get_playlist_title(playlist_data)
        playlist_description = self._get_description(playlist_data)
        return self.playlist_result(
            entries, pid, playlist_title, playlist_description)
1443
1444
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer "episodes" listing, backed by the graph.ibl.api.bbc.co.uk endpoint."""
    IE_NAME = 'bbc.co.uk:iplayer:episodes'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
        },
        'playlist_mincount': 8,
    }, {
        # all seasons
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 10,
    }, {
        # explicit season
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 5,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 37,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 1,
    }]
    _PAGE_SIZE = 100
    _DESCRIPTION_KEY = 'synopsis'

    def _get_episode_image(self, episode):
        """Return the value stored at episode['image']['default'], if present."""
        image = self._get_default(episode, 'image')
        return image

    def _get_episode_field(self, episode, field):
        """Return the value stored at episode[field]['default'], if present."""
        value = self._get_default(episode, field)
        return value

    @staticmethod
    def _get_elements(data):
        """Return the list found at data['entities']['results']."""
        entities = data['entities']
        return entities['results']

    @staticmethod
    def _get_episode(element):
        """Return the element's 'episode' mapping, or an empty dict if falsy."""
        episode = element.get('episode')
        return episode if episode else {}

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """POST a query for programme *pid* and return its 'programme' data.

        pid       -- programme id
        per_page  -- number of items requested per page
        page      -- page number sent to the API
        series_id -- optional series id, sent as 'sliceId' when truthy
        """
        variables = {
            'id': pid,
            'page': page,
            'perPage': per_page,
        }
        if series_id:
            variables['sliceId'] = series_id
        # The request body pairs a fixed query id with the variables above.
        payload = json.dumps({
            'id': '5692d93d5aac8d796a0305e895e61551',
            'variables': variables,
        }).encode('utf-8')
        response = self._download_json(
            'https://graph.ibl.api.bbc.co.uk/', pid,
            headers={'Content-Type': 'application/json'},
            data=payload)
        return response['data']['programme']

    @staticmethod
    def _get_playlist_data(data):
        """Return the API data itself, which is used as the playlist metadata."""
        return data

    def _get_playlist_title(self, data):
        """Return the value stored at data['title']['default'], if present."""
        playlist_title = self._get_default(data, 'title')
        return playlist_title
1531         return self._get_default(data, 'title')
1532
1533
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer "group" listing, backed by the ibl.api.bbc.co.uk groups endpoint."""
    IE_NAME = 'bbc.co.uk:iplayer:group'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
    _TESTS = [{
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 47,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 11,
    }]
    _PAGE_SIZE = 200
    _DESCRIPTION_KEY = 'synopses'

    def _get_episode_image(self, episode):
        """Return the value stored at episode['images']['standard'], if present."""
        standard_image = self._get_default(episode, 'images', 'standard')
        return standard_image

    def _get_episode_field(self, episode, field):
        """Return episode[field], or None when the key is absent."""
        value = episode.get(field)
        return value

    @staticmethod
    def _get_elements(data):
        """Return the list found at data['elements']."""
        elements = data['elements']
        return elements

    @staticmethod
    def _get_episode(element):
        """Return the element unchanged; each element is used as the episode."""
        return element

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Fetch one page of group *pid* and return its 'group_episodes' data.

        series_id is accepted for signature parity with the sibling
        extractor but is not used by this request.
        """
        query = {
            'page': page,
            'per_page': per_page,
        }
        response = self._download_json(
            'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid,
            pid, query=query)
        return response['group_episodes']

    @staticmethod
    def _get_playlist_data(data):
        """Return the 'group' mapping that holds the playlist metadata."""
        group = data['group']
        return group

    def _get_playlist_title(self, data):
        """Return data['title'], or None when the key is absent."""
        playlist_title = data.get('title')
        return playlist_title
1596
1597
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for /programmes/<pid>/(episodes|broadcasts|clips) pages."""
    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` from the page's Open Graph metadata.

        The title lookup is explicitly non-fatal.
        """
        return (
            self._og_search_title(webpage, fatal=False),
            self._og_search_description(webpage),
        )