yt_dlp/extractor/bbc.py

   1 import xml.etree.ElementTree
   2 import functools
   3 import itertools
   4 import json
   5 import re
   6
   7 from .common import InfoExtractor
   8 from ..compat import (
   9     compat_HTTPError,
  10     compat_str,
  11     compat_urllib_error,
  12     compat_urlparse,
  13 )
  14 from ..utils import (
  15     ExtractorError,
  16     OnDemandPagedList,
  17     clean_html,
  18     dict_get,
  19     float_or_none,
  20     get_element_by_class,
  21     int_or_none,
  22     js_to_json,
  23     parse_duration,
  24     parse_iso8601,
  25     parse_qs,
  26     strip_or_none,
  27     try_get,
  28     unescapeHTML,
  29     unified_timestamp,
  30     url_or_none,
  31     urlencode_postdata,
  32     urljoin,
  33 )
  34
  35
  36 class BBCCoUkIE(InfoExtractor):
  37     IE_NAME = 'bbc.co.uk'
  38     IE_DESC = 'BBC iPlayer'
  39     _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
  40     _VALID_URL = r'''(?x)
  41                     https?://
  42                         (?:www\.)?bbc\.co\.uk/
  43                         (?:
  44                             programmes/(?!articles/)|
  45                             iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
  46                             music/(?:clips|audiovideo/popular)[/#]|
  47                             radio/player/|
  48                             sounds/play/|
  49                             events/[^/]+/play/[^/]+/
  50                         )
  51                         (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
  52                     ''' % _ID_REGEX
  53
  54     _LOGIN_URL = 'https://account.bbc.com/signin'
  55     _NETRC_MACHINE = 'bbc'
  56
  57     _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
  58     _MEDIA_SETS = [
        # Provides HQ HLS streams with even better quality than the pc mediaset, but fails
        # with geolocation in some cases even when the programme is not geo-restricted at all
        # (e.g. http://www.bbc.co.uk/programmes/b06bp7lf). May also fail with selectionunavailable.
  62         'iptv-all',
  63         'pc',
  64     ]
  65
  66     _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
  67
  68     _TESTS = [
  69         {
  70             'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
  71             'info_dict': {
  72                 'id': 'b039d07m',
  73                 'ext': 'flv',
  74                 'title': 'Kaleidoscope, Leonard Cohen',
  75                 'description': 'The Canadian poet and songwriter reflects on his musical career.',
  76             },
  77             'params': {
  78                 # rtmp download
  79                 'skip_download': True,
  80             }
  81         },
  82         {
  83             'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
  84             'info_dict': {
  85                 'id': 'b00yng1d',
  86                 'ext': 'flv',
  87                 'title': 'The Man in Black: Series 3: The Printed Name',
  88                 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
  89                 'duration': 1800,
  90             },
  91             'params': {
  92                 # rtmp download
  93                 'skip_download': True,
  94             },
  95             'skip': 'Episode is no longer available on BBC iPlayer Radio',
  96         },
  97         {
  98             'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
  99             'info_dict': {
 100                 'id': 'b00yng1d',
 101                 'ext': 'flv',
 102                 'title': 'The Voice UK: Series 3: Blind Auditions 5',
 103                 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
 104                 'duration': 5100,
 105             },
 106             'params': {
 107                 # rtmp download
 108                 'skip_download': True,
 109             },
 110             'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
 111         },
 112         {
 113             'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
 114             'info_dict': {
 115                 'id': 'b03k3pb7',
 116                 'ext': 'flv',
 117                 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
 118                 'description': '2. Invasion',
 119                 'duration': 3600,
 120             },
 121             'params': {
 122                 # rtmp download
 123                 'skip_download': True,
 124             },
 125             'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
 126         }, {
 127             'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
 128             'info_dict': {
 129                 'id': 'b04v209v',
 130                 'ext': 'flv',
 131                 'title': 'Pete Tong, The Essential New Tune Special',
 132                 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
 133                 'duration': 10800,
 134             },
 135             'params': {
 136                 # rtmp download
 137                 'skip_download': True,
 138             },
 139             'skip': 'Episode is no longer available on BBC iPlayer Radio',
 140         }, {
 141             'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
 142             'note': 'Audio',
 143             'info_dict': {
 144                 'id': 'p022h44j',
 145                 'ext': 'flv',
 146                 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
 147                 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
 148                 'duration': 227,
 149             },
 150             'params': {
 151                 # rtmp download
 152                 'skip_download': True,
 153             }
 154         }, {
 155             'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
 156             'note': 'Video',
 157             'info_dict': {
 158                 'id': 'p025c103',
 159                 'ext': 'flv',
 160                 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
 161                 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
 162                 'duration': 226,
 163             },
 164             'params': {
 165                 # rtmp download
 166                 'skip_download': True,
 167             }
 168         }, {
 169             'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
 170             'info_dict': {
 171                 'id': 'p02n76xf',
 172                 'ext': 'flv',
 173                 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
 174                 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
 175                 'duration': 3540,
 176             },
 177             'params': {
 178                 # rtmp download
 179                 'skip_download': True,
 180             },
 181             'skip': 'geolocation',
 182         }, {
 183             'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
 184             'info_dict': {
 185                 'id': 'b05zmgw1',
 186                 'ext': 'flv',
 187                 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
 188                 'title': 'Royal Academy Summer Exhibition',
 189                 'duration': 3540,
 190             },
 191             'params': {
 192                 # rtmp download
 193                 'skip_download': True,
 194             },
 195             'skip': 'geolocation',
 196         }, {
 197             # iptv-all mediaset fails with geolocation however there is no geo restriction
 198             # for this programme at all
 199             'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
 200             'info_dict': {
 201                 'id': 'b06rkms3',
 202                 'ext': 'flv',
 203                 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
 204                 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
 205             },
 206             'params': {
 207                 # rtmp download
 208                 'skip_download': True,
 209             },
 210             'skip': 'Now it\'s really geo-restricted',
 211         }, {
 212             # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
 213             'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
 214             'info_dict': {
 215                 'id': 'p028bfkj',
 216                 'ext': 'flv',
 217                 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
 218                 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
 219             },
 220             'params': {
 221                 # rtmp download
 222                 'skip_download': True,
 223             },
 224         }, {
 225             'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
 226             'note': 'Audio',
 227             'info_dict': {
 228                 'id': 'm0007jz9',
 229                 'ext': 'mp4',
 230                 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
 231                 'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
 232                 'duration': 9840,
 233             },
 234             'params': {
 235                 # rtmp download
 236                 'skip_download': True,
 237             }
 238         }, {
 239             'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
 240             'only_matching': True,
 241         }, {
 242             'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
 243             'only_matching': True,
 244         }, {
 245             'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
 246             'only_matching': True,
 247         }, {
 248             'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
 249             'only_matching': True,
 250         }, {
 251             'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
 252             'only_matching': True,
 253         }, {
 254             'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
 255             'only_matching': True,
 256         }, {
 257             'url': 'https://www.bbc.co.uk/programmes/m00005xn',
 258             'only_matching': True,
 259         }, {
 260             'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
 261             'only_matching': True,
 262         }]
 263
 264     def _perform_login(self, username, password):
 265         login_page = self._download_webpage(
 266             self._LOGIN_URL, None, 'Downloading signin page')
 267
 268         login_form = self._hidden_inputs(login_page)
 269
 270         login_form.update({
 271             'username': username,
 272             'password': password,
 273         })
 274
 275         post_url = urljoin(self._LOGIN_URL, self._search_regex(
 276             r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
 277             'post url', default=self._LOGIN_URL, group='url'))
 278
 279         response, urlh = self._download_webpage_handle(
 280             post_url, None, 'Logging in', data=urlencode_postdata(login_form),
 281             headers={'Referer': self._LOGIN_URL})
 282
 283         if self._LOGIN_URL in urlh.geturl():
 284             error = clean_html(get_element_by_class('form-message', response))
 285             if error:
 286                 raise ExtractorError(
 287                     'Unable to login: %s' % error, expected=True)
 288             raise ExtractorError('Unable to log in')
 289
 290     class MediaSelectionError(Exception):
 291         def __init__(self, id):
 292             self.id = id
 293
 294     def _extract_asx_playlist(self, connection, programme_id):
 295         asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
 296         return [ref.get('href') for ref in asx.findall('./Entry/ref')]
 297
 298     def _extract_items(self, playlist):
 299         return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
 300
 301     def _extract_medias(self, media_selection):
 302         error = media_selection.get('result')
 303         if error:
 304             raise BBCCoUkIE.MediaSelectionError(error)
 305         return media_selection.get('media') or []
 306
 307     def _extract_connections(self, media):
 308         return media.get('connection') or []
 309
 310     def _get_subtitles(self, media, programme_id):
 311         subtitles = {}
 312         for connection in self._extract_connections(media):
 313             cc_url = url_or_none(connection.get('href'))
 314             if not cc_url:
 315                 continue
 316             captions = self._download_xml(
 317                 cc_url, programme_id, 'Downloading captions', fatal=False)
 318             if not isinstance(captions, xml.etree.ElementTree.Element):
 319                 continue
 320             subtitles['en'] = [
 321                 {
 322                     'url': connection.get('href'),
 323                     'ext': 'ttml',
 324                 },
 325             ]
 326             break
 327         return subtitles
 328
 329     def _raise_extractor_error(self, media_selection_error):
 330         raise ExtractorError(
 331             '%s returned error: %s' % (self.IE_NAME, media_selection_error.id),
 332             expected=True)
 333
 334     def _download_media_selector(self, programme_id):
 335         last_exception = None
 336         for media_set in self._MEDIA_SETS:
 337             try:
 338                 return self._download_media_selector_url(
 339                     self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
 340             except BBCCoUkIE.MediaSelectionError as e:
 341                 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
 342                     last_exception = e
 343                     continue
 344                 self._raise_extractor_error(e)
 345         self._raise_extractor_error(last_exception)
 346
 347     def _download_media_selector_url(self, url, programme_id=None):
 348         media_selection = self._download_json(
 349             url, programme_id, 'Downloading media selection JSON',
 350             expected_status=(403, 404))
 351         return self._process_media_selector(media_selection, programme_id)
 352
 353     def _process_media_selector(self, media_selection, programme_id):
 354         formats = []
 355         subtitles = None
 356         urls = []
 357
 358         for media in self._extract_medias(media_selection):
 359             kind = media.get('kind')
 360             if kind in ('video', 'audio'):
 361                 bitrate = int_or_none(media.get('bitrate'))
 362                 encoding = media.get('encoding')
 363                 width = int_or_none(media.get('width'))
 364                 height = int_or_none(media.get('height'))
 365                 file_size = int_or_none(media.get('media_file_size'))
 366                 for connection in self._extract_connections(media):
 367                     href = connection.get('href')
 368                     if href in urls:
 369                         continue
 370                     if href:
 371                         urls.append(href)
 372                     conn_kind = connection.get('kind')
 373                     protocol = connection.get('protocol')
 374                     supplier = connection.get('supplier')
 375                     transfer_format = connection.get('transferFormat')
 376                     format_id = supplier or conn_kind or protocol
 377                     # ASX playlist
 378                     if supplier == 'asx':
 379                         for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
 380                             formats.append({
 381                                 'url': ref,
 382                                 'format_id': 'ref%s_%s' % (i, format_id),
 383                             })
 384                     elif transfer_format == 'dash':
 385                         formats.extend(self._extract_mpd_formats(
 386                             href, programme_id, mpd_id=format_id, fatal=False))
 387                     elif transfer_format == 'hls':
 388                         # TODO: let expected_status be passed into _extract_xxx_formats() instead
 389                         try:
 390                             fmts = self._extract_m3u8_formats(
 391                                 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
 392                                 m3u8_id=format_id, fatal=False)
 393                         except ExtractorError as e:
 394                             if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError)
 395                                     and e.exc_info[1].code in (403, 404)):
 396                                 raise
 397                             fmts = []
 398                         formats.extend(fmts)
 399                     elif transfer_format == 'hds':
 400                         formats.extend(self._extract_f4m_formats(
 401                             href, programme_id, f4m_id=format_id, fatal=False))
 402                     else:
 403                         if not supplier and bitrate:
 404                             format_id += '-%d' % bitrate
 405                         fmt = {
 406                             'format_id': format_id,
 407                             'filesize': file_size,
 408                         }
 409                         if kind == 'video':
 410                             fmt.update({
 411                                 'width': width,
 412                                 'height': height,
 413                                 'tbr': bitrate,
 414                                 'vcodec': encoding,
 415                             })
 416                         else:
 417                             fmt.update({
 418                                 'abr': bitrate,
 419                                 'acodec': encoding,
 420                                 'vcodec': 'none',
 421                             })
 422                         if protocol in ('http', 'https'):
 423                             # Direct link
 424                             fmt.update({
 425                                 'url': href,
 426                             })
 427                         elif protocol == 'rtmp':
 428                             application = connection.get('application', 'ondemand')
 429                             auth_string = connection.get('authString')
 430                             identifier = connection.get('identifier')
 431                             server = connection.get('server')
 432                             fmt.update({
 433                                 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
 434                                 'play_path': identifier,
 435                                 'app': '%s?%s' % (application, auth_string),
 436                                 'page_url': 'http://www.bbc.co.uk',
 437                                 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
 438                                 'rtmp_live': False,
 439                                 'ext': 'flv',
 440                             })
 441                         else:
 442                             continue
 443                         formats.append(fmt)
 444             elif kind == 'captions':
 445                 subtitles = self.extract_subtitles(media, programme_id)
 446         return formats, subtitles
 447
 448     def _download_playlist(self, playlist_id):
 449         try:
 450             playlist = self._download_json(
 451                 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
 452                 playlist_id, 'Downloading playlist JSON')
 453             formats = []
 454             subtitles = {}
 455
 456             for version in playlist.get('allAvailableVersions', []):
 457                 smp_config = version['smpConfig']
 458                 title = smp_config['title']
 459                 description = smp_config['summary']
 460                 for item in smp_config['items']:
 461                     kind = item['kind']
 462                     if kind not in ('programme', 'radioProgramme'):
 463                         continue
 464                     programme_id = item.get('vpid')
 465                     duration = int_or_none(item.get('duration'))
 466                     version_formats, version_subtitles = self._download_media_selector(programme_id)
 467                     types = version['types']
 468                     for f in version_formats:
 469                         f['format_note'] = ', '.join(types)
 470                         if any('AudioDescribed' in x for x in types):
 471                             f['language_preference'] = -10
 472                     formats += version_formats
 473                     for tag, subformats in (version_subtitles or {}).items():
 474                         subtitles.setdefault(tag, []).extend(subformats)
 475
 476             return programme_id, title, description, duration, formats, subtitles
 477         except ExtractorError as ee:
 478             if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
 479                 raise
 480
 481         # fallback to legacy playlist
 482         return self._process_legacy_playlist(playlist_id)
 483
 484     def _process_legacy_playlist_url(self, url, display_id):
 485         playlist = self._download_legacy_playlist_url(url, display_id)
 486         return self._extract_from_legacy_playlist(playlist, display_id)
 487
 488     def _process_legacy_playlist(self, playlist_id):
 489         return self._process_legacy_playlist_url(
 490             'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
 491
 492     def _download_legacy_playlist_url(self, url, playlist_id=None):
 493         return self._download_xml(
 494             url, playlist_id, 'Downloading legacy playlist XML')
 495
 496     def _extract_from_legacy_playlist(self, playlist, playlist_id):
 497         no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
 498         if no_items is not None:
 499             reason = no_items.get('reason')
 500             if reason == 'preAvailability':
 501                 msg = 'Episode %s is not yet available' % playlist_id
 502             elif reason == 'postAvailability':
 503                 msg = 'Episode %s is no longer available' % playlist_id
 504             elif reason == 'noMedia':
 505                 msg = 'Episode %s is not currently available' % playlist_id
 506             else:
 507                 msg = 'Episode %s is not available: %s' % (playlist_id, reason)
 508             raise ExtractorError(msg, expected=True)
 509
 510         for item in self._extract_items(playlist):
 511             kind = item.get('kind')
 512             if kind not in ('programme', 'radioProgramme'):
 513                 continue
 514             title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
 515             description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
 516             description = description_el.text if description_el is not None else None
 517
 518             def get_programme_id(item):
 519                 def get_from_attributes(item):
 520                     for p in ('identifier', 'group'):
 521                         value = item.get(p)
 522                         if value and re.match(r'^[pb][\da-z]{7}$', value):
 523                             return value
 524                 get_from_attributes(item)
 525                 mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
 526                 if mediator is not None:
 527                     return get_from_attributes(mediator)
 528
 529             programme_id = get_programme_id(item)
 530             duration = int_or_none(item.get('duration'))
 531
 532             if programme_id:
 533                 formats, subtitles = self._download_media_selector(programme_id)
 534             else:
 535                 formats, subtitles = self._process_media_selector(item, playlist_id)
 536                 programme_id = playlist_id
 537
 538         return programme_id, title, description, duration, formats, subtitles
 539
 540     def _real_extract(self, url):
 541         group_id = self._match_id(url)
 542
 543         webpage = self._download_webpage(url, group_id, 'Downloading video page')
 544
 545         error = self._search_regex(
 546             r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
 547             webpage, 'error', default=None)
 548         if error:
 549             raise ExtractorError(error, expected=True)
 550
 551         programme_id = None
 552         duration = None
 553
 554         tviplayer = self._search_regex(
 555             r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
 556             webpage, 'player', default=None)
 557
 558         if tviplayer:
 559             player = self._parse_json(tviplayer, group_id).get('player', {})
 560             duration = int_or_none(player.get('duration'))
 561             programme_id = player.get('vpid')
 562
 563         if not programme_id:
 564             programme_id = self._search_regex(
 565                 r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
 566
 567         if programme_id:
 568             formats, subtitles = self._download_media_selector(programme_id)
 569             title = self._og_search_title(webpage, default=None) or self._html_search_regex(
 570                 (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
 571                  r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
 572             description = self._search_regex(
 573                 (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
 574                  r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
 575                 webpage, 'description', default=None)
 576             if not description:
 577                 description = self._html_search_meta('description', webpage)
 578         else:
 579             programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
 580
 581         self._sort_formats(formats)
 582
 583         return {
 584             'id': programme_id,
 585             'title': title,
 586             'description': description,
 587             'thumbnail': self._og_search_thumbnail(webpage, default=None),
 588             'duration': duration,
 589             'formats': formats,
 590             'subtitles': subtitles,
 591         }
 592
 593
 594 class BBCIE(BBCCoUkIE):
 595     IE_NAME = 'bbc'
 596     IE_DESC = 'BBC'
 597     _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
 598
 599     _MEDIA_SETS = [
 600         'pc',
 601         'mobile-tablet-main',
 602     ]
 603
 604     _TESTS = [{
 605         # article with multiple videos embedded with data-playable containing vpids
 606         'url': 'http://www.bbc.com/news/world-europe-32668511',
 607         'info_dict': {
 608             'id': 'world-europe-32668511',
 609             'title': 'Russia stages massive WW2 parade',
 610             'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
 611         },
 612         'playlist_count': 2,
 613     }, {
 614         # article with multiple videos embedded with data-playable (more videos)
 615         'url': 'http://www.bbc.com/news/business-28299555',
 616         'info_dict': {
 617             'id': 'business-28299555',
 618             'title': 'Farnborough Airshow: Video highlights',
 619             'description': 'BBC reports and video highlights at the Farnborough Airshow.',
 620         },
 621         'playlist_count': 9,
 622         'skip': 'Save time',
 623     }, {
 624         # article with multiple videos embedded with `new SMP()`
 625         # broken
 626         'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
 627         'info_dict': {
 628             'id': '3662a707-0af9-3149-963f-47bea720b460',
 629             'title': 'BUGGER',
 630         },
 631         'playlist_count': 18,
 632     }, {
 633         # single video embedded with data-playable containing vpid
 634         'url': 'http://www.bbc.com/news/world-europe-32041533',
 635         'info_dict': {
 636             'id': 'p02mprgb',
 637             'ext': 'mp4',
 638             'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
 639             'description': 'md5:2868290467291b37feda7863f7a83f54',
 640             'duration': 47,
 641             'timestamp': 1427219242,
 642             'upload_date': '20150324',
 643         },
 644         'params': {
 645             # rtmp download
 646             'skip_download': True,
 647         }
 648     }, {
 649         # article with single video embedded with data-playable containing XML playlist
 650         # with direct video links as progressiveDownloadUrl (for now these are extracted)
 651         # and playlist with f4m and m3u8 as streamingUrl
 652         'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
 653         'info_dict': {
 654             'id': '150615_telabyad_kentin_cogu',
 655             'ext': 'mp4',
 656             'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
 657             'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
 658             'timestamp': 1434397334,
 659             'upload_date': '20150615',
 660         },
 661         'params': {
 662             'skip_download': True,
 663         }
 664     }, {
 665         # single video embedded with data-playable containing XML playlists (regional section)
 666         'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
 667         'info_dict': {
 668             'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
 669             'ext': 'mp4',
 670             'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
 671             'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
 672             'timestamp': 1434713142,
 673             'upload_date': '20150619',
 674         },
 675         'params': {
 676             'skip_download': True,
 677         }
 678     }, {
 679         # single video from video playlist embedded with vxp-playlist-data JSON
 680         'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
 681         'info_dict': {
 682             'id': 'p02w6qjc',
 683             'ext': 'mp4',
 684             'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
 685             'duration': 56,
 686             'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
 687         },
 688         'params': {
 689             'skip_download': True,
 690         }
 691     }, {
 692         # single video story with digitalData
 693         'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
 694         'info_dict': {
 695             'id': 'p02q6gc4',
 696             'ext': 'flv',
 697             'title': 'Sri Lanka’s spicy secret',
 698             'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
 699             'timestamp': 1437674293,
 700             'upload_date': '20150723',
 701         },
 702         'params': {
 703             # rtmp download
 704             'skip_download': True,
 705         }
 706     }, {
 707         # single video story without digitalData
 708         'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
 709         'info_dict': {
 710             'id': 'p018zqqg',
 711             'ext': 'mp4',
 712             'title': 'Hyundai Santa Fe Sport: Rock star',
 713             'description': 'md5:b042a26142c4154a6e472933cf20793d',
 714             'timestamp': 1415867444,
 715             'upload_date': '20141113',
 716         },
 717         'params': {
 718             # rtmp download
 719             'skip_download': True,
 720         }
 721     }, {
 722         # single video embedded with Morph
 723         'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
 724         'info_dict': {
 725             'id': 'p041vhd0',
 726             'ext': 'mp4',
 727             'title': "Nigeria v Japan - Men's First Round",
 728             'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
 729             'duration': 7980,
 730             'uploader': 'BBC Sport',
 731             'uploader_id': 'bbc_sport',
 732         },
 733         'params': {
 734             # m3u8 download
 735             'skip_download': True,
 736         },
 737         'skip': 'Georestricted to UK',
 738     }, {
 739         # single video with playlist.sxml URL in playlist param
 740         'url': 'http://www.bbc.com/sport/0/football/33653409',
 741         'info_dict': {
 742             'id': 'p02xycnp',
 743             'ext': 'mp4',
 744             'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
 745             'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
 746             'duration': 140,
 747         },
 748         'params': {
 749             # rtmp download
 750             'skip_download': True,
 751         }
 752     }, {
 753         # article with multiple videos embedded with playlist.sxml in playlist param
 754         'url': 'http://www.bbc.com/sport/0/football/34475836',
 755         'info_dict': {
 756             'id': '34475836',
 757             'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
 758             'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
 759         },
 760         'playlist_count': 3,
 761     }, {
 762         # school report article with single video
 763         'url': 'http://www.bbc.co.uk/schoolreport/35744779',
 764         'info_dict': {
 765             'id': '35744779',
 766             'title': 'School which breaks down barriers in Jerusalem',
 767         },
 768         'playlist_count': 1,
 769     }, {
 770         # single video with playlist URL from weather section
 771         'url': 'http://www.bbc.com/weather/features/33601775',
 772         'only_matching': True,
 773     }, {
 774         # custom redirection to www.bbc.com
 775         # also, video with window.__INITIAL_DATA__
 776         'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
 777         'info_dict': {
 778             'id': 'p02xzws1',
 779             'ext': 'mp4',
 780             'title': "Pluto may have 'nitrogen glaciers'",
 781             'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
 782             'thumbnail': r're:https?://.+/.+\.jpg',
 783             'timestamp': 1437785037,
 784             'upload_date': '20150725',
 785         },
 786     }, {
 787         # video with window.__INITIAL_DATA__ and value as JSON string
 788         'url': 'https://www.bbc.com/news/av/world-europe-59468682',
 789         'info_dict': {
 790             'id': 'p0b71qth',
 791             'ext': 'mp4',
 792             'title': 'Why France is making this woman a national hero',
 793             'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
 794             'thumbnail': r're:https?://.+/.+\.jpg',
 795             'timestamp': 1638230731,
 796             'upload_date': '20211130',
 797         },
 798     }, {
 799         # single video article embedded with data-media-vpid
 800         'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
 801         'only_matching': True,
 802     }, {
 803         # bbcthreeConfig
 804         'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
 805         'info_dict': {
 806             'id': 'p06556y7',
 807             'ext': 'mp4',
 808             'title': 'Things Not To Say to people that live on council estates',
 809             'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
 810             'duration': 360,
 811             'thumbnail': r're:https?://.+/.+\.jpg',
 812         },
 813     }, {
 814         # window.__PRELOADED_STATE__
 815         'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
 816         'info_dict': {
 817             'id': 'b0b9z4vz',
 818             'ext': 'mp4',
 819             'title': 'Prom 6: An American in Paris and Turangalila',
 820             'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
 821             'uploader': 'Radio 3',
 822             'uploader_id': 'bbc_radio_three',
 823         },
 824     }, {
 825         'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
 826         'info_dict': {
 827             'id': 'p06w9tws',
 828             'ext': 'mp4',
 829             'title': 'md5:2fabf12a726603193a2879a055f72514',
 830             'description': 'Learn English words and phrases from this story',
 831         },
 832         'add_ie': [BBCCoUkIE.ie_key()],
 833     }, {
 834         # BBC Reel
 835         'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
 836         'info_dict': {
 837             'id': 'p07c6sb9',
 838             'ext': 'mp4',
 839             'title': 'How positive thinking is harming your happiness',
 840             'alt_title': 'The downsides of positive thinking',
 841             'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
 842             'duration': 235,
 843             'thumbnail': r're:https?://.+/p07c9dsr.jpg',
 844             'upload_date': '20190604',
 845             'categories': ['Psychology'],
 846         },
 847     }]
 848
 849     @classmethod
 850     def suitable(cls, url):
 851         EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
 852         return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
 853                 else super(BBCIE, cls).suitable(url))
 854
 855     def _extract_from_media_meta(self, media_meta, video_id):
 856         # Direct links to media in media metadata (e.g.
 857         # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
 858         # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
 859         source_files = media_meta.get('sourceFiles')
 860         if source_files:
 861             return [{
 862                 'url': f['url'],
 863                 'format_id': format_id,
 864                 'ext': f.get('encoding'),
 865                 'tbr': float_or_none(f.get('bitrate'), 1000),
 866                 'filesize': int_or_none(f.get('filesize')),
 867             } for format_id, f in source_files.items() if f.get('url')], []
 868
 869         programme_id = media_meta.get('externalId')
 870         if programme_id:
 871             return self._download_media_selector(programme_id)
 872
 873         # Process playlist.sxml as legacy playlist
 874         href = media_meta.get('href')
 875         if href:
 876             playlist = self._download_legacy_playlist_url(href)
 877             _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
 878             return formats, subtitles
 879
 880         return [], []
 881
 882     def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
 883         programme_id, title, description, duration, formats, subtitles = \
 884             self._process_legacy_playlist_url(url, playlist_id)
 885         self._sort_formats(formats)
 886         return {
 887             'id': programme_id,
 888             'title': title,
 889             'description': description,
 890             'duration': duration,
 891             'timestamp': timestamp,
 892             'formats': formats,
 893             'subtitles': subtitles,
 894         }
 895
 896     def _real_extract(self, url):
 897         playlist_id = self._match_id(url)
 898
 899         webpage = self._download_webpage(url, playlist_id)
 900
 901         json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
 902         timestamp = json_ld_info.get('timestamp')
 903
 904         playlist_title = json_ld_info.get('title')
 905         if not playlist_title:
 906             playlist_title = (self._og_search_title(webpage, default=None)
 907                               or self._html_extract_title(webpage, 'playlist title', default=None))
 908             if playlist_title:
 909                 playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
 910
 911         playlist_description = json_ld_info.get(
 912             'description') or self._og_search_description(webpage, default=None)
 913
 914         if not timestamp:
 915             timestamp = parse_iso8601(self._search_regex(
 916                 [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
 917                  r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
 918                  r'"datePublished":\s*"([^"]+)'],
 919                 webpage, 'date', default=None))
 920
 921         entries = []
 922
 923         # article with multiple videos embedded with playlist.sxml (e.g.
 924         # http://www.bbc.com/sport/0/football/34475836)
 925         playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
 926         playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
 927         if playlists:
 928             entries = [
 929                 self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
 930                 for playlist_url in playlists]
 931
 932         # news article with multiple videos embedded with data-playable
 933         data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
 934         if data_playables:
 935             for _, data_playable_json in data_playables:
 936                 data_playable = self._parse_json(
 937                     unescapeHTML(data_playable_json), playlist_id, fatal=False)
 938                 if not data_playable:
 939                     continue
 940                 settings = data_playable.get('settings', {})
 941                 if settings:
 942                     # data-playable with video vpid in settings.playlistObject.items (e.g.
 943                     # http://www.bbc.com/news/world-us-canada-34473351)
 944                     playlist_object = settings.get('playlistObject', {})
 945                     if playlist_object:
 946                         items = playlist_object.get('items')
 947                         if items and isinstance(items, list):
 948                             title = playlist_object['title']
 949                             description = playlist_object.get('summary')
 950                             duration = int_or_none(items[0].get('duration'))
 951                             programme_id = items[0].get('vpid')
 952                             formats, subtitles = self._download_media_selector(programme_id)
 953                             self._sort_formats(formats)
 954                             entries.append({
 955                                 'id': programme_id,
 956                                 'title': title,
 957                                 'description': description,
 958                                 'timestamp': timestamp,
 959                                 'duration': duration,
 960                                 'formats': formats,
 961                                 'subtitles': subtitles,
 962                             })
 963                     else:
 964                         # data-playable without vpid but with a playlist.sxml URLs
 965                         # in otherSettings.playlist (e.g.
 966                         # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
 967                         playlist = data_playable.get('otherSettings', {}).get('playlist', {})
 968                         if playlist:
 969                             entry = None
 970                             for key in ('streaming', 'progressiveDownload'):
 971                                 playlist_url = playlist.get('%sUrl' % key)
 972                                 if not playlist_url:
 973                                     continue
 974                                 try:
 975                                     info = self._extract_from_playlist_sxml(
 976                                         playlist_url, playlist_id, timestamp)
 977                                     if not entry:
 978                                         entry = info
 979                                     else:
 980                                         entry['title'] = info['title']
 981                                         entry['formats'].extend(info['formats'])
 982                                 except ExtractorError as e:
 983                                     # Some playlist URL may fail with 500, at the same time
 984                                     # the other one may work fine (e.g.
 985                                     # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
 986                                     if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
 987                                         continue
 988                                     raise
 989                             if entry:
 990                                 self._sort_formats(entry['formats'])
 991                                 entries.append(entry)
 992
 993         if entries:
 994             return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
 995
 996         # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
 997         group_id = self._search_regex(
 998             r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
 999             webpage, 'group id', default=None)
1000         if group_id:
1001             return self.url_result(
1002                 'https://www.bbc.co.uk/programmes/%s' % group_id,
1003                 ie=BBCCoUkIE.ie_key())
1004
1005         # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
1006         programme_id = self._search_regex(
1007             [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
1008              r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
1009              r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
1010             webpage, 'vpid', default=None)
1011
1012         if programme_id:
1013             formats, subtitles = self._download_media_selector(programme_id)
1014             self._sort_formats(formats)
1015             # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
1016             digital_data = self._parse_json(
1017                 self._search_regex(
1018                     r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
1019                 programme_id, fatal=False)
1020             page_info = digital_data.get('page', {}).get('pageInfo', {})
1021             title = page_info.get('pageName') or self._og_search_title(webpage)
1022             description = page_info.get('description') or self._og_search_description(webpage)
1023             timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
1024             return {
1025                 'id': programme_id,
1026                 'title': title,
1027                 'description': description,
1028                 'timestamp': timestamp,
1029                 'formats': formats,
1030                 'subtitles': subtitles,
1031             }
1032
1033         # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
1034         initial_data = self._parse_json(self._html_search_regex(
1035             r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
1036             webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
1037         if initial_data:
1038             init_data = try_get(
1039                 initial_data, lambda x: x['initData']['items'][0], dict) or {}
1040             smp_data = init_data.get('smpData') or {}
1041             clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
1042             version_id = clip_data.get('versionID')
1043             if version_id:
1044                 title = smp_data['title']
1045                 formats, subtitles = self._download_media_selector(version_id)
1046                 self._sort_formats(formats)
1047                 image_url = smp_data.get('holdingImageURL')
1048                 display_date = init_data.get('displayDate')
1049                 topic_title = init_data.get('topicTitle')
1050
1051                 return {
1052                     'id': version_id,
1053                     'title': title,
1054                     'formats': formats,
1055                     'alt_title': init_data.get('shortTitle'),
1056                     'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
1057                     'description': smp_data.get('summary') or init_data.get('shortSummary'),
1058                     'upload_date': display_date.replace('-', '') if display_date else None,
1059                     'subtitles': subtitles,
1060                     'duration': int_or_none(clip_data.get('duration')),
1061                     'categories': [topic_title] if topic_title else None,
1062                 }
1063
1064         # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
1065         # There are several setPayload calls may be present but the video
1066         # seems to be always related to the first one
1067         morph_payload = self._parse_json(
1068             self._search_regex(
1069                 r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
1070                 webpage, 'morph payload', default='{}'),
1071             playlist_id, fatal=False)
1072         if morph_payload:
1073             components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
1074             for component in components:
1075                 if not isinstance(component, dict):
1076                     continue
1077                 lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
1078                 if not lead_media:
1079                     continue
1080                 identifiers = lead_media.get('identifiers')
1081                 if not identifiers or not isinstance(identifiers, dict):
1082                     continue
1083                 programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
1084                 if not programme_id:
1085                     continue
1086                 title = lead_media.get('title') or self._og_search_title(webpage)
1087                 formats, subtitles = self._download_media_selector(programme_id)
1088                 self._sort_formats(formats)
1089                 description = lead_media.get('summary')
1090                 uploader = lead_media.get('masterBrand')
1091                 uploader_id = lead_media.get('mid')
1092                 duration = None
1093                 duration_d = lead_media.get('duration')
1094                 if isinstance(duration_d, dict):
1095                     duration = parse_duration(dict_get(
1096                         duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
1097                 return {
1098                     'id': programme_id,
1099                     'title': title,
1100                     'description': description,
1101                     'duration': duration,
1102                     'uploader': uploader,
1103                     'uploader_id': uploader_id,
1104                     'formats': formats,
1105                     'subtitles': subtitles,
1106                 }
1107
1108         preload_state = self._parse_json(self._search_regex(
1109             r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
1110             'preload state', default='{}'), playlist_id, fatal=False)
1111         if preload_state:
1112             current_programme = preload_state.get('programmes', {}).get('current') or {}
1113             programme_id = current_programme.get('id')
1114             if current_programme and programme_id and current_programme.get('type') == 'playable_item':
1115                 title = current_programme.get('titles', {}).get('tertiary') or playlist_title
1116                 formats, subtitles = self._download_media_selector(programme_id)
1117                 self._sort_formats(formats)
1118                 synopses = current_programme.get('synopses') or {}
1119                 network = current_programme.get('network') or {}
1120                 duration = int_or_none(
1121                     current_programme.get('duration', {}).get('value'))
1122                 thumbnail = None
1123                 image_url = current_programme.get('image_url')
1124                 if image_url:
1125                     thumbnail = image_url.replace('{recipe}', 'raw')
1126                 return {
1127                     'id': programme_id,
1128                     'title': title,
1129                     'description': dict_get(synopses, ('long', 'medium', 'short')),
1130                     'thumbnail': thumbnail,
1131                     'duration': duration,
1132                     'uploader': network.get('short_title'),
1133                     'uploader_id': network.get('id'),
1134                     'formats': formats,
1135                     'subtitles': subtitles,
1136                 }
1137
1138         bbc3_config = self._parse_json(
1139             self._search_regex(
1140                 r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
1141                 'bbcthree config', default='{}'),
1142             playlist_id, transform_source=js_to_json, fatal=False) or {}
1143         payload = bbc3_config.get('payload') or {}
1144         if payload:
1145             clip = payload.get('currentClip') or {}
1146             clip_vpid = clip.get('vpid')
1147             clip_title = clip.get('title')
1148             if clip_vpid and clip_title:
1149                 formats, subtitles = self._download_media_selector(clip_vpid)
1150                 self._sort_formats(formats)
1151                 return {
1152                     'id': clip_vpid,
1153                     'title': clip_title,
1154                     'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
1155                     'description': clip.get('description'),
1156                     'duration': parse_duration(clip.get('duration')),
1157                     'formats': formats,
1158                     'subtitles': subtitles,
1159                 }
1160             bbc3_playlist = try_get(
1161                 payload, lambda x: x['content']['bbcMedia']['playlist'],
1162                 dict)
1163             if bbc3_playlist:
1164                 playlist_title = bbc3_playlist.get('title') or playlist_title
1165                 thumbnail = bbc3_playlist.get('holdingImageURL')
1166                 entries = []
1167                 for bbc3_item in bbc3_playlist['items']:
1168                     programme_id = bbc3_item.get('versionID')
1169                     if not programme_id:
1170                         continue
1171                     formats, subtitles = self._download_media_selector(programme_id)
1172                     self._sort_formats(formats)
1173                     entries.append({
1174                         'id': programme_id,
1175                         'title': playlist_title,
1176                         'thumbnail': thumbnail,
1177                         'timestamp': timestamp,
1178                         'formats': formats,
1179                         'subtitles': subtitles,
1180                     })
1181                 return self.playlist_result(
1182                     entries, playlist_id, playlist_title, playlist_description)
1183
1184         initial_data = self._search_regex(
1185             r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
1186             'quoted preload state', default=None)
1187         if initial_data is None:
1188             initial_data = self._search_regex(
1189                 r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
1190                 'preload state', default={})
1191         else:
1192             initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
1193         initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
1194         if initial_data:
1195             def parse_media(media):
1196                 if not media:
1197                     return
1198                 for item in (try_get(media, lambda x: x['media']['items'], list) or []):
1199                     item_id = item.get('id')
1200                     item_title = item.get('title')
1201                     if not (item_id and item_title):
1202                         continue
1203                     formats, subtitles = self._download_media_selector(item_id)
1204                     self._sort_formats(formats)
1205                     item_desc = None
1206                     blocks = try_get(media, lambda x: x['summary']['blocks'], list)
1207                     if blocks:
1208                         summary = []
1209                         for block in blocks:
1210                             text = try_get(block, lambda x: x['model']['text'], compat_str)
1211                             if text:
1212                                 summary.append(text)
1213                         if summary:
1214                             item_desc = '\n\n'.join(summary)
1215                     item_time = None
1216                     for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
1217                         if try_get(meta, lambda x: x['label']) == 'Published':
1218                             item_time = unified_timestamp(meta.get('timestamp'))
1219                             break
1220                     entries.append({
1221                         'id': item_id,
1222                         'title': item_title,
1223                         'thumbnail': item.get('holdingImageUrl'),
1224                         'formats': formats,
1225                         'subtitles': subtitles,
1226                         'timestamp': item_time,
1227                         'description': strip_or_none(item_desc),
1228                     })
1229             for resp in (initial_data.get('data') or {}).values():
1230                 name = resp.get('name')
1231                 if name == 'media-experience':
1232                     parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
1233                 elif name == 'article':
1234                     for block in (try_get(resp,
1235                                           (lambda x: x['data']['blocks'],
1236                                            lambda x: x['data']['content']['model']['blocks'],),
1237                                           list) or []):
1238                         if block.get('type') != 'media':
1239                             continue
1240                         parse_media(block.get('model'))
1241             return self.playlist_result(
1242                 entries, playlist_id, playlist_title, playlist_description)
1243
1244         def extract_all(pattern):
1245             return list(filter(None, map(
1246                 lambda s: self._parse_json(s, playlist_id, fatal=False),
1247                 re.findall(pattern, webpage))))
1248
1249         # Multiple video article (e.g.
1250         # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
1251         EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
1252         entries = []
1253         for match in extract_all(r'new\s+SMP\(({.+?})\)'):
1254             embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
1255             if embed_url and re.match(EMBED_URL, embed_url):
1256                 entries.append(embed_url)
1257         entries.extend(re.findall(
1258             r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
1259         if entries:
1260             return self.playlist_result(
1261                 [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
1262                 playlist_id, playlist_title, playlist_description)
1263
1264         # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
1265         medias = extract_all(r"data-media-meta='({[^']+})'")
1266
1267         if not medias:
1268             # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
1269             media_asset = self._search_regex(
1270                 r'mediaAssetPage\.init\(\s*({.+?}), "/',
1271                 webpage, 'media asset', default=None)
1272             if media_asset:
1273                 media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
1274                 medias = []
1275                 for video in media_asset_page.get('videos', {}).values():
1276                     medias.extend(video.values())
1277
1278         if not medias:
1279             # Multiple video playlist with single `now playing` entry (e.g.
1280             # http://www.bbc.com/news/video_and_audio/must_see/33767813)
1281             vxp_playlist = self._parse_json(
1282                 self._search_regex(
1283                     r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
1284                     webpage, 'playlist data'),
1285                 playlist_id)
1286             playlist_medias = []
1287             for item in vxp_playlist:
1288                 media = item.get('media')
1289                 if not media:
1290                     continue
1291                 playlist_medias.append(media)
1292                 # Download single video if found media with asset id matching the video id from URL
1293                 if item.get('advert', {}).get('assetId') == playlist_id:
1294                     medias = [media]
1295                     break
1296             # Fallback to the whole playlist
1297             if not medias:
1298                 medias = playlist_medias
1299
1300         entries = []
1301         for num, media_meta in enumerate(medias, start=1):
1302             formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
1303             if not formats and not self.get_param('ignore_no_formats'):
1304                 continue
1305             self._sort_formats(formats)
1306
1307             video_id = media_meta.get('externalId')
1308             if not video_id:
1309                 video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
1310
1311             title = media_meta.get('caption')
1312             if not title:
1313                 title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
1314
1315             duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
1316
1317             images = []
1318             for image in media_meta.get('images', {}).values():
1319                 images.extend(image.values())
1320             if 'image' in media_meta:
1321                 images.append(media_meta['image'])
1322
1323             thumbnails = [{
1324                 'url': image.get('href'),
1325                 'width': int_or_none(image.get('width')),
1326                 'height': int_or_none(image.get('height')),
1327             } for image in images]
1328
1329             entries.append({
1330                 'id': video_id,
1331                 'title': title,
1332                 'thumbnails': thumbnails,
1333                 'duration': duration,
1334                 'timestamp': timestamp,
1335                 'formats': formats,
1336                 'subtitles': subtitles,
1337             })
1338
1339         return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
1340
1341
class BBCCoUkArticleIE(InfoExtractor):
    """Turn a bbc.co.uk programme article into a playlist of its clips.

    Every ``<div typeof="Clip" resource="...">`` element found in the article
    page contributes one playlist entry pointing at the ``resource`` URL.
    """

    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Download the article at *url* and return a playlist result.

        The playlist id is the article id matched by ``_VALID_URL``; title and
        description come from the page's Open Graph metadata.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # og:description is optional metadata: when the lookup comes back as
        # None a bare `.strip()` would raise AttributeError and abort the whole
        # extraction, so strip only when there is something to strip.
        description = strip_or_none(self._og_search_description(webpage))

        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
1370
1371
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared implementation for bbc.co.uk playlists scraped from paginated HTML.

    This class reads the following members from ``self``, so concrete
    extractors have to provide them:

    * ``_VIDEO_ID_TEMPLATE`` - regex template filled with ``BBCCoUkIE._ID_REGEX``
      whose single group captures a video id in the page markup;
    * ``_URL_TEMPLATE`` - format string turning a video id into a URL;
    * ``_extract_title_and_description(webpage)`` - returns ``(title, description)``.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield one ``url_result`` per video id found, following pagination.

        webpage     -- HTML of the already downloaded first page
        url         -- original playlist URL; also the base for relative
                       "next page" links
        playlist_id -- id used when downloading further pages

        If *url* carries an explicit ``page`` query parameter only that single
        page is processed; otherwise "next" links are followed until a page
        has none.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # The initial *webpage* counts as the first page, so the pages
        # downloaded inside the loop are numbered starting from 2.
        for page_num in itertools.count(2):
            for video_id in re.findall(
                    self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # Pass a readable error note: the bare page number used to be
            # handed over in the `errnote` position, which made download
            # failures show up as e.g. "2: HTTP Error 404".
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                note='Downloading page %d' % page_num,
                errnote='Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Download the playlist page and return a lazily evaluated playlist."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
1402
1403
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
    """Common paging logic for iPlayer playlists that are backed by a JSON API.

    The methods below read these members from ``self``, so concrete extractors
    have to define them: ``_PAGE_SIZE``, ``_DESCRIPTION_KEY``, ``_call_api``,
    ``_get_elements``, ``_get_episode``, ``_get_episode_image``,
    ``_get_episode_field``, ``_get_playlist_data`` and ``_get_playlist_title``.
    """

    # '%%s' survives the interpolation below as '%s', leaving a slot for the
    # URL path segment that each subclass fills in.
    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    @staticmethod
    def _get_default(episode, key, default_key='default'):
        """Return ``episode[key][default_key]`` via ``try_get``."""
        return try_get(episode, lambda obj: obj[key][default_key])

    def _get_description(self, data):
        """Pick the 'large', 'medium' or 'small' synopsis (in that order) from *data*."""
        synopses = data.get(self._DESCRIPTION_KEY)
        if not synopses:
            synopses = {}
        return dict_get(synopses, ('large', 'medium', 'small'))

    def _fetch_page(self, programme_id, per_page, series_id, page):
        """Yield url-type entries for one page of episodes.

        *page* is zero-based here; the API is queried with ``page + 1``.
        Elements whose episode has no id are skipped.
        """
        api_data = self._call_api(programme_id, per_page, page + 1, series_id)
        for element in self._get_elements(api_data):
            episode = self._get_episode(element)
            episode_id = episode.get('id')
            if not episode_id:
                continue
            image = self._get_episode_image(episode)
            # The image value contains a '{recipe}' placeholder; substitute 'raw'
            thumbnail = image.replace('{recipe}', 'raw') if image else None
            category = self._get_default(episode, 'labels', 'category')
            yield {
                '_type': 'url',
                'id': episode_id,
                'title': self._get_episode_field(episode, 'subtitle'),
                'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                'thumbnail': thumbnail,
                'description': self._get_description(episode),
                'categories': [category] if category else None,
                'series': self._get_episode_field(episode, 'title'),
                'ie_key': BBCCoUkIE.ie_key(),
            }

    def _real_extract(self, url):
        """Return the playlist for *url*, honouring ``seriesId`` and ``page``.

        With an explicit ``page`` query parameter only that page is returned;
        without it all pages are fetched on demand.
        """
        pid = self._match_id(url)
        query = parse_qs(url)
        series_id = query.get('seriesId', [None])[0]
        page = query.get('page', [None])[0]
        if page:
            # An explicit page is fetched with a fixed size of 36 items
            # (presumably the page size used by the website - TODO confirm).
            fetch_page = functools.partial(self._fetch_page, pid, 36, series_id)
            # NOTE(review): a non-numeric `page` value raises ValueError here
            entries = fetch_page(int(page) - 1)
        else:
            fetch_page = functools.partial(
                self._fetch_page, pid, self._PAGE_SIZE, series_id)
            entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)
        # A separate request with a page size of 1 supplies the playlist metadata
        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
        playlist_title = self._get_playlist_title(playlist_data)
        playlist_description = self._get_description(playlist_data)
        return self.playlist_result(
            entries, pid, playlist_title, playlist_description)
1452
1453
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/iplayer/episodes/<pid>`` playlists, read from the graph API."""

    IE_NAME = 'bbc.co.uk:iplayer:episodes'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
        },
        'playlist_mincount': 8,
    }, {
        # all seasons
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 10,
    }, {
        # explicit season
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 5,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 37,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 1,
    }]
    # Items requested per API call when no explicit page is given
    _PAGE_SIZE = 100
    # Key under which an episode/programme object stores its synopsis dict
    _DESCRIPTION_KEY = 'synopsis'

    def _get_episode_image(self, episode):
        """Return ``episode['image']['default']`` if present."""
        return self._get_default(episode, 'image')

    def _get_episode_field(self, episode, field):
        """Return ``episode[field]['default']`` if present."""
        return self._get_default(episode, field)

    @staticmethod
    def _get_elements(data):
        """Return the result list of a programme object from the API."""
        entities = data['entities']
        return entities['results']

    @staticmethod
    def _get_episode(element):
        """Return the element's 'episode' object, or an empty dict if it has none."""
        episode = element.get('episode')
        return episode if episode else {}

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Query the graph API and return ``response['data']['programme']``.

        *series_id*, when given, is sent as the ``sliceId`` variable.
        """
        variables = {'id': pid, 'page': page, 'perPage': per_page}
        if series_id:
            variables['sliceId'] = series_id
        # The request body references a fixed query id plus its variables
        payload = json.dumps({
            'id': '5692d93d5aac8d796a0305e895e61551',
            'variables': variables,
        }).encode('utf-8')
        response = self._download_json(
            'https://graph.ibl.api.bbc.co.uk/', pid,
            headers={'Content-Type': 'application/json'}, data=payload)
        return response['data']['programme']

    @staticmethod
    def _get_playlist_data(data):
        """The programme object itself carries the playlist metadata."""
        return data

    def _get_playlist_title(self, data):
        """Return ``data['title']['default']`` if present."""
        return self._get_default(data, 'title')
1541
1542
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/iplayer/group/<pid>`` playlists, read from the ibl v1 API."""

    IE_NAME = 'bbc.co.uk:iplayer:group'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
    _TESTS = [{
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 47,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 11,
    }]
    # Items requested per API call when no explicit page is given
    _PAGE_SIZE = 200
    # Key under which an episode/group object stores its synopsis dict
    _DESCRIPTION_KEY = 'synopses'

    def _get_episode_image(self, episode):
        """Return ``episode['images']['standard']`` if present."""
        return self._get_default(episode, 'images', 'standard')

    def _get_episode_field(self, episode, field):
        """Group episodes keep their fields as plain top-level values."""
        return episode.get(field)

    @staticmethod
    def _get_elements(data):
        """Return the list of elements of a group_episodes object."""
        return data['elements']

    @staticmethod
    def _get_episode(element):
        """Each element already is the episode object."""
        return element

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Fetch one page of the group's episodes.

        Returns ``response['group_episodes']``; *series_id* is accepted for
        signature compatibility and not used by this request.
        """
        response = self._download_json(
            'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid, pid,
            query={'page': page, 'per_page': per_page})
        return response['group_episodes']

    @staticmethod
    def _get_playlist_data(data):
        """The playlist metadata is stored under the 'group' key."""
        return data['group']

    def _get_playlist_title(self, data):
        """Return the group's title, or None when absent."""
        return data.get('title')
1605
1606
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Episodes / broadcasts / clips listings of a bbc.co.uk programme."""

    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Format string that turns a video id into the programme URL of an entry
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    # Regex template (filled with the id pattern) locating video ids in the markup
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` taken from the page's Open Graph tags.

        The title lookup is explicitly non-fatal.
        """
        og_title = self._og_search_title(webpage, fatal=False)
        og_description = self._og_search_description(webpage)
        return og_title, og_description