yt_dlp/extractor/bbc.py

   1 import functools
   2 import itertools
   3 import json
   4 import re
   5 import urllib.error
   6 import xml.etree.ElementTree
   7
   8 from .common import InfoExtractor
   9 from ..compat import compat_HTTPError, compat_str, compat_urlparse
  10 from ..utils import (
  11     ExtractorError,
  12     OnDemandPagedList,
  13     clean_html,
  14     dict_get,
  15     float_or_none,
  16     get_element_by_class,
  17     int_or_none,
  18     js_to_json,
  19     parse_duration,
  20     parse_iso8601,
  21     parse_qs,
  22     strip_or_none,
  23     try_get,
  24     unescapeHTML,
  25     unified_timestamp,
  26     url_or_none,
  27     urlencode_postdata,
  28     urljoin,
  29 )
  30
  31
class BBCCoUkIE(InfoExtractor):
    # Extractor for programme pages on bbc.co.uk (programmes, iPlayer
    # episodes/playlists, music clips, radio player, Sounds and events pages,
    # as enumerated in _VALID_URL below).
    IE_NAME = 'bbc.co.uk'
    IE_DESC = 'BBC iPlayer'
    # Programme id: one of p/b/m/l followed by 7 characters of [0-9a-z],
    # or "w" followed by 7 to 14 such characters.
    _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
    # Verbose-mode pattern; the id must not be followed by an
    # /episodes, /broadcasts or /clips path segment.
    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?bbc\.co\.uk/
                        (?:
                            programmes/(?!articles/)|
                            iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
                            music/(?:clips|audiovideo/popular)[/#]|
                            radio/player/|
                            sounds/play/|
                            events/[^/]+/play/[^/]+/
                        )
                        (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
                    ''' % _ID_REGEX

    # Sign-in page fetched and posted to by _perform_login()
    _LOGIN_URL = 'https://account.bbc.com/signin'
    _NETRC_MACHINE = 'bbc'

    # Filled in with (media set, programme id) by _download_media_selector()
    _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
    # Media sets are tried in the order listed here
    _MEDIA_SETS = [
        # Provides HQ HLS streams of even better quality than the pc mediaset, but
        # fails with geolocation in some cases even when the programme is not geo
        # restricted at all (e.g. http://www.bbc.co.uk/programmes/b06bp7lf).
        # It may also fail with selectionunavailable.
        'iptv-all',
        'pc',
    ]

    # XML namespace used for element lookups in legacy (EMP) playlists
    _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
  63
    # Test cases for this extractor. Every entry carries a 'url'; entries either
    # list the expected metadata in 'info_dict' (optionally with 'params', a
    # 'note' and a 'skip' reason) or are flagged 'only_matching'.
    # NOTE(review): how these keys are interpreted is defined by the test
    # harness outside this file - confirm there before relying on them.
    _TESTS = [
        {
            'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
            'info_dict': {
                'id': 'b039d07m',
                'ext': 'flv',
                'title': 'Kaleidoscope, Leonard Cohen',
                'description': 'The Canadian poet and songwriter reflects on his musical career.',
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
            'info_dict': {
                'id': 'b00yng1d',
                'ext': 'flv',
                'title': 'The Man in Black: Series 3: The Printed Name',
                'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
                'duration': 1800,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Episode is no longer available on BBC iPlayer Radio',
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
            'info_dict': {
                'id': 'b00yng1d',
                'ext': 'flv',
                'title': 'The Voice UK: Series 3: Blind Auditions 5',
                'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
                'duration': 5100,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
            'info_dict': {
                'id': 'b03k3pb7',
                'ext': 'flv',
                'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
                'description': '2. Invasion',
                'duration': 3600,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
        }, {
            'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
            'info_dict': {
                'id': 'b04v209v',
                'ext': 'flv',
                'title': 'Pete Tong, The Essential New Tune Special',
                'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
                'duration': 10800,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Episode is no longer available on BBC iPlayer Radio',
        }, {
            'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
            'note': 'Audio',
            'info_dict': {
                'id': 'p022h44j',
                'ext': 'flv',
                'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
                'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
                'duration': 227,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        }, {
            'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
            'note': 'Video',
            'info_dict': {
                'id': 'p025c103',
                'ext': 'flv',
                'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
                'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
                'duration': 226,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
            'info_dict': {
                'id': 'p02n76xf',
                'ext': 'flv',
                'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
                'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
                'duration': 3540,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'geolocation',
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
            'info_dict': {
                'id': 'b05zmgw1',
                'ext': 'flv',
                'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
                'title': 'Royal Academy Summer Exhibition',
                'duration': 3540,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'geolocation',
        }, {
            # iptv-all mediaset fails with geolocation however there is no geo restriction
            # for this programme at all
            'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
            'info_dict': {
                'id': 'b06rkms3',
                'ext': 'flv',
                'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
                'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Now it\'s really geo-restricted',
        }, {
            # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
            'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
            'info_dict': {
                'id': 'p028bfkj',
                'ext': 'flv',
                'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
                'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
        }, {
            'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
            'note': 'Audio',
            'info_dict': {
                'id': 'm0007jz9',
                'ext': 'mp4',
                'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
                'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
                'duration': 9840,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
            'only_matching': True,
        }, {
            'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
            'only_matching': True,
        }, {
            'url': 'https://www.bbc.co.uk/programmes/m00005xn',
            'only_matching': True,
        }, {
            'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
            'only_matching': True,
        }]
 259
 260     def _perform_login(self, username, password):
 261         login_page = self._download_webpage(
 262             self._LOGIN_URL, None, 'Downloading signin page')
 263
 264         login_form = self._hidden_inputs(login_page)
 265
 266         login_form.update({
 267             'username': username,
 268             'password': password,
 269         })
 270
 271         post_url = urljoin(self._LOGIN_URL, self._search_regex(
 272             r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
 273             'post url', default=self._LOGIN_URL, group='url'))
 274
 275         response, urlh = self._download_webpage_handle(
 276             post_url, None, 'Logging in', data=urlencode_postdata(login_form),
 277             headers={'Referer': self._LOGIN_URL})
 278
 279         if self._LOGIN_URL in urlh.geturl():
 280             error = clean_html(get_element_by_class('form-message', response))
 281             if error:
 282                 raise ExtractorError(
 283                     'Unable to login: %s' % error, expected=True)
 284             raise ExtractorError('Unable to log in')
 285
    class MediaSelectionError(Exception):
        """Signals an error result in a media selection response."""

        def __init__(self, id):
            # `id` is the value of the response's 'result' field (see
            # _extract_medias); callers inspect it via the `id` attribute.
            self.id = id
 289
 290     def _extract_asx_playlist(self, connection, programme_id):
 291         asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
 292         return [ref.get('href') for ref in asx.findall('./Entry/ref')]
 293
 294     def _extract_items(self, playlist):
 295         return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
 296
 297     def _extract_medias(self, media_selection):
 298         error = media_selection.get('result')
 299         if error:
 300             raise BBCCoUkIE.MediaSelectionError(error)
 301         return media_selection.get('media') or []
 302
 303     def _extract_connections(self, media):
 304         return media.get('connection') or []
 305
 306     def _get_subtitles(self, media, programme_id):
 307         subtitles = {}
 308         for connection in self._extract_connections(media):
 309             cc_url = url_or_none(connection.get('href'))
 310             if not cc_url:
 311                 continue
 312             captions = self._download_xml(
 313                 cc_url, programme_id, 'Downloading captions', fatal=False)
 314             if not isinstance(captions, xml.etree.ElementTree.Element):
 315                 continue
 316             subtitles['en'] = [
 317                 {
 318                     'url': connection.get('href'),
 319                     'ext': 'ttml',
 320                 },
 321             ]
 322             break
 323         return subtitles
 324
 325     def _raise_extractor_error(self, media_selection_error):
 326         raise ExtractorError(
 327             '%s returned error: %s' % (self.IE_NAME, media_selection_error.id),
 328             expected=True)
 329
 330     def _download_media_selector(self, programme_id):
 331         last_exception = None
 332         for media_set in self._MEDIA_SETS:
 333             try:
 334                 return self._download_media_selector_url(
 335                     self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
 336             except BBCCoUkIE.MediaSelectionError as e:
 337                 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
 338                     last_exception = e
 339                     continue
 340                 self._raise_extractor_error(e)
 341         self._raise_extractor_error(last_exception)
 342
 343     def _download_media_selector_url(self, url, programme_id=None):
 344         media_selection = self._download_json(
 345             url, programme_id, 'Downloading media selection JSON',
 346             expected_status=(403, 404))
 347         return self._process_media_selector(media_selection, programme_id)
 348
 349     def _process_media_selector(self, media_selection, programme_id):
 350         formats = []
 351         subtitles = None
 352         urls = []
 353
 354         for media in self._extract_medias(media_selection):
 355             kind = media.get('kind')
 356             if kind in ('video', 'audio'):
 357                 bitrate = int_or_none(media.get('bitrate'))
 358                 encoding = media.get('encoding')
 359                 width = int_or_none(media.get('width'))
 360                 height = int_or_none(media.get('height'))
 361                 file_size = int_or_none(media.get('media_file_size'))
 362                 for connection in self._extract_connections(media):
 363                     href = connection.get('href')
 364                     if href in urls:
 365                         continue
 366                     if href:
 367                         urls.append(href)
 368                     conn_kind = connection.get('kind')
 369                     protocol = connection.get('protocol')
 370                     supplier = connection.get('supplier')
 371                     transfer_format = connection.get('transferFormat')
 372                     format_id = supplier or conn_kind or protocol
 373                     # ASX playlist
 374                     if supplier == 'asx':
 375                         for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
 376                             formats.append({
 377                                 'url': ref,
 378                                 'format_id': 'ref%s_%s' % (i, format_id),
 379                             })
 380                     elif transfer_format == 'dash':
 381                         formats.extend(self._extract_mpd_formats(
 382                             href, programme_id, mpd_id=format_id, fatal=False))
 383                     elif transfer_format == 'hls':
 384                         # TODO: let expected_status be passed into _extract_xxx_formats() instead
 385                         try:
 386                             fmts = self._extract_m3u8_formats(
 387                                 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
 388                                 m3u8_id=format_id, fatal=False)
 389                         except ExtractorError as e:
 390                             if not (isinstance(e.exc_info[1], urllib.error.HTTPError)
 391                                     and e.exc_info[1].code in (403, 404)):
 392                                 raise
 393                             fmts = []
 394                         formats.extend(fmts)
 395                     elif transfer_format == 'hds':
 396                         formats.extend(self._extract_f4m_formats(
 397                             href, programme_id, f4m_id=format_id, fatal=False))
 398                     else:
 399                         if not supplier and bitrate:
 400                             format_id += '-%d' % bitrate
 401                         fmt = {
 402                             'format_id': format_id,
 403                             'filesize': file_size,
 404                         }
 405                         if kind == 'video':
 406                             fmt.update({
 407                                 'width': width,
 408                                 'height': height,
 409                                 'tbr': bitrate,
 410                                 'vcodec': encoding,
 411                             })
 412                         else:
 413                             fmt.update({
 414                                 'abr': bitrate,
 415                                 'acodec': encoding,
 416                                 'vcodec': 'none',
 417                             })
 418                         if protocol in ('http', 'https'):
 419                             # Direct link
 420                             fmt.update({
 421                                 'url': href,
 422                             })
 423                         elif protocol == 'rtmp':
 424                             application = connection.get('application', 'ondemand')
 425                             auth_string = connection.get('authString')
 426                             identifier = connection.get('identifier')
 427                             server = connection.get('server')
 428                             fmt.update({
 429                                 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
 430                                 'play_path': identifier,
 431                                 'app': '%s?%s' % (application, auth_string),
 432                                 'page_url': 'http://www.bbc.co.uk',
 433                                 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
 434                                 'rtmp_live': False,
 435                                 'ext': 'flv',
 436                             })
 437                         else:
 438                             continue
 439                         formats.append(fmt)
 440             elif kind == 'captions':
 441                 subtitles = self.extract_subtitles(media, programme_id)
 442         return formats, subtitles
 443
 444     def _download_playlist(self, playlist_id):
 445         try:
 446             playlist = self._download_json(
 447                 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
 448                 playlist_id, 'Downloading playlist JSON')
 449             formats = []
 450             subtitles = {}
 451
 452             for version in playlist.get('allAvailableVersions', []):
 453                 smp_config = version['smpConfig']
 454                 title = smp_config['title']
 455                 description = smp_config['summary']
 456                 for item in smp_config['items']:
 457                     kind = item['kind']
 458                     if kind not in ('programme', 'radioProgramme'):
 459                         continue
 460                     programme_id = item.get('vpid')
 461                     duration = int_or_none(item.get('duration'))
 462                     version_formats, version_subtitles = self._download_media_selector(programme_id)
 463                     types = version['types']
 464                     for f in version_formats:
 465                         f['format_note'] = ', '.join(types)
 466                         if any('AudioDescribed' in x for x in types):
 467                             f['language_preference'] = -10
 468                     formats += version_formats
 469                     for tag, subformats in (version_subtitles or {}).items():
 470                         subtitles.setdefault(tag, []).extend(subformats)
 471
 472             return programme_id, title, description, duration, formats, subtitles
 473         except ExtractorError as ee:
 474             if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
 475                 raise
 476
 477         # fallback to legacy playlist
 478         return self._process_legacy_playlist(playlist_id)
 479
 480     def _process_legacy_playlist_url(self, url, display_id):
 481         playlist = self._download_legacy_playlist_url(url, display_id)
 482         return self._extract_from_legacy_playlist(playlist, display_id)
 483
 484     def _process_legacy_playlist(self, playlist_id):
 485         return self._process_legacy_playlist_url(
 486             'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
 487
 488     def _download_legacy_playlist_url(self, url, playlist_id=None):
 489         return self._download_xml(
 490             url, playlist_id, 'Downloading legacy playlist XML')
 491
 492     def _extract_from_legacy_playlist(self, playlist, playlist_id):
 493         no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
 494         if no_items is not None:
 495             reason = no_items.get('reason')
 496             if reason == 'preAvailability':
 497                 msg = 'Episode %s is not yet available' % playlist_id
 498             elif reason == 'postAvailability':
 499                 msg = 'Episode %s is no longer available' % playlist_id
 500             elif reason == 'noMedia':
 501                 msg = 'Episode %s is not currently available' % playlist_id
 502             else:
 503                 msg = 'Episode %s is not available: %s' % (playlist_id, reason)
 504             raise ExtractorError(msg, expected=True)
 505
 506         for item in self._extract_items(playlist):
 507             kind = item.get('kind')
 508             if kind not in ('programme', 'radioProgramme'):
 509                 continue
 510             title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
 511             description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
 512             description = description_el.text if description_el is not None else None
 513
 514             def get_programme_id(item):
 515                 def get_from_attributes(item):
 516                     for p in ('identifier', 'group'):
 517                         value = item.get(p)
 518                         if value and re.match(r'^[pb][\da-z]{7}$', value):
 519                             return value
 520                 get_from_attributes(item)
 521                 mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
 522                 if mediator is not None:
 523                     return get_from_attributes(mediator)
 524
 525             programme_id = get_programme_id(item)
 526             duration = int_or_none(item.get('duration'))
 527
 528             if programme_id:
 529                 formats, subtitles = self._download_media_selector(programme_id)
 530             else:
 531                 formats, subtitles = self._process_media_selector(item, playlist_id)
 532                 programme_id = playlist_id
 533
 534         return programme_id, title, description, duration, formats, subtitles
 535
 536     def _real_extract(self, url):
 537         group_id = self._match_id(url)
 538
 539         webpage = self._download_webpage(url, group_id, 'Downloading video page')
 540
 541         error = self._search_regex(
 542             r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
 543             webpage, 'error', default=None)
 544         if error:
 545             raise ExtractorError(error, expected=True)
 546
 547         programme_id = None
 548         duration = None
 549
 550         tviplayer = self._search_regex(
 551             r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
 552             webpage, 'player', default=None)
 553
 554         if tviplayer:
 555             player = self._parse_json(tviplayer, group_id).get('player', {})
 556             duration = int_or_none(player.get('duration'))
 557             programme_id = player.get('vpid')
 558
 559         if not programme_id:
 560             programme_id = self._search_regex(
 561                 r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
 562
 563         if programme_id:
 564             formats, subtitles = self._download_media_selector(programme_id)
 565             title = self._og_search_title(webpage, default=None) or self._html_search_regex(
 566                 (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
 567                  r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
 568             description = self._search_regex(
 569                 (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
 570                  r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
 571                 webpage, 'description', default=None)
 572             if not description:
 573                 description = self._html_search_meta('description', webpage)
 574         else:
 575             programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
 576
 577         self._sort_formats(formats)
 578
 579         return {
 580             'id': programme_id,
 581             'title': title,
 582             'description': description,
 583             'thumbnail': self._og_search_thumbnail(webpage, default=None),
 584             'duration': duration,
 585             'formats': formats,
 586             'subtitles': subtitles,
 587         }
 588
 589
 590 class BBCIE(BBCCoUkIE):
 591     IE_NAME = 'bbc'
 592     IE_DESC = 'BBC'
 593     _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
 594
 595     _MEDIA_SETS = [
 596         'pc',
 597         'mobile-tablet-main',
 598     ]
 599
 600     _TESTS = [{
 601         # article with multiple videos embedded with data-playable containing vpids
 602         'url': 'http://www.bbc.com/news/world-europe-32668511',
 603         'info_dict': {
 604             'id': 'world-europe-32668511',
 605             'title': 'Russia stages massive WW2 parade',
 606             'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
 607         },
 608         'playlist_count': 2,
 609     }, {
 610         # article with multiple videos embedded with data-playable (more videos)
 611         'url': 'http://www.bbc.com/news/business-28299555',
 612         'info_dict': {
 613             'id': 'business-28299555',
 614             'title': 'Farnborough Airshow: Video highlights',
 615             'description': 'BBC reports and video highlights at the Farnborough Airshow.',
 616         },
 617         'playlist_count': 9,
 618         'skip': 'Save time',
 619     }, {
 620         # article with multiple videos embedded with `new SMP()`
 621         # broken
 622         'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
 623         'info_dict': {
 624             'id': '3662a707-0af9-3149-963f-47bea720b460',
 625             'title': 'BUGGER',
 626         },
 627         'playlist_count': 18,
 628     }, {
 629         # single video embedded with data-playable containing vpid
 630         'url': 'http://www.bbc.com/news/world-europe-32041533',
 631         'info_dict': {
 632             'id': 'p02mprgb',
 633             'ext': 'mp4',
 634             'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
 635             'description': 'md5:2868290467291b37feda7863f7a83f54',
 636             'duration': 47,
 637             'timestamp': 1427219242,
 638             'upload_date': '20150324',
 639         },
 640         'params': {
 641             # rtmp download
 642             'skip_download': True,
 643         }
 644     }, {
 645         # article with single video embedded with data-playable containing XML playlist
 646         # with direct video links as progressiveDownloadUrl (for now these are extracted)
 647         # and playlist with f4m and m3u8 as streamingUrl
 648         'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
 649         'info_dict': {
 650             'id': '150615_telabyad_kentin_cogu',
 651             'ext': 'mp4',
 652             'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
 653             'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
 654             'timestamp': 1434397334,
 655             'upload_date': '20150615',
 656         },
 657         'params': {
 658             'skip_download': True,
 659         }
 660     }, {
 661         # single video embedded with data-playable containing XML playlists (regional section)
 662         'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
 663         'info_dict': {
 664             'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
 665             'ext': 'mp4',
 666             'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
 667             'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
 668             'timestamp': 1434713142,
 669             'upload_date': '20150619',
 670         },
 671         'params': {
 672             'skip_download': True,
 673         }
 674     }, {
 675         # single video from video playlist embedded with vxp-playlist-data JSON
 676         'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
 677         'info_dict': {
 678             'id': 'p02w6qjc',
 679             'ext': 'mp4',
 680             'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
 681             'duration': 56,
 682             'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
 683         },
 684         'params': {
 685             'skip_download': True,
 686         }
 687     }, {
 688         # single video story with digitalData
 689         'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
 690         'info_dict': {
 691             'id': 'p02q6gc4',
 692             'ext': 'flv',
 693             'title': 'Sri Lanka’s spicy secret',
 694             'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
 695             'timestamp': 1437674293,
 696             'upload_date': '20150723',
 697         },
 698         'params': {
 699             # rtmp download
 700             'skip_download': True,
 701         }
 702     }, {
 703         # single video story without digitalData
 704         'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
 705         'info_dict': {
 706             'id': 'p018zqqg',
 707             'ext': 'mp4',
 708             'title': 'Hyundai Santa Fe Sport: Rock star',
 709             'description': 'md5:b042a26142c4154a6e472933cf20793d',
 710             'timestamp': 1415867444,
 711             'upload_date': '20141113',
 712         },
 713         'params': {
 714             # rtmp download
 715             'skip_download': True,
 716         }
 717     }, {
 718         # single video embedded with Morph
 719         'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
 720         'info_dict': {
 721             'id': 'p041vhd0',
 722             'ext': 'mp4',
 723             'title': "Nigeria v Japan - Men's First Round",
 724             'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
 725             'duration': 7980,
 726             'uploader': 'BBC Sport',
 727             'uploader_id': 'bbc_sport',
 728         },
 729         'params': {
 730             # m3u8 download
 731             'skip_download': True,
 732         },
 733         'skip': 'Georestricted to UK',
 734     }, {
 735         # single video with playlist.sxml URL in playlist param
 736         'url': 'http://www.bbc.com/sport/0/football/33653409',
 737         'info_dict': {
 738             'id': 'p02xycnp',
 739             'ext': 'mp4',
 740             'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
 741             'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
 742             'duration': 140,
 743         },
 744         'params': {
 745             # rtmp download
 746             'skip_download': True,
 747         }
 748     }, {
 749         # article with multiple videos embedded with playlist.sxml in playlist param
 750         'url': 'http://www.bbc.com/sport/0/football/34475836',
 751         'info_dict': {
 752             'id': '34475836',
 753             'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
 754             'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
 755         },
 756         'playlist_count': 3,
 757     }, {
 758         # school report article with single video
 759         'url': 'http://www.bbc.co.uk/schoolreport/35744779',
 760         'info_dict': {
 761             'id': '35744779',
 762             'title': 'School which breaks down barriers in Jerusalem',
 763         },
 764         'playlist_count': 1,
 765     }, {
 766         # single video with playlist URL from weather section
 767         'url': 'http://www.bbc.com/weather/features/33601775',
 768         'only_matching': True,
 769     }, {
 770         # custom redirection to www.bbc.com
 771         # also, video with window.__INITIAL_DATA__
 772         'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
 773         'info_dict': {
 774             'id': 'p02xzws1',
 775             'ext': 'mp4',
 776             'title': "Pluto may have 'nitrogen glaciers'",
 777             'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
 778             'thumbnail': r're:https?://.+/.+\.jpg',
 779             'timestamp': 1437785037,
 780             'upload_date': '20150725',
 781         },
 782     }, {
 783         # video with window.__INITIAL_DATA__ and value as JSON string
 784         'url': 'https://www.bbc.com/news/av/world-europe-59468682',
 785         'info_dict': {
 786             'id': 'p0b71qth',
 787             'ext': 'mp4',
 788             'title': 'Why France is making this woman a national hero',
 789             'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
 790             'thumbnail': r're:https?://.+/.+\.jpg',
 791             'timestamp': 1638230731,
 792             'upload_date': '20211130',
 793         },
 794     }, {
 795         # single video article embedded with data-media-vpid
 796         'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
 797         'only_matching': True,
 798     }, {
 799         # bbcthreeConfig
 800         'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
 801         'info_dict': {
 802             'id': 'p06556y7',
 803             'ext': 'mp4',
 804             'title': 'Things Not To Say to people that live on council estates',
 805             'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
 806             'duration': 360,
 807             'thumbnail': r're:https?://.+/.+\.jpg',
 808         },
 809     }, {
 810         # window.__PRELOADED_STATE__
 811         'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
 812         'info_dict': {
 813             'id': 'b0b9z4vz',
 814             'ext': 'mp4',
 815             'title': 'Prom 6: An American in Paris and Turangalila',
 816             'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
 817             'uploader': 'Radio 3',
 818             'uploader_id': 'bbc_radio_three',
 819         },
 820     }, {
 821         'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
 822         'info_dict': {
 823             'id': 'p06w9tws',
 824             'ext': 'mp4',
 825             'title': 'md5:2fabf12a726603193a2879a055f72514',
 826             'description': 'Learn English words and phrases from this story',
 827         },
 828         'add_ie': [BBCCoUkIE.ie_key()],
 829     }, {
 830         # BBC Reel
 831         'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
 832         'info_dict': {
 833             'id': 'p07c6sb9',
 834             'ext': 'mp4',
 835             'title': 'How positive thinking is harming your happiness',
 836             'alt_title': 'The downsides of positive thinking',
 837             'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
 838             'duration': 235,
 839             'thumbnail': r're:https?://.+/p07c9dsr.jpg',
 840             'upload_date': '20190604',
 841             'categories': ['Psychology'],
 842         },
 843     }]
 844
 845     @classmethod
 846     def suitable(cls, url):
 847         EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
 848         return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
 849                 else super(BBCIE, cls).suitable(url))
 850
 851     def _extract_from_media_meta(self, media_meta, video_id):
 852         # Direct links to media in media metadata (e.g.
 853         # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
 854         # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
 855         source_files = media_meta.get('sourceFiles')
 856         if source_files:
 857             return [{
 858                 'url': f['url'],
 859                 'format_id': format_id,
 860                 'ext': f.get('encoding'),
 861                 'tbr': float_or_none(f.get('bitrate'), 1000),
 862                 'filesize': int_or_none(f.get('filesize')),
 863             } for format_id, f in source_files.items() if f.get('url')], []
 864
 865         programme_id = media_meta.get('externalId')
 866         if programme_id:
 867             return self._download_media_selector(programme_id)
 868
 869         # Process playlist.sxml as legacy playlist
 870         href = media_meta.get('href')
 871         if href:
 872             playlist = self._download_legacy_playlist_url(href)
 873             _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
 874             return formats, subtitles
 875
 876         return [], []
 877
 878     def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
 879         programme_id, title, description, duration, formats, subtitles = \
 880             self._process_legacy_playlist_url(url, playlist_id)
 881         self._sort_formats(formats)
 882         return {
 883             'id': programme_id,
 884             'title': title,
 885             'description': description,
 886             'duration': duration,
 887             'timestamp': timestamp,
 888             'formats': formats,
 889             'subtitles': subtitles,
 890         }
 891
 892     def _real_extract(self, url):
 893         playlist_id = self._match_id(url)
 894
 895         webpage = self._download_webpage(url, playlist_id)
 896
 897         json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
 898         timestamp = json_ld_info.get('timestamp')
 899
 900         playlist_title = json_ld_info.get('title')
 901         if not playlist_title:
 902             playlist_title = (self._og_search_title(webpage, default=None)
 903                               or self._html_extract_title(webpage, 'playlist title', default=None))
 904             if playlist_title:
 905                 playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
 906
 907         playlist_description = json_ld_info.get(
 908             'description') or self._og_search_description(webpage, default=None)
 909
 910         if not timestamp:
 911             timestamp = parse_iso8601(self._search_regex(
 912                 [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
 913                  r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
 914                  r'"datePublished":\s*"([^"]+)'],
 915                 webpage, 'date', default=None))
 916
 917         entries = []
 918
 919         # article with multiple videos embedded with playlist.sxml (e.g.
 920         # http://www.bbc.com/sport/0/football/34475836)
 921         playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
 922         playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
 923         if playlists:
 924             entries = [
 925                 self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
 926                 for playlist_url in playlists]
 927
 928         # news article with multiple videos embedded with data-playable
 929         data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
 930         if data_playables:
 931             for _, data_playable_json in data_playables:
 932                 data_playable = self._parse_json(
 933                     unescapeHTML(data_playable_json), playlist_id, fatal=False)
 934                 if not data_playable:
 935                     continue
 936                 settings = data_playable.get('settings', {})
 937                 if settings:
 938                     # data-playable with video vpid in settings.playlistObject.items (e.g.
 939                     # http://www.bbc.com/news/world-us-canada-34473351)
 940                     playlist_object = settings.get('playlistObject', {})
 941                     if playlist_object:
 942                         items = playlist_object.get('items')
 943                         if items and isinstance(items, list):
 944                             title = playlist_object['title']
 945                             description = playlist_object.get('summary')
 946                             duration = int_or_none(items[0].get('duration'))
 947                             programme_id = items[0].get('vpid')
 948                             formats, subtitles = self._download_media_selector(programme_id)
 949                             self._sort_formats(formats)
 950                             entries.append({
 951                                 'id': programme_id,
 952                                 'title': title,
 953                                 'description': description,
 954                                 'timestamp': timestamp,
 955                                 'duration': duration,
 956                                 'formats': formats,
 957                                 'subtitles': subtitles,
 958                             })
 959                     else:
 960                         # data-playable without vpid but with a playlist.sxml URLs
 961                         # in otherSettings.playlist (e.g.
 962                         # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
 963                         playlist = data_playable.get('otherSettings', {}).get('playlist', {})
 964                         if playlist:
 965                             entry = None
 966                             for key in ('streaming', 'progressiveDownload'):
 967                                 playlist_url = playlist.get('%sUrl' % key)
 968                                 if not playlist_url:
 969                                     continue
 970                                 try:
 971                                     info = self._extract_from_playlist_sxml(
 972                                         playlist_url, playlist_id, timestamp)
 973                                     if not entry:
 974                                         entry = info
 975                                     else:
 976                                         entry['title'] = info['title']
 977                                         entry['formats'].extend(info['formats'])
 978                                 except ExtractorError as e:
 979                                     # Some playlist URL may fail with 500, at the same time
 980                                     # the other one may work fine (e.g.
 981                                     # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
 982                                     if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
 983                                         continue
 984                                     raise
 985                             if entry:
 986                                 self._sort_formats(entry['formats'])
 987                                 entries.append(entry)
 988
 989         if entries:
 990             return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
 991
 992         # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
 993         group_id = self._search_regex(
 994             r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
 995             webpage, 'group id', default=None)
 996         if group_id:
 997             return self.url_result(
 998                 'https://www.bbc.co.uk/programmes/%s' % group_id,
 999                 ie=BBCCoUkIE.ie_key())
1000
1001         # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
1002         programme_id = self._search_regex(
1003             [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
1004              r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
1005              r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
1006             webpage, 'vpid', default=None)
1007
1008         if programme_id:
1009             formats, subtitles = self._download_media_selector(programme_id)
1010             self._sort_formats(formats)
1011             # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
1012             digital_data = self._parse_json(
1013                 self._search_regex(
1014                     r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
1015                 programme_id, fatal=False)
1016             page_info = digital_data.get('page', {}).get('pageInfo', {})
1017             title = page_info.get('pageName') or self._og_search_title(webpage)
1018             description = page_info.get('description') or self._og_search_description(webpage)
1019             timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
1020             return {
1021                 'id': programme_id,
1022                 'title': title,
1023                 'description': description,
1024                 'timestamp': timestamp,
1025                 'formats': formats,
1026                 'subtitles': subtitles,
1027             }
1028
1029         # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
1030         initial_data = self._parse_json(self._html_search_regex(
1031             r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
1032             webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
1033         if initial_data:
1034             init_data = try_get(
1035                 initial_data, lambda x: x['initData']['items'][0], dict) or {}
1036             smp_data = init_data.get('smpData') or {}
1037             clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
1038             version_id = clip_data.get('versionID')
1039             if version_id:
1040                 title = smp_data['title']
1041                 formats, subtitles = self._download_media_selector(version_id)
1042                 self._sort_formats(formats)
1043                 image_url = smp_data.get('holdingImageURL')
1044                 display_date = init_data.get('displayDate')
1045                 topic_title = init_data.get('topicTitle')
1046
1047                 return {
1048                     'id': version_id,
1049                     'title': title,
1050                     'formats': formats,
1051                     'alt_title': init_data.get('shortTitle'),
1052                     'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
1053                     'description': smp_data.get('summary') or init_data.get('shortSummary'),
1054                     'upload_date': display_date.replace('-', '') if display_date else None,
1055                     'subtitles': subtitles,
1056                     'duration': int_or_none(clip_data.get('duration')),
1057                     'categories': [topic_title] if topic_title else None,
1058                 }
1059
1060         # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
1061         # There are several setPayload calls may be present but the video
1062         # seems to be always related to the first one
1063         morph_payload = self._parse_json(
1064             self._search_regex(
1065                 r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
1066                 webpage, 'morph payload', default='{}'),
1067             playlist_id, fatal=False)
1068         if morph_payload:
1069             components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
1070             for component in components:
1071                 if not isinstance(component, dict):
1072                     continue
1073                 lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
1074                 if not lead_media:
1075                     continue
1076                 identifiers = lead_media.get('identifiers')
1077                 if not identifiers or not isinstance(identifiers, dict):
1078                     continue
1079                 programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
1080                 if not programme_id:
1081                     continue
1082                 title = lead_media.get('title') or self._og_search_title(webpage)
1083                 formats, subtitles = self._download_media_selector(programme_id)
1084                 self._sort_formats(formats)
1085                 description = lead_media.get('summary')
1086                 uploader = lead_media.get('masterBrand')
1087                 uploader_id = lead_media.get('mid')
1088                 duration = None
1089                 duration_d = lead_media.get('duration')
1090                 if isinstance(duration_d, dict):
1091                     duration = parse_duration(dict_get(
1092                         duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
1093                 return {
1094                     'id': programme_id,
1095                     'title': title,
1096                     'description': description,
1097                     'duration': duration,
1098                     'uploader': uploader,
1099                     'uploader_id': uploader_id,
1100                     'formats': formats,
1101                     'subtitles': subtitles,
1102                 }
1103
1104         preload_state = self._parse_json(self._search_regex(
1105             r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
1106             'preload state', default='{}'), playlist_id, fatal=False)
1107         if preload_state:
1108             current_programme = preload_state.get('programmes', {}).get('current') or {}
1109             programme_id = current_programme.get('id')
1110             if current_programme and programme_id and current_programme.get('type') == 'playable_item':
1111                 title = current_programme.get('titles', {}).get('tertiary') or playlist_title
1112                 formats, subtitles = self._download_media_selector(programme_id)
1113                 self._sort_formats(formats)
1114                 synopses = current_programme.get('synopses') or {}
1115                 network = current_programme.get('network') or {}
1116                 duration = int_or_none(
1117                     current_programme.get('duration', {}).get('value'))
1118                 thumbnail = None
1119                 image_url = current_programme.get('image_url')
1120                 if image_url:
1121                     thumbnail = image_url.replace('{recipe}', 'raw')
1122                 return {
1123                     'id': programme_id,
1124                     'title': title,
1125                     'description': dict_get(synopses, ('long', 'medium', 'short')),
1126                     'thumbnail': thumbnail,
1127                     'duration': duration,
1128                     'uploader': network.get('short_title'),
1129                     'uploader_id': network.get('id'),
1130                     'formats': formats,
1131                     'subtitles': subtitles,
1132                 }
1133
1134         bbc3_config = self._parse_json(
1135             self._search_regex(
1136                 r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
1137                 'bbcthree config', default='{}'),
1138             playlist_id, transform_source=js_to_json, fatal=False) or {}
1139         payload = bbc3_config.get('payload') or {}
1140         if payload:
1141             clip = payload.get('currentClip') or {}
1142             clip_vpid = clip.get('vpid')
1143             clip_title = clip.get('title')
1144             if clip_vpid and clip_title:
1145                 formats, subtitles = self._download_media_selector(clip_vpid)
1146                 self._sort_formats(formats)
1147                 return {
1148                     'id': clip_vpid,
1149                     'title': clip_title,
1150                     'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
1151                     'description': clip.get('description'),
1152                     'duration': parse_duration(clip.get('duration')),
1153                     'formats': formats,
1154                     'subtitles': subtitles,
1155                 }
1156             bbc3_playlist = try_get(
1157                 payload, lambda x: x['content']['bbcMedia']['playlist'],
1158                 dict)
1159             if bbc3_playlist:
1160                 playlist_title = bbc3_playlist.get('title') or playlist_title
1161                 thumbnail = bbc3_playlist.get('holdingImageURL')
1162                 entries = []
1163                 for bbc3_item in bbc3_playlist['items']:
1164                     programme_id = bbc3_item.get('versionID')
1165                     if not programme_id:
1166                         continue
1167                     formats, subtitles = self._download_media_selector(programme_id)
1168                     self._sort_formats(formats)
1169                     entries.append({
1170                         'id': programme_id,
1171                         'title': playlist_title,
1172                         'thumbnail': thumbnail,
1173                         'timestamp': timestamp,
1174                         'formats': formats,
1175                         'subtitles': subtitles,
1176                     })
1177                 return self.playlist_result(
1178                     entries, playlist_id, playlist_title, playlist_description)
1179
1180         initial_data = self._search_regex(
1181             r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
1182             'quoted preload state', default=None)
1183         if initial_data is None:
1184             initial_data = self._search_regex(
1185                 r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
1186                 'preload state', default={})
1187         else:
1188             initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
1189         initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
1190         if initial_data:
1191             def parse_media(media):
1192                 if not media:
1193                     return
1194                 for item in (try_get(media, lambda x: x['media']['items'], list) or []):
1195                     item_id = item.get('id')
1196                     item_title = item.get('title')
1197                     if not (item_id and item_title):
1198                         continue
1199                     formats, subtitles = self._download_media_selector(item_id)
1200                     self._sort_formats(formats)
1201                     item_desc = None
1202                     blocks = try_get(media, lambda x: x['summary']['blocks'], list)
1203                     if blocks:
1204                         summary = []
1205                         for block in blocks:
1206                             text = try_get(block, lambda x: x['model']['text'], compat_str)
1207                             if text:
1208                                 summary.append(text)
1209                         if summary:
1210                             item_desc = '\n\n'.join(summary)
1211                     item_time = None
1212                     for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
1213                         if try_get(meta, lambda x: x['label']) == 'Published':
1214                             item_time = unified_timestamp(meta.get('timestamp'))
1215                             break
1216                     entries.append({
1217                         'id': item_id,
1218                         'title': item_title,
1219                         'thumbnail': item.get('holdingImageUrl'),
1220                         'formats': formats,
1221                         'subtitles': subtitles,
1222                         'timestamp': item_time,
1223                         'description': strip_or_none(item_desc),
1224                     })
1225             for resp in (initial_data.get('data') or {}).values():
1226                 name = resp.get('name')
1227                 if name == 'media-experience':
1228                     parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
1229                 elif name == 'article':
1230                     for block in (try_get(resp,
1231                                           (lambda x: x['data']['blocks'],
1232                                            lambda x: x['data']['content']['model']['blocks'],),
1233                                           list) or []):
1234                         if block.get('type') != 'media':
1235                             continue
1236                         parse_media(block.get('model'))
1237             return self.playlist_result(
1238                 entries, playlist_id, playlist_title, playlist_description)
1239
1240         def extract_all(pattern):
1241             return list(filter(None, map(
1242                 lambda s: self._parse_json(s, playlist_id, fatal=False),
1243                 re.findall(pattern, webpage))))
1244
1245         # Multiple video article (e.g.
1246         # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
1247         EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
1248         entries = []
1249         for match in extract_all(r'new\s+SMP\(({.+?})\)'):
1250             embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
1251             if embed_url and re.match(EMBED_URL, embed_url):
1252                 entries.append(embed_url)
1253         entries.extend(re.findall(
1254             r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
1255         if entries:
1256             return self.playlist_result(
1257                 [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
1258                 playlist_id, playlist_title, playlist_description)
1259
1260         # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
1261         medias = extract_all(r"data-media-meta='({[^']+})'")
1262
1263         if not medias:
1264             # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
1265             media_asset = self._search_regex(
1266                 r'mediaAssetPage\.init\(\s*({.+?}), "/',
1267                 webpage, 'media asset', default=None)
1268             if media_asset:
1269                 media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
1270                 medias = []
1271                 for video in media_asset_page.get('videos', {}).values():
1272                     medias.extend(video.values())
1273
1274         if not medias:
1275             # Multiple video playlist with single `now playing` entry (e.g.
1276             # http://www.bbc.com/news/video_and_audio/must_see/33767813)
1277             vxp_playlist = self._parse_json(
1278                 self._search_regex(
1279                     r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
1280                     webpage, 'playlist data'),
1281                 playlist_id)
1282             playlist_medias = []
1283             for item in vxp_playlist:
1284                 media = item.get('media')
1285                 if not media:
1286                     continue
1287                 playlist_medias.append(media)
1288                 # Download single video if found media with asset id matching the video id from URL
1289                 if item.get('advert', {}).get('assetId') == playlist_id:
1290                     medias = [media]
1291                     break
1292             # Fallback to the whole playlist
1293             if not medias:
1294                 medias = playlist_medias
1295
1296         entries = []
1297         for num, media_meta in enumerate(medias, start=1):
1298             formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
1299             if not formats and not self.get_param('ignore_no_formats'):
1300                 continue
1301             self._sort_formats(formats)
1302
1303             video_id = media_meta.get('externalId')
1304             if not video_id:
1305                 video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
1306
1307             title = media_meta.get('caption')
1308             if not title:
1309                 title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
1310
1311             duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
1312
1313             images = []
1314             for image in media_meta.get('images', {}).values():
1315                 images.extend(image.values())
1316             if 'image' in media_meta:
1317                 images.append(media_meta['image'])
1318
1319             thumbnails = [{
1320                 'url': image.get('href'),
1321                 'width': int_or_none(image.get('width')),
1322                 'height': int_or_none(image.get('height')),
1323             } for image in images]
1324
1325             entries.append({
1326                 'id': video_id,
1327                 'title': title,
1328                 'thumbnails': thumbnails,
1329                 'duration': duration,
1330                 'timestamp': timestamp,
1331                 'formats': formats,
1332                 'subtitles': subtitles,
1333             })
1334
1335         return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
1336
1337
class BBCCoUkArticleIE(InfoExtractor):
    # Extractor for bbc.co.uk programme article pages; the result is a
    # playlist with one URL entry per clip referenced in the article markup.
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Build a playlist of the clips embedded in the article at *url*.

        The playlist id is the article id matched from the URL; title and
        description come from the page's Open Graph metadata.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)

        # og:description is optional metadata and the lookup may come back
        # empty; only strip when there is something to strip so that a page
        # without a description does not abort the whole extraction.
        description = self._og_search_description(webpage)
        if description:
            description = description.strip()

        # Every <div typeof="Clip" resource="..."> points at a programme page.
        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
1366
1367
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    # Shared logic for playlists that are paginated in HTML. This class reads
    # self._VIDEO_ID_TEMPLATE, self._URL_TEMPLATE and calls
    # self._extract_title_and_description() without defining them, so
    # subclasses are expected to provide all three.

    def _entries(self, webpage, url, playlist_id):
        """Yield a url_result for every video id found on the playlist pages.

        Processing starts with the already downloaded *webpage*. If *url*
        carries an explicit ``page`` query parameter only that page is
        handled; otherwise the "next" pagination link is followed until no
        such link is found.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # The first page was fetched by the caller, hence follow-up downloads
        # are numbered from 2.
        for page_num in itertools.count(2):
            for video_id in re.findall(
                    self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # The fourth positional parameter of _download_webpage is the
            # error note, so pass a readable message by keyword rather than
            # the bare page number.
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                note='Downloading page %d' % page_num,
                errnote='Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Download the playlist page and return a lazily evaluated playlist."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
1398
1399
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
    # Common machinery for iPlayer playlists backed by a JSON API. The class
    # relies on attributes/methods it does not define itself (_PAGE_SIZE,
    # _DESCRIPTION_KEY, _call_api, _get_elements, _get_episode,
    # _get_episode_image, _get_episode_field, _get_playlist_data,
    # _get_playlist_title); subclasses are expected to supply them.
    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    @staticmethod
    def _get_default(episode, key, default_key='default'):
        """Look up ``episode[key][default_key]`` through try_get."""
        def nested_lookup(obj):
            return obj[key][default_key]

        return try_get(episode, nested_lookup)

    def _get_description(self, data):
        """Pick a synopsis from *data*, preferring large over medium over small."""
        synopses = data.get(self._DESCRIPTION_KEY)
        if not synopses:
            synopses = {}
        return dict_get(synopses, ('large', 'medium', 'small'))

    def _fetch_page(self, programme_id, per_page, series_id, page):
        """Yield url-type entries for one API page.

        *page* is zero-based here while the API call is made with ``page + 1``.
        Elements without an episode id are skipped.
        """
        api_data = self._call_api(programme_id, per_page, page + 1, series_id)
        for element in self._get_elements(api_data):
            episode = self._get_episode(element)
            episode_id = episode.get('id')
            if not episode_id:
                continue
            image = self._get_episode_image(episode)
            # The image URL carries a '{recipe}' placeholder; 'raw' is
            # substituted for it.
            thumbnail = image.replace('{recipe}', 'raw') if image else None
            category = self._get_default(episode, 'labels', 'category')
            yield {
                '_type': 'url',
                'id': episode_id,
                'title': self._get_episode_field(episode, 'subtitle'),
                'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                'thumbnail': thumbnail,
                'description': self._get_description(episode),
                'categories': [category] if category else None,
                'series': self._get_episode_field(episode, 'title'),
                'ie_key': BBCCoUkIE.ie_key(),
            }

    def _real_extract(self, url):
        """Return the playlist for *url*, honouring ``seriesId`` and ``page``."""
        pid = self._match_id(url)
        query = parse_qs(url)
        series_id = query.get('seriesId', [None])[0]
        page = query.get('page', [None])[0]
        if page:
            # Explicit page requested: fetch just that one page.
            # NOTE(review): 36 presumably mirrors the website's own page size
            # - confirm.
            entries = self._fetch_page(pid, 36, series_id, int(page) - 1)
        else:
            # No page given: expose all pages lazily.
            fetch_page = functools.partial(
                self._fetch_page, pid, self._PAGE_SIZE, series_id)
            entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)
        # A minimal request (one item per page) is used for playlist metadata.
        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
        playlist_title = self._get_playlist_title(playlist_data)
        playlist_description = self._get_description(playlist_data)
        return self.playlist_result(
            entries, pid, playlist_title, playlist_description)
1448
1449
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
    # iPlayer "episodes" pages, served through the graph.ibl.api.bbc.co.uk
    # endpoint.
    IE_NAME = 'bbc.co.uk:iplayer:episodes'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
    _PAGE_SIZE = 100
    _DESCRIPTION_KEY = 'synopsis'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
        },
        'playlist_mincount': 8,
    }, {
        # all seasons
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 10,
    }, {
        # explicit season
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 5,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 37,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 1,
    }]

    def _get_episode_image(self, episode):
        """Return the episode's default image value, if any."""
        return self._get_default(episode, 'image')

    def _get_episode_field(self, episode, field):
        """Return the 'default' variant of *field* for the episode."""
        return self._get_default(episode, field)

    @staticmethod
    def _get_elements(data):
        """Return the result list stored under data['entities']['results']."""
        entities = data['entities']
        return entities['results']

    @staticmethod
    def _get_episode(element):
        """Return the element's 'episode' mapping, or an empty dict."""
        episode = element.get('episode')
        return episode if episode else {}

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """POST the query for *pid* and return the response's programme node.

        ``sliceId`` is only sent when *series_id* is truthy.
        """
        variables = {'id': pid, 'page': page, 'perPage': per_page}
        if series_id:
            variables['sliceId'] = series_id
        payload = json.dumps({
            'id': '5692d93d5aac8d796a0305e895e61551',
            'variables': variables,
        }).encode('utf-8')
        response = self._download_json(
            'https://graph.ibl.api.bbc.co.uk/', pid,
            headers={'Content-Type': 'application/json'}, data=payload)
        return response['data']['programme']

    @staticmethod
    def _get_playlist_data(data):
        """The programme node itself already holds the playlist metadata."""
        return data

    def _get_playlist_title(self, data):
        """Return the 'default' title variant of the playlist data."""
        return self._get_default(data, 'title')
1537
1538
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
    # iPlayer "group" pages, served through the ibl.api.bbc.co.uk v1 groups
    # endpoint.
    IE_NAME = 'bbc.co.uk:iplayer:group'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
    _PAGE_SIZE = 200
    _DESCRIPTION_KEY = 'synopses'
    _TESTS = [{
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 47,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 11,
    }]

    def _get_episode_image(self, episode):
        """Return the 'standard' entry of the episode's images, if any."""
        return self._get_default(episode, 'images', 'standard')

    def _get_episode_field(self, episode, field):
        """Fields are read straight off the episode mapping for groups."""
        value = episode.get(field)
        return value

    @staticmethod
    def _get_elements(data):
        """Return the list stored under data['elements']."""
        elements = data['elements']
        return elements

    @staticmethod
    def _get_episode(element):
        """Each element already is the episode; hand it back unchanged."""
        return element

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Fetch one page of group episodes; *series_id* is not used here."""
        api_url = 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid
        paging = {
            'page': page,
            'per_page': per_page,
        }
        response = self._download_json(api_url, pid, query=paging)
        return response['group_episodes']

    @staticmethod
    def _get_playlist_data(data):
        """Playlist metadata is read from the response's 'group' entry."""
        group = data['group']
        return group

    def _get_playlist_title(self, data):
        """Return the plain 'title' value of the group data."""
        title = data.get('title')
        return title
1601
1602
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    # Programme listing pages (episodes / broadcasts / clips) on bbc.co.uk.
    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Template for the per-video URL handed to url_result; takes the video id.
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    # Regex template for locating video ids in the page; takes the id pattern.
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` from the page's Open Graph tags.

        The title lookup is explicitly non-fatal.
        """
        return (
            self._og_search_title(webpage, fatal=False),
            self._og_search_description(webpage),
        )