yt_dlp/extractor/bbc.py

   1 import functools
   2 import itertools
   3 import json
   4 import re
   5 import xml.etree.ElementTree
   6
   7 from .common import InfoExtractor
   8 from ..compat import compat_str, compat_urlparse
   9 from ..networking.exceptions import HTTPError
  10 from ..utils import (
  11     ExtractorError,
  12     OnDemandPagedList,
  13     clean_html,
  14     dict_get,
  15     float_or_none,
  16     get_element_by_class,
  17     int_or_none,
  18     join_nonempty,
  19     js_to_json,
  20     parse_duration,
  21     parse_iso8601,
  22     parse_qs,
  23     strip_or_none,
  24     traverse_obj,
  25     try_get,
  26     unescapeHTML,
  27     unified_timestamp,
  28     url_or_none,
  29     urlencode_postdata,
  30     urljoin,
  31 )
  32
  33
  34 class BBCCoUkIE(InfoExtractor):
    IE_NAME = 'bbc.co.uk'
    IE_DESC = 'BBC iPlayer'
    # Programme/version ids: one of the letters p, b, m or l followed by seven
    # lowercase alphanumerics, or 'w' followed by 7 to 14 of them.
    _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
    # bbc.co.uk pages handled here: programmes (but not programmes/articles),
    # iplayer episodes/playlists, music clips, the radio player and event
    # playback pages. The trailing negative lookahead rejects URLs where the id
    # is followed by /episodes, /broadcasts or /clips.
    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?bbc\.co\.uk/
                        (?:
                            programmes/(?!articles/)|
                            iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
                            music/(?:clips|audiovideo/popular)[/#]|
                            radio/player/|
                            events/[^/]+/play/[^/]+/
                        )
                        (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
                    ''' % _ID_REGEX
    # Matches setPlaylist("<iplayer URL>") calls and captures the URL.
    _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']

    # Sign-in page fetched and posted to by _perform_login().
    _LOGIN_URL = 'https://account.bbc.com/signin'
    _NETRC_MACHINE = 'bbc'

    # Filled with (media set, programme id) by _download_media_selector().
    _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
    # Media sets are tried in this order by _download_media_selector().
    _MEDIA_SETS = [
        # Provides HQ HLS streams with even better quality than the pc mediaset but fails
        # with geolocation in some cases when it's not even geo restricted at all (e.g.
        # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
        'iptv-all',
        'pc',
    ]

    # XML namespace of the legacy EMP playlist elements (item, noItems, title, ...).
    _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
  65
    # Sample URLs together with the metadata expected from them. Entries with a
    # 'skip' key record why they are skipped; entries carrying only
    # 'only_matching' list no expected metadata - presumably they only exercise
    # _VALID_URL (confirm against the test harness).
    _TESTS = [
        {
            'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
            'info_dict': {
                'id': 'b039d07m',
                'ext': 'flv',
                'title': 'Kaleidoscope, Leonard Cohen',
                'description': 'The Canadian poet and songwriter reflects on his musical career.',
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
            'info_dict': {
                'id': 'b00yng1d',
                'ext': 'flv',
                'title': 'The Man in Black: Series 3: The Printed Name',
                'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
                'duration': 1800,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Episode is no longer available on BBC iPlayer Radio',
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
            'info_dict': {
                # NOTE(review): same id as the previous entry although the URL
                # differs - looks like a copy/paste leftover; verify.
                'id': 'b00yng1d',
                'ext': 'flv',
                'title': 'The Voice UK: Series 3: Blind Auditions 5',
                'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
                'duration': 5100,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
            'info_dict': {
                'id': 'b03k3pb7',
                'ext': 'flv',
                'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
                'description': '2. Invasion',
                'duration': 3600,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
        }, {
            'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
            'info_dict': {
                'id': 'b04v209v',
                'ext': 'flv',
                'title': 'Pete Tong, The Essential New Tune Special',
                'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
                'duration': 10800,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Episode is no longer available on BBC iPlayer Radio',
        }, {
            'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
            'note': 'Audio',
            'info_dict': {
                'id': 'p022h44j',
                'ext': 'flv',
                'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
                'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
                'duration': 227,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        }, {
            'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
            'note': 'Video',
            'info_dict': {
                'id': 'p025c103',
                'ext': 'flv',
                'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
                'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
                'duration': 226,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
            'info_dict': {
                'id': 'p02n76xf',
                'ext': 'flv',
                'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
                'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
                'duration': 3540,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'geolocation',
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
            'info_dict': {
                'id': 'b05zmgw1',
                'ext': 'flv',
                'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
                'title': 'Royal Academy Summer Exhibition',
                'duration': 3540,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'geolocation',
        }, {
            # iptv-all mediaset fails with geolocation however there is no geo restriction
            # for this programme at all
            'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
            'info_dict': {
                'id': 'b06rkms3',
                'ext': 'flv',
                'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
                'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Now it\'s really geo-restricted',
        }, {
            # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
            'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
            'info_dict': {
                'id': 'p028bfkj',
                'ext': 'flv',
                'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
                'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
            'only_matching': True,
        }, {
            'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
            'only_matching': True,
        }, {
            'url': 'https://www.bbc.co.uk/programmes/m00005xn',
            'only_matching': True,
        }, {
            'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
            'only_matching': True,
        }]
 247
 248     def _perform_login(self, username, password):
 249         login_page = self._download_webpage(
 250             self._LOGIN_URL, None, 'Downloading signin page')
 251
 252         login_form = self._hidden_inputs(login_page)
 253
 254         login_form.update({
 255             'username': username,
 256             'password': password,
 257         })
 258
 259         post_url = urljoin(self._LOGIN_URL, self._search_regex(
 260             r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
 261             'post url', default=self._LOGIN_URL, group='url'))
 262
 263         response, urlh = self._download_webpage_handle(
 264             post_url, None, 'Logging in', data=urlencode_postdata(login_form),
 265             headers={'Referer': self._LOGIN_URL})
 266
 267         if self._LOGIN_URL in urlh.url:
 268             error = clean_html(get_element_by_class('form-message', response))
 269             if error:
 270                 raise ExtractorError(
 271                     'Unable to login: %s' % error, expected=True)
 272             raise ExtractorError('Unable to log in')
 273
    class MediaSelectionError(Exception):
        """Raised by _extract_medias() when a media selection carries a 'result' value.

        The value is kept in ``id``; _download_media_selector() inspects it to
        decide whether the next media set should be tried.
        """

        def __init__(self, id):
            # Error code reported by the media selector (e.g. 'geolocation').
            self.id = id
 277
 278     def _extract_asx_playlist(self, connection, programme_id):
 279         asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
 280         return [ref.get('href') for ref in asx.findall('./Entry/ref')]
 281
 282     def _extract_items(self, playlist):
 283         return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
 284
 285     def _extract_medias(self, media_selection):
 286         error = media_selection.get('result')
 287         if error:
 288             raise BBCCoUkIE.MediaSelectionError(error)
 289         return media_selection.get('media') or []
 290
 291     def _extract_connections(self, media):
 292         return media.get('connection') or []
 293
 294     def _get_subtitles(self, media, programme_id):
 295         subtitles = {}
 296         for connection in self._extract_connections(media):
 297             cc_url = url_or_none(connection.get('href'))
 298             if not cc_url:
 299                 continue
 300             captions = self._download_xml(
 301                 cc_url, programme_id, 'Downloading captions', fatal=False)
 302             if not isinstance(captions, xml.etree.ElementTree.Element):
 303                 continue
 304             subtitles['en'] = [
 305                 {
 306                     'url': connection.get('href'),
 307                     'ext': 'ttml',
 308                 },
 309             ]
 310             break
 311         return subtitles
 312
 313     def _raise_extractor_error(self, media_selection_error):
 314         raise ExtractorError(
 315             '%s returned error: %s' % (self.IE_NAME, media_selection_error.id),
 316             expected=True)
 317
 318     def _download_media_selector(self, programme_id):
 319         last_exception = None
 320         for media_set in self._MEDIA_SETS:
 321             try:
 322                 return self._download_media_selector_url(
 323                     self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
 324             except BBCCoUkIE.MediaSelectionError as e:
 325                 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
 326                     last_exception = e
 327                     continue
 328                 self._raise_extractor_error(e)
 329         self._raise_extractor_error(last_exception)
 330
 331     def _download_media_selector_url(self, url, programme_id=None):
 332         media_selection = self._download_json(
 333             url, programme_id, 'Downloading media selection JSON',
 334             expected_status=(403, 404))
 335         return self._process_media_selector(media_selection, programme_id)
 336
 337     def _process_media_selector(self, media_selection, programme_id):
 338         formats = []
 339         subtitles = None
 340         urls = []
 341
 342         for media in self._extract_medias(media_selection):
 343             kind = media.get('kind')
 344             if kind in ('video', 'audio'):
 345                 bitrate = int_or_none(media.get('bitrate'))
 346                 encoding = media.get('encoding')
 347                 width = int_or_none(media.get('width'))
 348                 height = int_or_none(media.get('height'))
 349                 file_size = int_or_none(media.get('media_file_size'))
 350                 for connection in self._extract_connections(media):
 351                     href = connection.get('href')
 352                     if href in urls:
 353                         continue
 354                     if href:
 355                         urls.append(href)
 356                     conn_kind = connection.get('kind')
 357                     protocol = connection.get('protocol')
 358                     supplier = connection.get('supplier')
 359                     transfer_format = connection.get('transferFormat')
 360                     format_id = supplier or conn_kind or protocol
 361                     # ASX playlist
 362                     if supplier == 'asx':
 363                         for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
 364                             formats.append({
 365                                 'url': ref,
 366                                 'format_id': 'ref%s_%s' % (i, format_id),
 367                             })
 368                     elif transfer_format == 'dash':
 369                         formats.extend(self._extract_mpd_formats(
 370                             href, programme_id, mpd_id=format_id, fatal=False))
 371                     elif transfer_format == 'hls':
 372                         # TODO: let expected_status be passed into _extract_xxx_formats() instead
 373                         try:
 374                             fmts = self._extract_m3u8_formats(
 375                                 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
 376                                 m3u8_id=format_id, fatal=False)
 377                         except ExtractorError as e:
 378                             if not (isinstance(e.exc_info[1], HTTPError)
 379                                     and e.exc_info[1].status in (403, 404)):
 380                                 raise
 381                             fmts = []
 382                         formats.extend(fmts)
 383                     elif transfer_format == 'hds':
 384                         formats.extend(self._extract_f4m_formats(
 385                             href, programme_id, f4m_id=format_id, fatal=False))
 386                     else:
 387                         if not supplier and bitrate:
 388                             format_id += '-%d' % bitrate
 389                         fmt = {
 390                             'format_id': format_id,
 391                             'filesize': file_size,
 392                         }
 393                         if kind == 'video':
 394                             fmt.update({
 395                                 'width': width,
 396                                 'height': height,
 397                                 'tbr': bitrate,
 398                                 'vcodec': encoding,
 399                             })
 400                         else:
 401                             fmt.update({
 402                                 'abr': bitrate,
 403                                 'acodec': encoding,
 404                                 'vcodec': 'none',
 405                             })
 406                         if protocol in ('http', 'https'):
 407                             # Direct link
 408                             fmt.update({
 409                                 'url': href,
 410                             })
 411                         elif protocol == 'rtmp':
 412                             application = connection.get('application', 'ondemand')
 413                             auth_string = connection.get('authString')
 414                             identifier = connection.get('identifier')
 415                             server = connection.get('server')
 416                             fmt.update({
 417                                 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
 418                                 'play_path': identifier,
 419                                 'app': '%s?%s' % (application, auth_string),
 420                                 'page_url': 'http://www.bbc.co.uk',
 421                                 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
 422                                 'rtmp_live': False,
 423                                 'ext': 'flv',
 424                             })
 425                         else:
 426                             continue
 427                         formats.append(fmt)
 428             elif kind == 'captions':
 429                 subtitles = self.extract_subtitles(media, programme_id)
 430         return formats, subtitles
 431
 432     def _download_playlist(self, playlist_id):
 433         try:
 434             playlist = self._download_json(
 435                 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
 436                 playlist_id, 'Downloading playlist JSON')
 437             formats = []
 438             subtitles = {}
 439
 440             for version in playlist.get('allAvailableVersions', []):
 441                 smp_config = version['smpConfig']
 442                 title = smp_config['title']
 443                 description = smp_config['summary']
 444                 for item in smp_config['items']:
 445                     kind = item['kind']
 446                     if kind not in ('programme', 'radioProgramme'):
 447                         continue
 448                     programme_id = item.get('vpid')
 449                     duration = int_or_none(item.get('duration'))
 450                     version_formats, version_subtitles = self._download_media_selector(programme_id)
 451                     types = version['types']
 452                     for f in version_formats:
 453                         f['format_note'] = ', '.join(types)
 454                         if any('AudioDescribed' in x for x in types):
 455                             f['language_preference'] = -10
 456                     formats += version_formats
 457                     for tag, subformats in (version_subtitles or {}).items():
 458                         subtitles.setdefault(tag, []).extend(subformats)
 459
 460             return programme_id, title, description, duration, formats, subtitles
 461         except ExtractorError as ee:
 462             if not (isinstance(ee.cause, HTTPError) and ee.cause.status == 404):
 463                 raise
 464
 465         # fallback to legacy playlist
 466         return self._process_legacy_playlist(playlist_id)
 467
 468     def _process_legacy_playlist_url(self, url, display_id):
 469         playlist = self._download_legacy_playlist_url(url, display_id)
 470         return self._extract_from_legacy_playlist(playlist, display_id)
 471
 472     def _process_legacy_playlist(self, playlist_id):
 473         return self._process_legacy_playlist_url(
 474             'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
 475
 476     def _download_legacy_playlist_url(self, url, playlist_id=None):
 477         return self._download_xml(
 478             url, playlist_id, 'Downloading legacy playlist XML')
 479
 480     def _extract_from_legacy_playlist(self, playlist, playlist_id):
 481         no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
 482         if no_items is not None:
 483             reason = no_items.get('reason')
 484             if reason == 'preAvailability':
 485                 msg = 'Episode %s is not yet available' % playlist_id
 486             elif reason == 'postAvailability':
 487                 msg = 'Episode %s is no longer available' % playlist_id
 488             elif reason == 'noMedia':
 489                 msg = 'Episode %s is not currently available' % playlist_id
 490             else:
 491                 msg = 'Episode %s is not available: %s' % (playlist_id, reason)
 492             raise ExtractorError(msg, expected=True)
 493
 494         for item in self._extract_items(playlist):
 495             kind = item.get('kind')
 496             if kind not in ('programme', 'radioProgramme'):
 497                 continue
 498             title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
 499             description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
 500             description = description_el.text if description_el is not None else None
 501
 502             def get_programme_id(item):
 503                 def get_from_attributes(item):
 504                     for p in ('identifier', 'group'):
 505                         value = item.get(p)
 506                         if value and re.match(r'^[pb][\da-z]{7}$', value):
 507                             return value
 508                 get_from_attributes(item)
 509                 mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
 510                 if mediator is not None:
 511                     return get_from_attributes(mediator)
 512
 513             programme_id = get_programme_id(item)
 514             duration = int_or_none(item.get('duration'))
 515
 516             if programme_id:
 517                 formats, subtitles = self._download_media_selector(programme_id)
 518             else:
 519                 formats, subtitles = self._process_media_selector(item, playlist_id)
 520                 programme_id = playlist_id
 521
 522         return programme_id, title, description, duration, formats, subtitles
 523
 524     def _real_extract(self, url):
 525         group_id = self._match_id(url)
 526
 527         webpage = self._download_webpage(url, group_id, 'Downloading video page')
 528
 529         error = self._search_regex(
 530             r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
 531             webpage, 'error', default=None)
 532         if error:
 533             raise ExtractorError(error, expected=True)
 534
 535         programme_id = None
 536         duration = None
 537
 538         tviplayer = self._search_regex(
 539             r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
 540             webpage, 'player', default=None)
 541
 542         if tviplayer:
 543             player = self._parse_json(tviplayer, group_id).get('player', {})
 544             duration = int_or_none(player.get('duration'))
 545             programme_id = player.get('vpid')
 546
 547         if not programme_id:
 548             programme_id = self._search_regex(
 549                 r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
 550
 551         if programme_id:
 552             formats, subtitles = self._download_media_selector(programme_id)
 553             title = self._og_search_title(webpage, default=None) or self._html_search_regex(
 554                 (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
 555                  r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
 556             description = self._search_regex(
 557                 (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
 558                  r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
 559                 webpage, 'description', default=None)
 560             if not description:
 561                 description = self._html_search_meta('description', webpage)
 562         else:
 563             programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
 564
 565         return {
 566             'id': programme_id,
 567             'title': title,
 568             'description': description,
 569             'thumbnail': self._og_search_thumbnail(webpage, default=None),
 570             'duration': duration,
 571             'formats': formats,
 572             'subtitles': subtitles,
 573         }
 574
 575
 576 class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
 577     IE_NAME = 'bbc'
 578     IE_DESC = 'BBC'
 579     _VALID_URL = r'''(?x)
 580         https?://(?:www\.)?(?:
 581             bbc\.(?:com|co\.uk)|
 582             bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd\.onion|
 583             bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad\.onion
 584         )/(?:[^/]+/)+(?P<id>[^/#?]+)'''
 585
 586     _MEDIA_SETS = [
 587         'pc',
 588         'mobile-tablet-main',
 589     ]
 590
 591     _TESTS = [{
 592         # article with multiple videos embedded with data-playable containing vpids
 593         'url': 'http://www.bbc.com/news/world-europe-32668511',
 594         'info_dict': {
 595             'id': 'world-europe-32668511',
 596             'title': 'Russia stages massive WW2 parade',
 597             'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
 598         },
 599         'playlist_count': 2,
 600     }, {
 601         # article with multiple videos embedded with data-playable (more videos)
 602         'url': 'http://www.bbc.com/news/business-28299555',
 603         'info_dict': {
 604             'id': 'business-28299555',
 605             'title': 'Farnborough Airshow: Video highlights',
 606             'description': 'BBC reports and video highlights at the Farnborough Airshow.',
 607         },
 608         'playlist_count': 9,
 609         'skip': 'Save time',
 610     }, {
 611         # article with multiple videos embedded with `new SMP()`
 612         # broken
 613         'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
 614         'info_dict': {
 615             'id': '3662a707-0af9-3149-963f-47bea720b460',
 616             'title': 'BUGGER',
 617         },
 618         'playlist_count': 18,
 619     }, {
 620         # single video embedded with data-playable containing vpid
 621         'url': 'http://www.bbc.com/news/world-europe-32041533',
 622         'info_dict': {
 623             'id': 'p02mprgb',
 624             'ext': 'mp4',
 625             'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
 626             'description': 'md5:2868290467291b37feda7863f7a83f54',
 627             'duration': 47,
 628             'timestamp': 1427219242,
 629             'upload_date': '20150324',
 630         },
 631         'params': {
 632             # rtmp download
 633             'skip_download': True,
 634         }
 635     }, {
 636         # article with single video embedded with data-playable containing XML playlist
 637         # with direct video links as progressiveDownloadUrl (for now these are extracted)
 638         # and playlist with f4m and m3u8 as streamingUrl
 639         'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
 640         'info_dict': {
 641             'id': '150615_telabyad_kentin_cogu',
 642             'ext': 'mp4',
 643             'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
 644             'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
 645             'timestamp': 1434397334,
 646             'upload_date': '20150615',
 647         },
 648         'params': {
 649             'skip_download': True,
 650         }
 651     }, {
 652         # single video embedded with data-playable containing XML playlists (regional section)
 653         'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
 654         'info_dict': {
 655             'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
 656             'ext': 'mp4',
 657             'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
 658             'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
 659             'timestamp': 1434713142,
 660             'upload_date': '20150619',
 661         },
 662         'params': {
 663             'skip_download': True,
 664         }
 665     }, {
 666         # single video from video playlist embedded with vxp-playlist-data JSON
 667         'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
 668         'info_dict': {
 669             'id': 'p02w6qjc',
 670             'ext': 'mp4',
 671             'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
 672             'duration': 56,
 673             'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
 674         },
 675         'params': {
 676             'skip_download': True,
 677         }
 678     }, {
 679         # single video story with digitalData
 680         'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
 681         'info_dict': {
 682             'id': 'p02q6gc4',
 683             'ext': 'flv',
 684             'title': 'Sri Lanka’s spicy secret',
 685             'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
 686             'timestamp': 1437674293,
 687             'upload_date': '20150723',
 688         },
 689         'params': {
 690             # rtmp download
 691             'skip_download': True,
 692         }
 693     }, {
 694         # single video story without digitalData
 695         'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
 696         'info_dict': {
 697             'id': 'p018zqqg',
 698             'ext': 'mp4',
 699             'title': 'Hyundai Santa Fe Sport: Rock star',
 700             'description': 'md5:b042a26142c4154a6e472933cf20793d',
 701             'timestamp': 1415867444,
 702             'upload_date': '20141113',
 703         },
 704         'params': {
 705             # rtmp download
 706             'skip_download': True,
 707         }
 708     }, {
 709         # single video embedded with Morph
 710         'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
 711         'info_dict': {
 712             'id': 'p041vhd0',
 713             'ext': 'mp4',
 714             'title': "Nigeria v Japan - Men's First Round",
 715             'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
 716             'duration': 7980,
 717             'uploader': 'BBC Sport',
 718             'uploader_id': 'bbc_sport',
 719         },
 720         'params': {
 721             # m3u8 download
 722             'skip_download': True,
 723         },
 724         'skip': 'Georestricted to UK',
 725     }, {
 726         # single video with playlist.sxml URL in playlist param
 727         'url': 'http://www.bbc.com/sport/0/football/33653409',
 728         'info_dict': {
 729             'id': 'p02xycnp',
 730             'ext': 'mp4',
 731             'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
 732             'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
 733             'duration': 140,
 734         },
 735         'params': {
 736             # rtmp download
 737             'skip_download': True,
 738         }
 739     }, {
 740         # article with multiple videos embedded with playlist.sxml in playlist param
 741         'url': 'http://www.bbc.com/sport/0/football/34475836',
 742         'info_dict': {
 743             'id': '34475836',
 744             'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
 745             'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
 746         },
 747         'playlist_count': 3,
 748     }, {
 749         # school report article with single video
 750         'url': 'http://www.bbc.co.uk/schoolreport/35744779',
 751         'info_dict': {
 752             'id': '35744779',
 753             'title': 'School which breaks down barriers in Jerusalem',
 754         },
 755         'playlist_count': 1,
 756     }, {
 757         # single video with playlist URL from weather section
 758         'url': 'http://www.bbc.com/weather/features/33601775',
 759         'only_matching': True,
 760     }, {
 761         # custom redirection to www.bbc.com
 762         # also, video with window.__INITIAL_DATA__
 763         'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
 764         'info_dict': {
 765             'id': 'p02xzws1',
 766             'ext': 'mp4',
 767             'title': "Pluto may have 'nitrogen glaciers'",
 768             'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
 769             'thumbnail': r're:https?://.+/.+\.jpg',
 770             'timestamp': 1437785037,
 771             'upload_date': '20150725',
 772         },
 773     }, {
 774         # video with window.__INITIAL_DATA__ and value as JSON string
 775         'url': 'https://www.bbc.com/news/av/world-europe-59468682',
 776         'info_dict': {
 777             'id': 'p0b71qth',
 778             'ext': 'mp4',
 779             'title': 'Why France is making this woman a national hero',
 780             'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
 781             'thumbnail': r're:https?://.+/.+\.jpg',
 782             'timestamp': 1638230731,
 783             'upload_date': '20211130',
 784         },
 785     }, {
 786         # single video article embedded with data-media-vpid
 787         'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
 788         'only_matching': True,
 789     }, {
 790         # bbcthreeConfig
 791         'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
 792         'info_dict': {
 793             'id': 'p06556y7',
 794             'ext': 'mp4',
 795             'title': 'Things Not To Say to people that live on council estates',
 796             'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
 797             'duration': 360,
 798             'thumbnail': r're:https?://.+/.+\.jpg',
 799         },
 800     }, {
 801         # window.__PRELOADED_STATE__
 802         'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
 803         'info_dict': {
 804             'id': 'b0b9z4vz',
 805             'ext': 'mp4',
 806             'title': 'Prom 6: An American in Paris and Turangalila',
 807             'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
 808             'uploader': 'Radio 3',
 809             'uploader_id': 'bbc_radio_three',
 810         },
 811     }, {
 812         'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
 813         'info_dict': {
 814             'id': 'p06w9tws',
 815             'ext': 'mp4',
 816             'title': 'md5:2fabf12a726603193a2879a055f72514',
 817             'description': 'Learn English words and phrases from this story',
 818         },
 819         'add_ie': [BBCCoUkIE.ie_key()],
 820     }, {
 821         # BBC Reel
 822         'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
 823         'info_dict': {
 824             'id': 'p07c6sb9',
 825             'ext': 'mp4',
 826             'title': 'How positive thinking is harming your happiness',
 827             'alt_title': 'The downsides of positive thinking',
 828             'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
 829             'duration': 235,
 830             'thumbnail': r're:https?://.+/p07c9dsr.jpg',
 831             'upload_date': '20190604',
 832             'categories': ['Psychology'],
 833         },
 834     }, {
 835         # BBC Sounds
 836         'url': 'https://www.bbc.co.uk/sounds/play/m001q78b',
 837         'info_dict': {
 838             'id': 'm001q789',
 839             'ext': 'mp4',
 840             'title': 'The Night Tracks Mix - Music for the darkling hour',
 841             'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0c00hym.jpg',
 842             'chapters': 'count:8',
 843             'description': 'md5:815fb51cbdaa270040aab8145b3f1d67',
 844             'uploader': 'Radio 3',
 845             'duration': 1800,
 846             'uploader_id': 'bbc_radio_three',
 847         },
 848     }, {  # onion routes
 849         'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
 850         'only_matching': True,
 851     }, {
 852         'url': 'https://www.bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad.onion/sport/av/football/63195681',
 853         'only_matching': True,
 854     }]
 855
 856     @classmethod
 857     def suitable(cls, url):
 858         EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
 859         return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
 860                 else super(BBCIE, cls).suitable(url))
 861
 862     def _extract_from_media_meta(self, media_meta, video_id):
 863         # Direct links to media in media metadata (e.g.
 864         # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
 865         # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
 866         source_files = media_meta.get('sourceFiles')
 867         if source_files:
 868             return [{
 869                 'url': f['url'],
 870                 'format_id': format_id,
 871                 'ext': f.get('encoding'),
 872                 'tbr': float_or_none(f.get('bitrate'), 1000),
 873                 'filesize': int_or_none(f.get('filesize')),
 874             } for format_id, f in source_files.items() if f.get('url')], []
 875
 876         programme_id = media_meta.get('externalId')
 877         if programme_id:
 878             return self._download_media_selector(programme_id)
 879
 880         # Process playlist.sxml as legacy playlist
 881         href = media_meta.get('href')
 882         if href:
 883             playlist = self._download_legacy_playlist_url(href)
 884             _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
 885             return formats, subtitles
 886
 887         return [], []
 888
 889     def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
 890         programme_id, title, description, duration, formats, subtitles = \
 891             self._process_legacy_playlist_url(url, playlist_id)
 892         return {
 893             'id': programme_id,
 894             'title': title,
 895             'description': description,
 896             'duration': duration,
 897             'timestamp': timestamp,
 898             'formats': formats,
 899             'subtitles': subtitles,
 900         }
 901
 902     def _real_extract(self, url):
 903         playlist_id = self._match_id(url)
 904
 905         webpage = self._download_webpage(url, playlist_id)
 906
 907         json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
 908         timestamp = json_ld_info.get('timestamp')
 909
 910         playlist_title = json_ld_info.get('title') or re.sub(
 911             r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None
 912
 913         playlist_description = json_ld_info.get(
 914             'description') or self._og_search_description(webpage, default=None)
 915
 916         if not timestamp:
 917             timestamp = parse_iso8601(self._search_regex(
 918                 [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
 919                  r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
 920                  r'"datePublished":\s*"([^"]+)'],
 921                 webpage, 'date', default=None))
 922
 923         entries = []
 924
 925         # article with multiple videos embedded with playlist.sxml (e.g.
 926         # http://www.bbc.com/sport/0/football/34475836)
 927         playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
 928         playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
 929         if playlists:
 930             entries = [
 931                 self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
 932                 for playlist_url in playlists]
 933
 934         # news article with multiple videos embedded with data-playable
 935         data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
 936         if data_playables:
 937             for _, data_playable_json in data_playables:
 938                 data_playable = self._parse_json(
 939                     unescapeHTML(data_playable_json), playlist_id, fatal=False)
 940                 if not data_playable:
 941                     continue
 942                 settings = data_playable.get('settings', {})
 943                 if settings:
 944                     # data-playable with video vpid in settings.playlistObject.items (e.g.
 945                     # http://www.bbc.com/news/world-us-canada-34473351)
 946                     playlist_object = settings.get('playlistObject', {})
 947                     if playlist_object:
 948                         items = playlist_object.get('items')
 949                         if items and isinstance(items, list):
 950                             title = playlist_object['title']
 951                             description = playlist_object.get('summary')
 952                             duration = int_or_none(items[0].get('duration'))
 953                             programme_id = items[0].get('vpid')
 954                             formats, subtitles = self._download_media_selector(programme_id)
 955                             entries.append({
 956                                 'id': programme_id,
 957                                 'title': title,
 958                                 'description': description,
 959                                 'timestamp': timestamp,
 960                                 'duration': duration,
 961                                 'formats': formats,
 962                                 'subtitles': subtitles,
 963                             })
 964                     else:
                        # data-playable without vpid but with playlist.sxml URLs
 966                         # in otherSettings.playlist (e.g.
 967                         # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
 968                         playlist = data_playable.get('otherSettings', {}).get('playlist', {})
 969                         if playlist:
 970                             entry = None
 971                             for key in ('streaming', 'progressiveDownload'):
 972                                 playlist_url = playlist.get('%sUrl' % key)
 973                                 if not playlist_url:
 974                                     continue
 975                                 try:
 976                                     info = self._extract_from_playlist_sxml(
 977                                         playlist_url, playlist_id, timestamp)
 978                                     if not entry:
 979                                         entry = info
 980                                     else:
 981                                         entry['title'] = info['title']
 982                                         entry['formats'].extend(info['formats'])
 983                                 except ExtractorError as e:
 984                                     # Some playlist URL may fail with 500, at the same time
 985                                     # the other one may work fine (e.g.
 986                                     # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
 987                                     if isinstance(e.cause, HTTPError) and e.cause.status == 500:
 988                                         continue
 989                                     raise
 990                             if entry:
 991                                 entries.append(entry)
 992
 993         if entries:
 994             return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
 995
 996         # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
 997         group_id = self._search_regex(
 998             r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
 999             webpage, 'group id', default=None)
1000         if group_id:
1001             return self.url_result(
1002                 'https://www.bbc.co.uk/programmes/%s' % group_id,
1003                 ie=BBCCoUkIE.ie_key())
1004
1005         # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
1006         programme_id = self._search_regex(
1007             [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
1008              r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
1009              r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
1010             webpage, 'vpid', default=None)
1011
1012         if programme_id:
1013             formats, subtitles = self._download_media_selector(programme_id)
1014             # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
1015             digital_data = self._parse_json(
1016                 self._search_regex(
1017                     r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
1018                 programme_id, fatal=False)
1019             page_info = digital_data.get('page', {}).get('pageInfo', {})
1020             title = page_info.get('pageName') or self._og_search_title(webpage)
1021             description = page_info.get('description') or self._og_search_description(webpage)
1022             timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
1023             return {
1024                 'id': programme_id,
1025                 'title': title,
1026                 'description': description,
1027                 'timestamp': timestamp,
1028                 'formats': formats,
1029                 'subtitles': subtitles,
1030             }
1031
1032         # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
1033         initial_data = self._parse_json(self._html_search_regex(
1034             r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
1035             webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
1036         if initial_data:
1037             init_data = try_get(
1038                 initial_data, lambda x: x['initData']['items'][0], dict) or {}
1039             smp_data = init_data.get('smpData') or {}
1040             clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
1041             version_id = clip_data.get('versionID')
1042             if version_id:
1043                 title = smp_data['title']
1044                 formats, subtitles = self._download_media_selector(version_id)
1045                 image_url = smp_data.get('holdingImageURL')
1046                 display_date = init_data.get('displayDate')
1047                 topic_title = init_data.get('topicTitle')
1048
1049                 return {
1050                     'id': version_id,
1051                     'title': title,
1052                     'formats': formats,
1053                     'alt_title': init_data.get('shortTitle'),
1054                     'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
1055                     'description': smp_data.get('summary') or init_data.get('shortSummary'),
1056                     'upload_date': display_date.replace('-', '') if display_date else None,
1057                     'subtitles': subtitles,
1058                     'duration': int_or_none(clip_data.get('duration')),
1059                     'categories': [topic_title] if topic_title else None,
1060                 }
1061
1062         # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
        # Several setPayload calls may be present, but the video
        # always seems to be related to the first one
1065         morph_payload = self._parse_json(
1066             self._search_regex(
1067                 r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
1068                 webpage, 'morph payload', default='{}'),
1069             playlist_id, fatal=False)
1070         if morph_payload:
1071             components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
1072             for component in components:
1073                 if not isinstance(component, dict):
1074                     continue
1075                 lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
1076                 if not lead_media:
1077                     continue
1078                 identifiers = lead_media.get('identifiers')
1079                 if not identifiers or not isinstance(identifiers, dict):
1080                     continue
1081                 programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
1082                 if not programme_id:
1083                     continue
1084                 title = lead_media.get('title') or self._og_search_title(webpage)
1085                 formats, subtitles = self._download_media_selector(programme_id)
1086                 description = lead_media.get('summary')
1087                 uploader = lead_media.get('masterBrand')
1088                 uploader_id = lead_media.get('mid')
1089                 duration = None
1090                 duration_d = lead_media.get('duration')
1091                 if isinstance(duration_d, dict):
1092                     duration = parse_duration(dict_get(
1093                         duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
1094                 return {
1095                     'id': programme_id,
1096                     'title': title,
1097                     'description': description,
1098                     'duration': duration,
1099                     'uploader': uploader,
1100                     'uploader_id': uploader_id,
1101                     'formats': formats,
1102                     'subtitles': subtitles,
1103                 }
1104
1105         preload_state = self._parse_json(self._search_regex(
1106             r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
1107             'preload state', default='{}'), playlist_id, fatal=False)
1108         if preload_state:
1109             current_programme = preload_state.get('programmes', {}).get('current') or {}
1110             programme_id = current_programme.get('id')
1111             if current_programme and programme_id and current_programme.get('type') == 'playable_item':
1112                 title = current_programme.get('titles', {}).get('tertiary') or playlist_title
1113                 formats, subtitles = self._download_media_selector(programme_id)
1114                 synopses = current_programme.get('synopses') or {}
1115                 network = current_programme.get('network') or {}
1116                 duration = int_or_none(
1117                     current_programme.get('duration', {}).get('value'))
1118                 thumbnail = None
1119                 image_url = current_programme.get('image_url')
1120                 if image_url:
1121                     thumbnail = image_url.replace('{recipe}', 'raw')
1122                 return {
1123                     'id': programme_id,
1124                     'title': title,
1125                     'description': dict_get(synopses, ('long', 'medium', 'short')),
1126                     'thumbnail': thumbnail,
1127                     'duration': duration,
1128                     'uploader': network.get('short_title'),
1129                     'uploader_id': network.get('id'),
1130                     'formats': formats,
1131                     'subtitles': subtitles,
1132                     'chapters': traverse_obj(preload_state, (
1133                         'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), {
1134                             'title': ('titles', {lambda x: join_nonempty(
1135                                 'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
1136                             'start_time': ('offset', 'start', {float_or_none}),
1137                             'end_time': ('offset', 'end', {float_or_none}),
1138                         })) or None,
1139                 }
1140
1141         bbc3_config = self._parse_json(
1142             self._search_regex(
1143                 r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
1144                 'bbcthree config', default='{}'),
1145             playlist_id, transform_source=js_to_json, fatal=False) or {}
1146         payload = bbc3_config.get('payload') or {}
1147         if payload:
1148             clip = payload.get('currentClip') or {}
1149             clip_vpid = clip.get('vpid')
1150             clip_title = clip.get('title')
1151             if clip_vpid and clip_title:
1152                 formats, subtitles = self._download_media_selector(clip_vpid)
1153                 return {
1154                     'id': clip_vpid,
1155                     'title': clip_title,
1156                     'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
1157                     'description': clip.get('description'),
1158                     'duration': parse_duration(clip.get('duration')),
1159                     'formats': formats,
1160                     'subtitles': subtitles,
1161                 }
1162             bbc3_playlist = try_get(
1163                 payload, lambda x: x['content']['bbcMedia']['playlist'],
1164                 dict)
1165             if bbc3_playlist:
1166                 playlist_title = bbc3_playlist.get('title') or playlist_title
1167                 thumbnail = bbc3_playlist.get('holdingImageURL')
1168                 entries = []
1169                 for bbc3_item in bbc3_playlist['items']:
1170                     programme_id = bbc3_item.get('versionID')
1171                     if not programme_id:
1172                         continue
1173                     formats, subtitles = self._download_media_selector(programme_id)
1174                     entries.append({
1175                         'id': programme_id,
1176                         'title': playlist_title,
1177                         'thumbnail': thumbnail,
1178                         'timestamp': timestamp,
1179                         'formats': formats,
1180                         'subtitles': subtitles,
1181                     })
1182                 return self.playlist_result(
1183                     entries, playlist_id, playlist_title, playlist_description)
1184
1185         initial_data = self._search_regex(
1186             r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
1187             'quoted preload state', default=None)
1188         if initial_data is None:
1189             initial_data = self._search_regex(
1190                 r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
1191                 'preload state', default={})
1192         else:
1193             initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
1194         initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
1195         if initial_data:
1196             def parse_media(media):
1197                 if not media:
1198                     return
1199                 for item in (try_get(media, lambda x: x['media']['items'], list) or []):
1200                     item_id = item.get('id')
1201                     item_title = item.get('title')
1202                     if not (item_id and item_title):
1203                         continue
1204                     formats, subtitles = self._download_media_selector(item_id)
1205                     item_desc = None
1206                     blocks = try_get(media, lambda x: x['summary']['blocks'], list)
1207                     if blocks:
1208                         summary = []
1209                         for block in blocks:
1210                             text = try_get(block, lambda x: x['model']['text'], compat_str)
1211                             if text:
1212                                 summary.append(text)
1213                         if summary:
1214                             item_desc = '\n\n'.join(summary)
1215                     item_time = None
1216                     for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
1217                         if try_get(meta, lambda x: x['label']) == 'Published':
1218                             item_time = unified_timestamp(meta.get('timestamp'))
1219                             break
1220                     entries.append({
1221                         'id': item_id,
1222                         'title': item_title,
1223                         'thumbnail': item.get('holdingImageUrl'),
1224                         'formats': formats,
1225                         'subtitles': subtitles,
1226                         'timestamp': item_time,
1227                         'description': strip_or_none(item_desc),
1228                     })
1229             for resp in (initial_data.get('data') or {}).values():
1230                 name = resp.get('name')
1231                 if name == 'media-experience':
1232                     parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
1233                 elif name == 'article':
1234                     for block in (try_get(resp,
1235                                           (lambda x: x['data']['blocks'],
1236                                            lambda x: x['data']['content']['model']['blocks'],),
1237                                           list) or []):
1238                         if block.get('type') not in ['media', 'video']:
1239                             continue
1240                         parse_media(block.get('model'))
1241             return self.playlist_result(
1242                 entries, playlist_id, playlist_title, playlist_description)
1243
1244         def extract_all(pattern):
1245             return list(filter(None, map(
1246                 lambda s: self._parse_json(s, playlist_id, fatal=False),
1247                 re.findall(pattern, webpage))))
1248
1249         # Multiple video article (e.g.
1250         # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
1251         EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
1252         entries = []
1253         for match in extract_all(r'new\s+SMP\(({.+?})\)'):
1254             embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
1255             if embed_url and re.match(EMBED_URL, embed_url):
1256                 entries.append(embed_url)
1257         entries.extend(re.findall(
1258             r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
1259         if entries:
1260             return self.playlist_result(
1261                 [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
1262                 playlist_id, playlist_title, playlist_description)
1263
1264         # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
1265         medias = extract_all(r"data-media-meta='({[^']+})'")
1266
1267         if not medias:
1268             # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
1269             media_asset = self._search_regex(
1270                 r'mediaAssetPage\.init\(\s*({.+?}), "/',
1271                 webpage, 'media asset', default=None)
1272             if media_asset:
1273                 media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
1274                 medias = []
1275                 for video in media_asset_page.get('videos', {}).values():
1276                     medias.extend(video.values())
1277
1278         if not medias:
1279             # Multiple video playlist with single `now playing` entry (e.g.
1280             # http://www.bbc.com/news/video_and_audio/must_see/33767813)
1281             vxp_playlist = self._parse_json(
1282                 self._search_regex(
1283                     r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
1284                     webpage, 'playlist data'),
1285                 playlist_id)
1286             playlist_medias = []
1287             for item in vxp_playlist:
1288                 media = item.get('media')
1289                 if not media:
1290                     continue
1291                 playlist_medias.append(media)
1292                 # Download single video if found media with asset id matching the video id from URL
1293                 if item.get('advert', {}).get('assetId') == playlist_id:
1294                     medias = [media]
1295                     break
1296             # Fallback to the whole playlist
1297             if not medias:
1298                 medias = playlist_medias
1299
1300         entries = []
1301         for num, media_meta in enumerate(medias, start=1):
1302             formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
1303             if not formats and not self.get_param('ignore_no_formats'):
1304                 continue
1305
1306             video_id = media_meta.get('externalId')
1307             if not video_id:
1308                 video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
1309
1310             title = media_meta.get('caption')
1311             if not title:
1312                 title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
1313
1314             duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
1315
1316             images = []
1317             for image in media_meta.get('images', {}).values():
1318                 images.extend(image.values())
1319             if 'image' in media_meta:
1320                 images.append(media_meta['image'])
1321
1322             thumbnails = [{
1323                 'url': image.get('href'),
1324                 'width': int_or_none(image.get('width')),
1325                 'height': int_or_none(image.get('height')),
1326             } for image in images]
1327
1328             entries.append({
1329                 'id': video_id,
1330                 'title': title,
1331                 'thumbnails': thumbnails,
1332                 'duration': duration,
1333                 'timestamp': timestamp,
1334                 'formats': formats,
1335                 'subtitles': subtitles,
1336             })
1337
1338         return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
1339
1340
class BBCCoUkArticleIE(InfoExtractor):
    """Extractor for bbc.co.uk/programmes/articles pages.

    Produces a playlist of the clips referenced by the article markup.
    """

    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Download the article page and return its clips as a playlist.

        The playlist id is the article id matched from *url*; title and
        description come from the page's Open Graph metadata.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # The description lookup may yield None when the page carries no
        # og:description; strip_or_none avoids calling .strip() on None.
        description = strip_or_none(self._og_search_description(webpage))

        # Every <div typeof="Clip" resource="..."> points at one programme clip.
        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
1369
1370
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Base class for bbc.co.uk playlists scraped from paginated HTML pages.

    Subclasses are expected to provide ``_VIDEO_ID_TEMPLATE``,
    ``_URL_TEMPLATE`` and ``_extract_title_and_description``.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a url_result for every video id found on the playlist pages.

        Starts with the already downloaded *webpage* and follows the
        "next" pagination link until none is left. When *url* carries an
        explicit ``page`` query parameter only that page is processed.
        """
        single_page = 'page' in parse_qs(url)
        video_id_re = self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX
        # The first page is already in hand, so numbering of the pages that
        # still need downloading starts at 2.
        for page_num in itertools.count(2):
            for video_id in re.findall(video_id_re, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # Pass the notes by keyword: the fourth positional parameter is
            # the error note, so it must be a message rather than the bare
            # page number.
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                note='Downloading page %d' % page_num,
                errnote='Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Return the playlist for *url*, with entries produced lazily by ``_entries``."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
1401
1402
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
    """Base class for iPlayer playlists served by a paged JSON API.

    Subclasses are expected to provide ``_PAGE_SIZE``, ``_DESCRIPTION_KEY``,
    ``_call_api``, ``_get_elements``, ``_get_episode``, ``_get_episode_image``,
    ``_get_episode_field``, ``_get_playlist_data`` and ``_get_playlist_title``.
    """

    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    @staticmethod
    def _get_default(episode, key, default_key='default'):
        """Return ``episode[key][default_key]``, or None when the lookup fails."""
        return try_get(episode, lambda obj: obj[key][default_key])

    def _get_description(self, data):
        """Return the longest available synopsis stored under ``_DESCRIPTION_KEY``."""
        synopses = data.get(self._DESCRIPTION_KEY) or {}
        return dict_get(synopses, ('large', 'medium', 'small'))

    def _fetch_page(self, programme_id, per_page, series_id, page):
        """Yield url-type entries for the zero-based *page* of the programme.

        Elements whose episode has no ``id`` are skipped.
        """
        # The API is called with a one-based page number.
        api_data = self._call_api(programme_id, per_page, page + 1, series_id)
        for element in self._get_elements(api_data):
            episode = self._get_episode(element)
            episode_id = episode.get('id')
            if not episode_id:
                continue
            image = self._get_episode_image(episode)
            category = self._get_default(episode, 'labels', 'category')
            yield {
                '_type': 'url',
                'id': episode_id,
                'title': self._get_episode_field(episode, 'subtitle'),
                'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                # The image URL contains a '{recipe}' placeholder.
                'thumbnail': image.replace('{recipe}', 'raw') if image else None,
                'description': self._get_description(episode),
                'categories': [category] if category else None,
                'series': self._get_episode_field(episode, 'title'),
                'ie_key': BBCCoUkIE.ie_key(),
            }

    def _real_extract(self, url):
        """Return the playlist for *url*.

        With an explicit ``page`` query parameter only that page is fetched
        (36 items per page); otherwise all pages are fetched on demand in
        chunks of ``_PAGE_SIZE``.
        """
        pid = self._match_id(url)
        query = parse_qs(url)
        series_id = query.get('seriesId', [None])[0]
        page = query.get('page', [None])[0]
        if page:
            fetch_page = functools.partial(self._fetch_page, pid, 36, series_id)
            entries = fetch_page(int(page) - 1)
        else:
            fetch_page = functools.partial(
                self._fetch_page, pid, self._PAGE_SIZE, series_id)
            entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)
        # A one-item request is enough to read the playlist-level metadata.
        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
        playlist_title = self._get_playlist_title(playlist_data)
        playlist_description = self._get_description(playlist_data)
        return self.playlist_result(
            entries, pid, playlist_title, playlist_description)
1451
1452
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/episodes/`` playlists, backed by the graph.ibl.api.bbc.co.uk endpoint."""

    IE_NAME = 'bbc.co.uk:iplayer:episodes'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
        },
        'playlist_mincount': 8,
    }, {
        # all seasons
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 10,
    }, {
        # explicit season
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 5,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 37,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 1,
    }]
    _PAGE_SIZE = 100
    _DESCRIPTION_KEY = 'synopsis'

    def _get_episode_image(self, episode):
        """Return the episode's default image URL, if any."""
        return self._get_default(episode, 'image')

    def _get_episode_field(self, episode, field):
        """Return the ``default`` variant of *field* from the episode."""
        return self._get_default(episode, field)

    @staticmethod
    def _get_elements(data):
        """Return the list of result elements from a programme payload."""
        entities = data['entities']
        return entities['results']

    @staticmethod
    def _get_episode(element):
        """Return the episode dict nested in *element*, or an empty dict."""
        return element.get('episode') or {}

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """POST a query for programme *pid* and return its ``data.programme`` object.

        ``sliceId`` is only sent when *series_id* is truthy.
        """
        variables = {'id': pid, 'page': page, 'perPage': per_page}
        if series_id:
            variables['sliceId'] = series_id
        # NOTE(review): the fixed 'id' looks like a persisted-query identifier - confirm.
        payload = json.dumps({
            'id': '5692d93d5aac8d796a0305e895e61551',
            'variables': variables,
        }).encode('utf-8')
        response = self._download_json(
            'https://graph.ibl.api.bbc.co.uk/', pid,
            headers={'Content-Type': 'application/json'}, data=payload)
        return response['data']['programme']

    @staticmethod
    def _get_playlist_data(data):
        """The programme payload itself carries the playlist metadata."""
        return data

    def _get_playlist_title(self, data):
        """Return the default title of the programme."""
        return self._get_default(data, 'title')
1540
1541
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/group/`` playlists, backed by the ibl v1 ``groups/<pid>/episodes`` endpoint."""

    IE_NAME = 'bbc.co.uk:iplayer:group'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
    _TESTS = [{
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 47,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 11,
    }]
    _PAGE_SIZE = 200
    _DESCRIPTION_KEY = 'synopses'

    def _get_episode_image(self, episode):
        """Return the episode's ``images.standard`` URL, if any."""
        return self._get_default(episode, 'images', 'standard')

    def _get_episode_field(self, episode, field):
        """Return *field* read directly from the episode dict."""
        return episode.get(field)

    @staticmethod
    def _get_elements(data):
        """Return the list of elements from a group payload."""
        return data['elements']

    @staticmethod
    def _get_episode(element):
        """Each element is used as the episode itself."""
        return element

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Fetch one page of the group's episodes and return ``group_episodes``.

        *series_id* is accepted for signature parity and is not used here.
        """
        api_url = 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid
        paging = {
            'page': page,
            'per_page': per_page,
        }
        response = self._download_json(api_url, pid, query=paging)
        return response['group_episodes']

    @staticmethod
    def _get_playlist_data(data):
        """Return the ``group`` object that holds the playlist metadata."""
        return data['group']

    def _get_playlist_title(self, data):
        """Return the group's title."""
        return data.get('title')
1604
1605
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlists under bbc.co.uk/programmes/<pid>/(episodes|broadcasts|clips)."""

    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Filled with a video id to build the URL of each playlist entry.
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    # Filled with the id regex to locate video ids in the page markup.
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` from the page's Open Graph metadata.

        The title lookup is explicitly non-fatal.
        """
        og_title = self._og_search_title(webpage, fatal=False)
        og_description = self._og_search_description(webpage)
        return og_title, og_description