yt_dlp/extractor/bbc.py

   1 import functools
   2 import itertools
   3 import json
   4 import re
   5 import xml.etree.ElementTree
   6
   7 from .common import InfoExtractor
   8 from ..compat import compat_str, compat_urlparse
   9 from ..networking.exceptions import HTTPError
  10 from ..utils import (
  11     ExtractorError,
  12     OnDemandPagedList,
  13     clean_html,
  14     dict_get,
  15     float_or_none,
  16     get_element_by_class,
  17     int_or_none,
  18     join_nonempty,
  19     js_to_json,
  20     parse_duration,
  21     parse_iso8601,
  22     parse_qs,
  23     strip_or_none,
  24     traverse_obj,
  25     try_get,
  26     unescapeHTML,
  27     unified_timestamp,
  28     url_or_none,
  29     urlencode_postdata,
  30     urljoin,
  31 )
  32
  33
class BBCCoUkIE(InfoExtractor):
    # Extractor for bbc.co.uk pages matched by _VALID_URL below: programmes,
    # iPlayer episodes/playlists, music clips, the radio player and event
    # playback pages.
    IE_NAME = 'bbc.co.uk'
    IE_DESC = 'BBC iPlayer'
    # Identifier syntax: one of p/b/m/l followed by 7 lowercase alphanumerics,
    # or "w" followed by 7 to 14 of them.
    _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
    # Verbose regex. The negative lookahead after the id rejects URLs that
    # continue with /episodes, /broadcasts or /clips.
    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?bbc\.co\.uk/
                        (?:
                            programmes/(?!articles/)|
                            iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
                            music/(?:clips|audiovideo/popular)[/#]|
                            radio/player/|
                            events/[^/]+/play/[^/]+/
                        )
                        (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
                    ''' % _ID_REGEX
    # Matches setPlaylist("...") calls that reference an iPlayer URL ending in
    # an 8-character id.
    _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']

    # Sign-in page; also used as Referer and as the fallback form action in
    # _perform_login.
    _LOGIN_URL = 'https://account.bbc.com/signin'
    _NETRC_MACHINE = 'bbc'

    # Filled with (media set, programme id) by _download_media_selector.
    _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
    # Media sets are queried in this order by _download_media_selector.
    _MEDIA_SETS = [
        # Provides HQ HLS streams with even better quality than the pc mediaset but fails
        # with geolocation in some cases even when it is not geo restricted at all (e.g.
        # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
        'iptv-all',
        'pc',
    ]

    # XML namespace of the legacy (EMP) playlist documents.
    _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
  65
    # Test cases for this extractor. Entries with 'only_matching' carry no
    # info_dict to compare against; entries with 'skip' record why they are
    # not expected to run.
    _TESTS = [
        {
            'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
            'info_dict': {
                'id': 'b039d07m',
                'ext': 'flv',
                'title': 'Kaleidoscope, Leonard Cohen',
                'description': 'The Canadian poet and songwriter reflects on his musical career.',
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
            'info_dict': {
                'id': 'b00yng1d',
                'ext': 'flv',
                'title': 'The Man in Black: Series 3: The Printed Name',
                'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
                'duration': 1800,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Episode is no longer available on BBC iPlayer Radio',
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
            'info_dict': {
                # NOTE(review): same id as the previous test case; possibly a
                # copy-paste leftover - confirm (the test is skipped anyway)
                'id': 'b00yng1d',
                'ext': 'flv',
                'title': 'The Voice UK: Series 3: Blind Auditions 5',
                'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
                'duration': 5100,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
            'info_dict': {
                'id': 'b03k3pb7',
                'ext': 'flv',
                'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
                'description': '2. Invasion',
                'duration': 3600,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
        }, {
            'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
            'info_dict': {
                'id': 'b04v209v',
                'ext': 'flv',
                'title': 'Pete Tong, The Essential New Tune Special',
                'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
                'duration': 10800,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Episode is no longer available on BBC iPlayer Radio',
        }, {
            'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
            'note': 'Audio',
            'info_dict': {
                'id': 'p022h44j',
                'ext': 'flv',
                'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
                'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
                'duration': 227,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        }, {
            'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
            'note': 'Video',
            'info_dict': {
                'id': 'p025c103',
                'ext': 'flv',
                'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
                'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
                'duration': 226,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
            'info_dict': {
                'id': 'p02n76xf',
                'ext': 'flv',
                'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
                'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
                'duration': 3540,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'geolocation',
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
            'info_dict': {
                'id': 'b05zmgw1',
                'ext': 'flv',
                'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
                'title': 'Royal Academy Summer Exhibition',
                'duration': 3540,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'geolocation',
        }, {
            # iptv-all mediaset fails with geolocation however there is no geo restriction
            # for this programme at all
            'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
            'info_dict': {
                'id': 'b06rkms3',
                'ext': 'flv',
                'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
                'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Now it\'s really geo-restricted',
        }, {
            # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
            'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
            'info_dict': {
                'id': 'p028bfkj',
                'ext': 'flv',
                'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
                'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
            'only_matching': True,
        }, {
            'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
            'only_matching': True,
        }, {
            'url': 'https://www.bbc.co.uk/programmes/m00005xn',
            'only_matching': True,
        }, {
            'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
            'only_matching': True,
        }]
 247
 248     def _perform_login(self, username, password):
 249         login_page = self._download_webpage(
 250             self._LOGIN_URL, None, 'Downloading signin page')
 251
 252         login_form = self._hidden_inputs(login_page)
 253
 254         login_form.update({
 255             'username': username,
 256             'password': password,
 257         })
 258
 259         post_url = urljoin(self._LOGIN_URL, self._search_regex(
 260             r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
 261             'post url', default=self._LOGIN_URL, group='url'))
 262
 263         response, urlh = self._download_webpage_handle(
 264             post_url, None, 'Logging in', data=urlencode_postdata(login_form),
 265             headers={'Referer': self._LOGIN_URL})
 266
 267         if self._LOGIN_URL in urlh.url:
 268             error = clean_html(get_element_by_class('form-message', response))
 269             if error:
 270                 raise ExtractorError(
 271                     'Unable to login: %s' % error, expected=True)
 272             raise ExtractorError('Unable to log in')
 273
 274     class MediaSelectionError(Exception):
 275         def __init__(self, id):
 276             self.id = id
 277
 278     def _extract_asx_playlist(self, connection, programme_id):
 279         asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
 280         return [ref.get('href') for ref in asx.findall('./Entry/ref')]
 281
 282     def _extract_items(self, playlist):
 283         return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
 284
 285     def _extract_medias(self, media_selection):
 286         error = media_selection.get('result')
 287         if error:
 288             raise BBCCoUkIE.MediaSelectionError(error)
 289         return media_selection.get('media') or []
 290
 291     def _extract_connections(self, media):
 292         return media.get('connection') or []
 293
 294     def _get_subtitles(self, media, programme_id):
 295         subtitles = {}
 296         for connection in self._extract_connections(media):
 297             cc_url = url_or_none(connection.get('href'))
 298             if not cc_url:
 299                 continue
 300             captions = self._download_xml(
 301                 cc_url, programme_id, 'Downloading captions', fatal=False)
 302             if not isinstance(captions, xml.etree.ElementTree.Element):
 303                 continue
 304             subtitles['en'] = [
 305                 {
 306                     'url': connection.get('href'),
 307                     'ext': 'ttml',
 308                 },
 309             ]
 310             break
 311         return subtitles
 312
 313     def _raise_extractor_error(self, media_selection_error):
 314         raise ExtractorError(
 315             '%s returned error: %s' % (self.IE_NAME, media_selection_error.id),
 316             expected=True)
 317
 318     def _download_media_selector(self, programme_id):
 319         last_exception = None
 320         formats, subtitles = [], {}
 321         for media_set in self._MEDIA_SETS:
 322             try:
 323                 fmts, subs = self._download_media_selector_url(
 324                     self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
 325                 formats.extend(fmts)
 326                 if subs:
 327                     self._merge_subtitles(subs, target=subtitles)
 328             except BBCCoUkIE.MediaSelectionError as e:
 329                 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
 330                     last_exception = e
 331                     continue
 332                 self._raise_extractor_error(e)
 333         if last_exception:
 334             if formats or subtitles:
 335                 self.report_warning(f'{self.IE_NAME} returned error: {last_exception.id}')
 336             else:
 337                 self._raise_extractor_error(last_exception)
 338         return formats, subtitles
 339
 340     def _download_media_selector_url(self, url, programme_id=None):
 341         media_selection = self._download_json(
 342             url, programme_id, 'Downloading media selection JSON',
 343             expected_status=(403, 404))
 344         return self._process_media_selector(media_selection, programme_id)
 345
    def _process_media_selector(self, media_selection, programme_id):
        """Turn a media selection document into formats and subtitles.

        Iterates the medias returned by ``_extract_medias`` (which raises
        MediaSelectionError when the document carries a ``result`` entry).
        Every connection of a video/audio media becomes zero or more formats,
        depending on its supplier, transferFormat and protocol. A captions
        media is handed to ``self.extract_subtitles``.

        Returns a ``(formats, subtitles)`` tuple; ``subtitles`` stays ``None``
        when no captions media was encountered.
        """
        formats = []
        subtitles = None
        # hrefs already handled, so the same URL is not processed twice
        urls = []

        for media in self._extract_medias(media_selection):
            kind = media.get('kind')
            if kind in ('video', 'audio'):
                # Properties shared by all connections of this media
                bitrate = int_or_none(media.get('bitrate'))
                encoding = media.get('encoding')
                width = int_or_none(media.get('width'))
                height = int_or_none(media.get('height'))
                file_size = int_or_none(media.get('media_file_size'))
                for connection in self._extract_connections(media):
                    href = connection.get('href')
                    if href in urls:
                        continue
                    # Connections without an href (e.g. the rtmp branch below
                    # builds its URL from other fields) are never deduplicated
                    if href:
                        urls.append(href)
                    conn_kind = connection.get('kind')
                    protocol = connection.get('protocol')
                    supplier = connection.get('supplier')
                    transfer_format = connection.get('transferFormat')
                    # NOTE(review): this is None when all three fields are
                    # missing; the '+=' in the last branch would then fail
                    format_id = supplier or conn_kind or protocol
                    # ASX playlist
                    if supplier == 'asx':
                        for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
                            formats.append({
                                'url': ref,
                                'format_id': 'ref%s_%s' % (i, format_id),
                            })
                    elif transfer_format == 'dash':
                        formats.extend(self._extract_mpd_formats(
                            href, programme_id, mpd_id=format_id, fatal=False))
                    elif transfer_format == 'hls':
                        # TODO: let expected_status be passed into _extract_xxx_formats() instead
                        try:
                            fmts = self._extract_m3u8_formats(
                                href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                                m3u8_id=format_id, fatal=False)
                        except ExtractorError as e:
                            # Only an HTTP 403/404 is tolerated (no formats for
                            # this connection); anything else propagates
                            if not (isinstance(e.exc_info[1], HTTPError)
                                    and e.exc_info[1].status in (403, 404)):
                                raise
                            fmts = []
                        formats.extend(fmts)
                    elif transfer_format == 'hds':
                        formats.extend(self._extract_f4m_formats(
                            href, programme_id, f4m_id=format_id, fatal=False))
                    else:
                        # Plain http(s) or rtmp connection: build one format by hand
                        if not supplier and bitrate:
                            format_id += '-%d' % bitrate
                        fmt = {
                            'format_id': format_id,
                            'filesize': file_size,
                        }
                        if kind == 'video':
                            fmt.update({
                                'width': width,
                                'height': height,
                                'tbr': bitrate,
                                'vcodec': encoding,
                            })
                        else:
                            fmt.update({
                                'abr': bitrate,
                                'acodec': encoding,
                                'vcodec': 'none',
                            })
                        if protocol in ('http', 'https'):
                            # Direct link
                            fmt.update({
                                'url': href,
                            })
                        elif protocol == 'rtmp':
                            application = connection.get('application', 'ondemand')
                            auth_string = connection.get('authString')
                            identifier = connection.get('identifier')
                            server = connection.get('server')
                            fmt.update({
                                'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                                'play_path': identifier,
                                'app': '%s?%s' % (application, auth_string),
                                'page_url': 'http://www.bbc.co.uk',
                                'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                                'rtmp_live': False,
                                'ext': 'flv',
                            })
                        else:
                            # Unsupported protocol: drop this connection
                            continue
                        formats.append(fmt)
            elif kind == 'captions':
                # A later captions media replaces the result of an earlier one.
                # extract_subtitles is not defined in this class; presumably it
                # is the base-class wrapper around _get_subtitles - confirm
                subtitles = self.extract_subtitles(media, programme_id)
        return formats, subtitles
 440
 441     def _download_playlist(self, playlist_id):
 442         try:
 443             playlist = self._download_json(
 444                 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
 445                 playlist_id, 'Downloading playlist JSON')
 446             formats = []
 447             subtitles = {}
 448
 449             for version in playlist.get('allAvailableVersions', []):
 450                 smp_config = version['smpConfig']
 451                 title = smp_config['title']
 452                 description = smp_config['summary']
 453                 for item in smp_config['items']:
 454                     kind = item['kind']
 455                     if kind not in ('programme', 'radioProgramme'):
 456                         continue
 457                     programme_id = item.get('vpid')
 458                     duration = int_or_none(item.get('duration'))
 459                     version_formats, version_subtitles = self._download_media_selector(programme_id)
 460                     types = version['types']
 461                     for f in version_formats:
 462                         f['format_note'] = ', '.join(types)
 463                         if any('AudioDescribed' in x for x in types):
 464                             f['language_preference'] = -10
 465                     formats += version_formats
 466                     for tag, subformats in (version_subtitles or {}).items():
 467                         subtitles.setdefault(tag, []).extend(subformats)
 468
 469             return programme_id, title, description, duration, formats, subtitles
 470         except ExtractorError as ee:
 471             if not (isinstance(ee.cause, HTTPError) and ee.cause.status == 404):
 472                 raise
 473
 474         # fallback to legacy playlist
 475         return self._process_legacy_playlist(playlist_id)
 476
 477     def _process_legacy_playlist_url(self, url, display_id):
 478         playlist = self._download_legacy_playlist_url(url, display_id)
 479         return self._extract_from_legacy_playlist(playlist, display_id)
 480
 481     def _process_legacy_playlist(self, playlist_id):
 482         return self._process_legacy_playlist_url(
 483             'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
 484
 485     def _download_legacy_playlist_url(self, url, playlist_id=None):
 486         return self._download_xml(
 487             url, playlist_id, 'Downloading legacy playlist XML')
 488
 489     def _extract_from_legacy_playlist(self, playlist, playlist_id):
 490         no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
 491         if no_items is not None:
 492             reason = no_items.get('reason')
 493             if reason == 'preAvailability':
 494                 msg = 'Episode %s is not yet available' % playlist_id
 495             elif reason == 'postAvailability':
 496                 msg = 'Episode %s is no longer available' % playlist_id
 497             elif reason == 'noMedia':
 498                 msg = 'Episode %s is not currently available' % playlist_id
 499             else:
 500                 msg = 'Episode %s is not available: %s' % (playlist_id, reason)
 501             raise ExtractorError(msg, expected=True)
 502
 503         for item in self._extract_items(playlist):
 504             kind = item.get('kind')
 505             if kind not in ('programme', 'radioProgramme'):
 506                 continue
 507             title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
 508             description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
 509             description = description_el.text if description_el is not None else None
 510
 511             def get_programme_id(item):
 512                 def get_from_attributes(item):
 513                     for p in ('identifier', 'group'):
 514                         value = item.get(p)
 515                         if value and re.match(r'^[pb][\da-z]{7}$', value):
 516                             return value
 517                 get_from_attributes(item)
 518                 mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
 519                 if mediator is not None:
 520                     return get_from_attributes(mediator)
 521
 522             programme_id = get_programme_id(item)
 523             duration = int_or_none(item.get('duration'))
 524
 525             if programme_id:
 526                 formats, subtitles = self._download_media_selector(programme_id)
 527             else:
 528                 formats, subtitles = self._process_media_selector(item, playlist_id)
 529                 programme_id = playlist_id
 530
 531         return programme_id, title, description, duration, formats, subtitles
 532
 533     def _real_extract(self, url):
 534         group_id = self._match_id(url)
 535
 536         webpage = self._download_webpage(url, group_id, 'Downloading video page')
 537
 538         error = self._search_regex(
 539             r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
 540             webpage, 'error', default=None)
 541         if error:
 542             raise ExtractorError(error, expected=True)
 543
 544         programme_id = None
 545         duration = None
 546
 547         tviplayer = self._search_regex(
 548             r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
 549             webpage, 'player', default=None)
 550
 551         if tviplayer:
 552             player = self._parse_json(tviplayer, group_id).get('player', {})
 553             duration = int_or_none(player.get('duration'))
 554             programme_id = player.get('vpid')
 555
 556         if not programme_id:
 557             programme_id = self._search_regex(
 558                 r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
 559
 560         if programme_id:
 561             formats, subtitles = self._download_media_selector(programme_id)
 562             title = self._og_search_title(webpage, default=None) or self._html_search_regex(
 563                 (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
 564                  r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
 565             description = self._search_regex(
 566                 (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
 567                  r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
 568                 webpage, 'description', default=None)
 569             if not description:
 570                 description = self._html_search_meta('description', webpage)
 571         else:
 572             programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
 573
 574         return {
 575             'id': programme_id,
 576             'title': title,
 577             'description': description,
 578             'thumbnail': self._og_search_thumbnail(webpage, default=None),
 579             'duration': duration,
 580             'formats': formats,
 581             'subtitles': subtitles,
 582         }
 583
 584
 585 class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
 586     IE_NAME = 'bbc'
 587     IE_DESC = 'BBC'
 588     _VALID_URL = r'''(?x)
 589         https?://(?:www\.)?(?:
 590             bbc\.(?:com|co\.uk)|
 591             bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd\.onion|
 592             bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad\.onion
 593         )/(?:[^/]+/)+(?P<id>[^/#?]+)'''
 594
 595     _MEDIA_SETS = [
 596         'pc',
 597         'mobile-tablet-main',
 598     ]
 599
 600     _TESTS = [{
 601         # article with multiple videos embedded with data-playable containing vpids
 602         'url': 'http://www.bbc.com/news/world-europe-32668511',
 603         'info_dict': {
 604             'id': 'world-europe-32668511',
 605             'title': 'Russia stages massive WW2 parade',
 606             'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
 607         },
 608         'playlist_count': 2,
 609     }, {
 610         # article with multiple videos embedded with data-playable (more videos)
 611         'url': 'http://www.bbc.com/news/business-28299555',
 612         'info_dict': {
 613             'id': 'business-28299555',
 614             'title': 'Farnborough Airshow: Video highlights',
 615             'description': 'BBC reports and video highlights at the Farnborough Airshow.',
 616         },
 617         'playlist_count': 9,
 618         'skip': 'Save time',
 619     }, {
 620         # article with multiple videos embedded with `new SMP()`
 621         # broken
 622         'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
 623         'info_dict': {
 624             'id': '3662a707-0af9-3149-963f-47bea720b460',
 625             'title': 'BUGGER',
 626         },
 627         'playlist_count': 18,
 628     }, {
 629         # single video embedded with data-playable containing vpid
 630         'url': 'http://www.bbc.com/news/world-europe-32041533',
 631         'info_dict': {
 632             'id': 'p02mprgb',
 633             'ext': 'mp4',
 634             'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
 635             'description': 'md5:2868290467291b37feda7863f7a83f54',
 636             'duration': 47,
 637             'timestamp': 1427219242,
 638             'upload_date': '20150324',
 639         },
 640         'params': {
 641             # rtmp download
 642             'skip_download': True,
 643         }
 644     }, {
 645         # article with single video embedded with data-playable containing XML playlist
 646         # with direct video links as progressiveDownloadUrl (for now these are extracted)
 647         # and playlist with f4m and m3u8 as streamingUrl
 648         'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
 649         'info_dict': {
 650             'id': '150615_telabyad_kentin_cogu',
 651             'ext': 'mp4',
 652             'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
 653             'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
 654             'timestamp': 1434397334,
 655             'upload_date': '20150615',
 656         },
 657         'params': {
 658             'skip_download': True,
 659         }
 660     }, {
 661         # single video embedded with data-playable containing XML playlists (regional section)
 662         'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
 663         'info_dict': {
 664             'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
 665             'ext': 'mp4',
 666             'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
 667             'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
 668             'timestamp': 1434713142,
 669             'upload_date': '20150619',
 670         },
 671         'params': {
 672             'skip_download': True,
 673         }
 674     }, {
 675         # single video from video playlist embedded with vxp-playlist-data JSON
 676         'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
 677         'info_dict': {
 678             'id': 'p02w6qjc',
 679             'ext': 'mp4',
 680             'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
 681             'duration': 56,
 682             'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
 683         },
 684         'params': {
 685             'skip_download': True,
 686         }
 687     }, {
 688         # single video story with digitalData
 689         'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
 690         'info_dict': {
 691             'id': 'p02q6gc4',
 692             'ext': 'flv',
 693             'title': 'Sri Lanka’s spicy secret',
 694             'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
 695             'timestamp': 1437674293,
 696             'upload_date': '20150723',
 697         },
 698         'params': {
 699             # rtmp download
 700             'skip_download': True,
 701         }
 702     }, {
 703         # single video story without digitalData
 704         'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
 705         'info_dict': {
 706             'id': 'p018zqqg',
 707             'ext': 'mp4',
 708             'title': 'Hyundai Santa Fe Sport: Rock star',
 709             'description': 'md5:b042a26142c4154a6e472933cf20793d',
 710             'timestamp': 1415867444,
 711             'upload_date': '20141113',
 712         },
 713         'params': {
 714             # rtmp download
 715             'skip_download': True,
 716         }
 717     }, {
 718         # single video embedded with Morph
 719         'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
 720         'info_dict': {
 721             'id': 'p041vhd0',
 722             'ext': 'mp4',
 723             'title': "Nigeria v Japan - Men's First Round",
 724             'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
 725             'duration': 7980,
 726             'uploader': 'BBC Sport',
 727             'uploader_id': 'bbc_sport',
 728         },
 729         'params': {
 730             # m3u8 download
 731             'skip_download': True,
 732         },
 733         'skip': 'Georestricted to UK',
 734     }, {
 735         # single video with playlist.sxml URL in playlist param
 736         'url': 'http://www.bbc.com/sport/0/football/33653409',
 737         'info_dict': {
 738             'id': 'p02xycnp',
 739             'ext': 'mp4',
 740             'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
 741             'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
 742             'duration': 140,
 743         },
 744         'params': {
 745             # rtmp download
 746             'skip_download': True,
 747         }
 748     }, {
 749         # article with multiple videos embedded with playlist.sxml in playlist param
 750         'url': 'http://www.bbc.com/sport/0/football/34475836',
 751         'info_dict': {
 752             'id': '34475836',
 753             'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
 754             'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
 755         },
 756         'playlist_count': 3,
 757     }, {
 758         # school report article with single video
 759         'url': 'http://www.bbc.co.uk/schoolreport/35744779',
 760         'info_dict': {
 761             'id': '35744779',
 762             'title': 'School which breaks down barriers in Jerusalem',
 763         },
 764         'playlist_count': 1,
 765     }, {
 766         # single video with playlist URL from weather section
 767         'url': 'http://www.bbc.com/weather/features/33601775',
 768         'only_matching': True,
 769     }, {
 770         # custom redirection to www.bbc.com
 771         # also, video with window.__INITIAL_DATA__
 772         'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
 773         'info_dict': {
 774             'id': 'p02xzws1',
 775             'ext': 'mp4',
 776             'title': "Pluto may have 'nitrogen glaciers'",
 777             'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
 778             'thumbnail': r're:https?://.+/.+\.jpg',
 779             'timestamp': 1437785037,
 780             'upload_date': '20150725',
 781         },
 782     }, {
 783         # video with window.__INITIAL_DATA__ and value as JSON string
 784         'url': 'https://www.bbc.com/news/av/world-europe-59468682',
 785         'info_dict': {
 786             'id': 'p0b71qth',
 787             'ext': 'mp4',
 788             'title': 'Why France is making this woman a national hero',
 789             'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
 790             'thumbnail': r're:https?://.+/.+\.jpg',
 791             'timestamp': 1638230731,
 792             'upload_date': '20211130',
 793         },
 794     }, {
 795         # single video article embedded with data-media-vpid
 796         'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
 797         'only_matching': True,
 798     }, {
 799         # bbcthreeConfig
 800         'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
 801         'info_dict': {
 802             'id': 'p06556y7',
 803             'ext': 'mp4',
 804             'title': 'Things Not To Say to people that live on council estates',
 805             'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
 806             'duration': 360,
 807             'thumbnail': r're:https?://.+/.+\.jpg',
 808         },
 809     }, {
 810         # window.__PRELOADED_STATE__
 811         'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
 812         'info_dict': {
 813             'id': 'b0b9z4vz',
 814             'ext': 'mp4',
 815             'title': 'Prom 6: An American in Paris and Turangalila',
 816             'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
 817             'uploader': 'Radio 3',
 818             'uploader_id': 'bbc_radio_three',
 819         },
 820     }, {
 821         'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
 822         'info_dict': {
 823             'id': 'p06w9tws',
 824             'ext': 'mp4',
 825             'title': 'md5:2fabf12a726603193a2879a055f72514',
 826             'description': 'Learn English words and phrases from this story',
 827         },
 828         'add_ie': [BBCCoUkIE.ie_key()],
 829     }, {
 830         # BBC Reel
 831         'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
 832         'info_dict': {
 833             'id': 'p07c6sb9',
 834             'ext': 'mp4',
 835             'title': 'How positive thinking is harming your happiness',
 836             'alt_title': 'The downsides of positive thinking',
 837             'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
 838             'duration': 235,
 839             'thumbnail': r're:https?://.+/p07c9dsr.jpg',
 840             'upload_date': '20190604',
 841             'categories': ['Psychology'],
 842         },
 843     }, {
 844         # BBC Sounds
 845         'url': 'https://www.bbc.co.uk/sounds/play/m001q78b',
 846         'info_dict': {
 847             'id': 'm001q789',
 848             'ext': 'mp4',
 849             'title': 'The Night Tracks Mix - Music for the darkling hour',
 850             'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0c00hym.jpg',
 851             'chapters': 'count:8',
 852             'description': 'md5:815fb51cbdaa270040aab8145b3f1d67',
 853             'uploader': 'Radio 3',
 854             'duration': 1800,
 855             'uploader_id': 'bbc_radio_three',
 856         },
 857     }, {  # onion routes
 858         'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
 859         'only_matching': True,
 860     }, {
 861         'url': 'https://www.bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad.onion/sport/av/football/63195681',
 862         'only_matching': True,
 863     }]
 864
 865     @classmethod
 866     def suitable(cls, url):
 867         EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
 868         return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
 869                 else super(BBCIE, cls).suitable(url))
 870
 871     def _extract_from_media_meta(self, media_meta, video_id):
 872         # Direct links to media in media metadata (e.g.
 873         # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
 874         # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
 875         source_files = media_meta.get('sourceFiles')
 876         if source_files:
 877             return [{
 878                 'url': f['url'],
 879                 'format_id': format_id,
 880                 'ext': f.get('encoding'),
 881                 'tbr': float_or_none(f.get('bitrate'), 1000),
 882                 'filesize': int_or_none(f.get('filesize')),
 883             } for format_id, f in source_files.items() if f.get('url')], []
 884
 885         programme_id = media_meta.get('externalId')
 886         if programme_id:
 887             return self._download_media_selector(programme_id)
 888
 889         # Process playlist.sxml as legacy playlist
 890         href = media_meta.get('href')
 891         if href:
 892             playlist = self._download_legacy_playlist_url(href)
 893             _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
 894             return formats, subtitles
 895
 896         return [], []
 897
 898     def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
 899         programme_id, title, description, duration, formats, subtitles = \
 900             self._process_legacy_playlist_url(url, playlist_id)
 901         return {
 902             'id': programme_id,
 903             'title': title,
 904             'description': description,
 905             'duration': duration,
 906             'timestamp': timestamp,
 907             'formats': formats,
 908             'subtitles': subtitles,
 909         }
 910
 911     def _real_extract(self, url):
 912         playlist_id = self._match_id(url)
 913
 914         webpage = self._download_webpage(url, playlist_id)
 915
 916         json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
 917         timestamp = json_ld_info.get('timestamp')
 918
 919         playlist_title = json_ld_info.get('title') or re.sub(
 920             r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None
 921
 922         playlist_description = json_ld_info.get(
 923             'description') or self._og_search_description(webpage, default=None)
 924
 925         if not timestamp:
 926             timestamp = parse_iso8601(self._search_regex(
 927                 [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
 928                  r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
 929                  r'"datePublished":\s*"([^"]+)'],
 930                 webpage, 'date', default=None))
 931
 932         entries = []
 933
 934         # article with multiple videos embedded with playlist.sxml (e.g.
 935         # http://www.bbc.com/sport/0/football/34475836)
 936         playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
 937         playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
 938         if playlists:
 939             entries = [
 940                 self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
 941                 for playlist_url in playlists]
 942
 943         # news article with multiple videos embedded with data-playable
 944         data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
 945         if data_playables:
 946             for _, data_playable_json in data_playables:
 947                 data_playable = self._parse_json(
 948                     unescapeHTML(data_playable_json), playlist_id, fatal=False)
 949                 if not data_playable:
 950                     continue
 951                 settings = data_playable.get('settings', {})
 952                 if settings:
 953                     # data-playable with video vpid in settings.playlistObject.items (e.g.
 954                     # http://www.bbc.com/news/world-us-canada-34473351)
 955                     playlist_object = settings.get('playlistObject', {})
 956                     if playlist_object:
 957                         items = playlist_object.get('items')
 958                         if items and isinstance(items, list):
 959                             title = playlist_object['title']
 960                             description = playlist_object.get('summary')
 961                             duration = int_or_none(items[0].get('duration'))
 962                             programme_id = items[0].get('vpid')
 963                             formats, subtitles = self._download_media_selector(programme_id)
 964                             entries.append({
 965                                 'id': programme_id,
 966                                 'title': title,
 967                                 'description': description,
 968                                 'timestamp': timestamp,
 969                                 'duration': duration,
 970                                 'formats': formats,
 971                                 'subtitles': subtitles,
 972                             })
 973                     else:
 974                         # data-playable without vpid but with a playlist.sxml URLs
 975                         # in otherSettings.playlist (e.g.
 976                         # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
 977                         playlist = data_playable.get('otherSettings', {}).get('playlist', {})
 978                         if playlist:
 979                             entry = None
 980                             for key in ('streaming', 'progressiveDownload'):
 981                                 playlist_url = playlist.get('%sUrl' % key)
 982                                 if not playlist_url:
 983                                     continue
 984                                 try:
 985                                     info = self._extract_from_playlist_sxml(
 986                                         playlist_url, playlist_id, timestamp)
 987                                     if not entry:
 988                                         entry = info
 989                                     else:
 990                                         entry['title'] = info['title']
 991                                         entry['formats'].extend(info['formats'])
 992                                 except ExtractorError as e:
 993                                     # Some playlist URL may fail with 500, at the same time
 994                                     # the other one may work fine (e.g.
 995                                     # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
 996                                     if isinstance(e.cause, HTTPError) and e.cause.status == 500:
 997                                         continue
 998                                     raise
 999                             if entry:
1000                                 entries.append(entry)
1001
1002         if entries:
1003             return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
1004
1005         # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
1006         group_id = self._search_regex(
1007             r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
1008             webpage, 'group id', default=None)
1009         if group_id:
1010             return self.url_result(
1011                 'https://www.bbc.co.uk/programmes/%s' % group_id,
1012                 ie=BBCCoUkIE.ie_key())
1013
1014         # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
1015         programme_id = self._search_regex(
1016             [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
1017              r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
1018              r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
1019             webpage, 'vpid', default=None)
1020
1021         if programme_id:
1022             formats, subtitles = self._download_media_selector(programme_id)
1023             # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
1024             digital_data = self._parse_json(
1025                 self._search_regex(
1026                     r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
1027                 programme_id, fatal=False)
1028             page_info = digital_data.get('page', {}).get('pageInfo', {})
1029             title = page_info.get('pageName') or self._og_search_title(webpage)
1030             description = page_info.get('description') or self._og_search_description(webpage)
1031             timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
1032             return {
1033                 'id': programme_id,
1034                 'title': title,
1035                 'description': description,
1036                 'timestamp': timestamp,
1037                 'formats': formats,
1038                 'subtitles': subtitles,
1039             }
1040
1041         # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
1042         initial_data = self._parse_json(self._html_search_regex(
1043             r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
1044             webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
1045         if initial_data:
1046             init_data = try_get(
1047                 initial_data, lambda x: x['initData']['items'][0], dict) or {}
1048             smp_data = init_data.get('smpData') or {}
1049             clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
1050             version_id = clip_data.get('versionID')
1051             if version_id:
1052                 title = smp_data['title']
1053                 formats, subtitles = self._download_media_selector(version_id)
1054                 image_url = smp_data.get('holdingImageURL')
1055                 display_date = init_data.get('displayDate')
1056                 topic_title = init_data.get('topicTitle')
1057
1058                 return {
1059                     'id': version_id,
1060                     'title': title,
1061                     'formats': formats,
1062                     'alt_title': init_data.get('shortTitle'),
1063                     'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
1064                     'description': smp_data.get('summary') or init_data.get('shortSummary'),
1065                     'upload_date': display_date.replace('-', '') if display_date else None,
1066                     'subtitles': subtitles,
1067                     'duration': int_or_none(clip_data.get('duration')),
1068                     'categories': [topic_title] if topic_title else None,
1069                 }
1070
1071         # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
1072         # There are several setPayload calls may be present but the video
1073         # seems to be always related to the first one
1074         morph_payload = self._parse_json(
1075             self._search_regex(
1076                 r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
1077                 webpage, 'morph payload', default='{}'),
1078             playlist_id, fatal=False)
1079         if morph_payload:
1080             components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
1081             for component in components:
1082                 if not isinstance(component, dict):
1083                     continue
1084                 lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
1085                 if not lead_media:
1086                     continue
1087                 identifiers = lead_media.get('identifiers')
1088                 if not identifiers or not isinstance(identifiers, dict):
1089                     continue
1090                 programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
1091                 if not programme_id:
1092                     continue
1093                 title = lead_media.get('title') or self._og_search_title(webpage)
1094                 formats, subtitles = self._download_media_selector(programme_id)
1095                 description = lead_media.get('summary')
1096                 uploader = lead_media.get('masterBrand')
1097                 uploader_id = lead_media.get('mid')
1098                 duration = None
1099                 duration_d = lead_media.get('duration')
1100                 if isinstance(duration_d, dict):
1101                     duration = parse_duration(dict_get(
1102                         duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
1103                 return {
1104                     'id': programme_id,
1105                     'title': title,
1106                     'description': description,
1107                     'duration': duration,
1108                     'uploader': uploader,
1109                     'uploader_id': uploader_id,
1110                     'formats': formats,
1111                     'subtitles': subtitles,
1112                 }
1113
1114         preload_state = self._parse_json(self._search_regex(
1115             r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
1116             'preload state', default='{}'), playlist_id, fatal=False)
1117         if preload_state:
1118             current_programme = preload_state.get('programmes', {}).get('current') or {}
1119             programme_id = current_programme.get('id')
1120             if current_programme and programme_id and current_programme.get('type') == 'playable_item':
1121                 title = current_programme.get('titles', {}).get('tertiary') or playlist_title
1122                 formats, subtitles = self._download_media_selector(programme_id)
1123                 synopses = current_programme.get('synopses') or {}
1124                 network = current_programme.get('network') or {}
1125                 duration = int_or_none(
1126                     current_programme.get('duration', {}).get('value'))
1127                 thumbnail = None
1128                 image_url = current_programme.get('image_url')
1129                 if image_url:
1130                     thumbnail = image_url.replace('{recipe}', 'raw')
1131                 return {
1132                     'id': programme_id,
1133                     'title': title,
1134                     'description': dict_get(synopses, ('long', 'medium', 'short')),
1135                     'thumbnail': thumbnail,
1136                     'duration': duration,
1137                     'uploader': network.get('short_title'),
1138                     'uploader_id': network.get('id'),
1139                     'formats': formats,
1140                     'subtitles': subtitles,
1141                     'chapters': traverse_obj(preload_state, (
1142                         'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), {
1143                             'title': ('titles', {lambda x: join_nonempty(
1144                                 'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
1145                             'start_time': ('offset', 'start', {float_or_none}),
1146                             'end_time': ('offset', 'end', {float_or_none}),
1147                         })) or None,
1148                 }
1149
1150         bbc3_config = self._parse_json(
1151             self._search_regex(
1152                 r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
1153                 'bbcthree config', default='{}'),
1154             playlist_id, transform_source=js_to_json, fatal=False) or {}
1155         payload = bbc3_config.get('payload') or {}
1156         if payload:
1157             clip = payload.get('currentClip') or {}
1158             clip_vpid = clip.get('vpid')
1159             clip_title = clip.get('title')
1160             if clip_vpid and clip_title:
1161                 formats, subtitles = self._download_media_selector(clip_vpid)
1162                 return {
1163                     'id': clip_vpid,
1164                     'title': clip_title,
1165                     'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
1166                     'description': clip.get('description'),
1167                     'duration': parse_duration(clip.get('duration')),
1168                     'formats': formats,
1169                     'subtitles': subtitles,
1170                 }
1171             bbc3_playlist = try_get(
1172                 payload, lambda x: x['content']['bbcMedia']['playlist'],
1173                 dict)
1174             if bbc3_playlist:
1175                 playlist_title = bbc3_playlist.get('title') or playlist_title
1176                 thumbnail = bbc3_playlist.get('holdingImageURL')
1177                 entries = []
1178                 for bbc3_item in bbc3_playlist['items']:
1179                     programme_id = bbc3_item.get('versionID')
1180                     if not programme_id:
1181                         continue
1182                     formats, subtitles = self._download_media_selector(programme_id)
1183                     entries.append({
1184                         'id': programme_id,
1185                         'title': playlist_title,
1186                         'thumbnail': thumbnail,
1187                         'timestamp': timestamp,
1188                         'formats': formats,
1189                         'subtitles': subtitles,
1190                     })
1191                 return self.playlist_result(
1192                     entries, playlist_id, playlist_title, playlist_description)
1193
1194         initial_data = self._search_regex(
1195             r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
1196             'quoted preload state', default=None)
1197         if initial_data is None:
1198             initial_data = self._search_regex(
1199                 r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
1200                 'preload state', default='{}')
1201         else:
1202             initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
1203         initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
1204         if initial_data:
1205             def parse_media(media):
1206                 if not media:
1207                     return
1208                 for item in (try_get(media, lambda x: x['media']['items'], list) or []):
1209                     item_id = item.get('id')
1210                     item_title = item.get('title')
1211                     if not (item_id and item_title):
1212                         continue
1213                     formats, subtitles = self._download_media_selector(item_id)
1214                     item_desc = None
1215                     blocks = try_get(media, lambda x: x['summary']['blocks'], list)
1216                     if blocks:
1217                         summary = []
1218                         for block in blocks:
1219                             text = try_get(block, lambda x: x['model']['text'], compat_str)
1220                             if text:
1221                                 summary.append(text)
1222                         if summary:
1223                             item_desc = '\n\n'.join(summary)
1224                     item_time = None
1225                     for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
1226                         if try_get(meta, lambda x: x['label']) == 'Published':
1227                             item_time = unified_timestamp(meta.get('timestamp'))
1228                             break
1229                     entries.append({
1230                         'id': item_id,
1231                         'title': item_title,
1232                         'thumbnail': item.get('holdingImageUrl'),
1233                         'formats': formats,
1234                         'subtitles': subtitles,
1235                         'timestamp': item_time,
1236                         'description': strip_or_none(item_desc),
1237                     })
1238             for resp in (initial_data.get('data') or {}).values():
1239                 name = resp.get('name')
1240                 if name == 'media-experience':
1241                     parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
1242                 elif name == 'article':
1243                     for block in (try_get(resp,
1244                                           (lambda x: x['data']['blocks'],
1245                                            lambda x: x['data']['content']['model']['blocks'],),
1246                                           list) or []):
1247                         if block.get('type') not in ['media', 'video']:
1248                             continue
1249                         parse_media(block.get('model'))
1250             return self.playlist_result(
1251                 entries, playlist_id, playlist_title, playlist_description)
1252
1253         def extract_all(pattern):
1254             return list(filter(None, map(
1255                 lambda s: self._parse_json(s, playlist_id, fatal=False),
1256                 re.findall(pattern, webpage))))
1257
1258         # Multiple video article (e.g.
1259         # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
1260         EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
1261         entries = []
1262         for match in extract_all(r'new\s+SMP\(({.+?})\)'):
1263             embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
1264             if embed_url and re.match(EMBED_URL, embed_url):
1265                 entries.append(embed_url)
1266         entries.extend(re.findall(
1267             r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
1268         if entries:
1269             return self.playlist_result(
1270                 [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
1271                 playlist_id, playlist_title, playlist_description)
1272
1273         # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
1274         medias = extract_all(r"data-media-meta='({[^']+})'")
1275
1276         if not medias:
1277             # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
1278             media_asset = self._search_regex(
1279                 r'mediaAssetPage\.init\(\s*({.+?}), "/',
1280                 webpage, 'media asset', default=None)
1281             if media_asset:
1282                 media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
1283                 medias = []
1284                 for video in media_asset_page.get('videos', {}).values():
1285                     medias.extend(video.values())
1286
1287         if not medias:
1288             # Multiple video playlist with single `now playing` entry (e.g.
1289             # http://www.bbc.com/news/video_and_audio/must_see/33767813)
1290             vxp_playlist = self._parse_json(
1291                 self._search_regex(
1292                     r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
1293                     webpage, 'playlist data'),
1294                 playlist_id)
1295             playlist_medias = []
1296             for item in vxp_playlist:
1297                 media = item.get('media')
1298                 if not media:
1299                     continue
1300                 playlist_medias.append(media)
1301                 # Download single video if found media with asset id matching the video id from URL
1302                 if item.get('advert', {}).get('assetId') == playlist_id:
1303                     medias = [media]
1304                     break
1305             # Fallback to the whole playlist
1306             if not medias:
1307                 medias = playlist_medias
1308
1309         entries = []
1310         for num, media_meta in enumerate(medias, start=1):
1311             formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
1312             if not formats and not self.get_param('ignore_no_formats'):
1313                 continue
1314
1315             video_id = media_meta.get('externalId')
1316             if not video_id:
1317                 video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
1318
1319             title = media_meta.get('caption')
1320             if not title:
1321                 title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
1322
1323             duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
1324
1325             images = []
1326             for image in media_meta.get('images', {}).values():
1327                 images.extend(image.values())
1328             if 'image' in media_meta:
1329                 images.append(media_meta['image'])
1330
1331             thumbnails = [{
1332                 'url': image.get('href'),
1333                 'width': int_or_none(image.get('width')),
1334                 'height': int_or_none(image.get('height')),
1335             } for image in images]
1336
1337             entries.append({
1338                 'id': video_id,
1339                 'title': title,
1340                 'thumbnails': thumbnails,
1341                 'duration': duration,
1342                 'timestamp': timestamp,
1343                 'formats': formats,
1344                 'subtitles': subtitles,
1345             })
1346
1347         return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
1348
1349
class BBCCoUkArticleIE(InfoExtractor):
    """Turn a bbc.co.uk ``/programmes/articles/<id>`` page into a playlist of its clips."""

    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist with one URL entry per clip referenced by the article.

        The playlist id is taken from *url*; title and description are read
        from the page's Open Graph metadata.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # strip_or_none() keeps a missing (None) description from raising
        # AttributeError, which a bare `.strip()` call would do
        description = strip_or_none(self._og_search_description(webpage))

        # Each embedded clip is a <div typeof="Clip" resource="<programme url>">
        clip_urls = re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)
        entries = [self.url_result(programme_url) for programme_url in clip_urls]

        return self.playlist_result(entries, playlist_id, title, description)
1378
1379
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared logic for bbc.co.uk playlists that are paginated as HTML pages.

    Subclasses are expected to provide:
      * ``_VIDEO_ID_TEMPLATE`` - regex template with one ``%s`` slot, filled
        with ``BBCCoUkIE._ID_REGEX``, whose single group captures a video id
      * ``_URL_TEMPLATE`` - ``%s`` template turning a video id into a URL
      * ``_extract_title_and_description(webpage)`` - returns a
        ``(title, description)`` pair
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a url_result for every video id found on the playlist pages.

        *webpage* is the already downloaded first page. When *url* carries an
        explicit ``page`` query parameter only that page is processed;
        otherwise the "next" pagination link is followed until there is none.
        """
        single_page = 'page' in parse_qs(url)
        # Loop invariant, so build the id pattern once
        video_id_re = self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX
        # The first page is already in hand, so the next download is page 2
        for page_num in itertools.count(2):
            for video_id in re.findall(video_id_re, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # Third and fourth positional arguments are the progress note and
            # the error note; pass a readable message for both instead of the
            # bare page number as the error note
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                'Downloading page %d' % page_num,
                'Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Download the playlist page and return a lazily evaluated playlist."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
1410
1411
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
    """Shared logic for iPlayer playlists that are listed through a paged API.

    Subclasses are expected to provide ``_PAGE_SIZE``, ``_DESCRIPTION_KEY``
    and the hooks ``_call_api``, ``_get_elements``, ``_get_episode``,
    ``_get_episode_image``, ``_get_episode_field``, ``_get_playlist_data``
    and ``_get_playlist_title``.
    """

    # The doubled ``%%s`` survives the formatting below as ``%s`` so that a
    # subclass can fill in its URL path segment
    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    @staticmethod
    def _get_default(episode, key, default_key='default'):
        """Look up ``episode[key][default_key]`` through try_get."""
        return try_get(episode, lambda x: x[key][default_key])

    def _get_description(self, data):
        """Pick a description from the synopsis mapping stored under ``_DESCRIPTION_KEY``.

        The 'large', 'medium' and 'small' variants are tried in that order.
        """
        synopsis = data.get(self._DESCRIPTION_KEY) or {}
        return dict_get(synopsis, ('large', 'medium', 'small'))

    def _fetch_page(self, programme_id, per_page, series_id, page):
        """Yield one url-type entry per episode on the zero-based *page*.

        Elements whose episode has no id are skipped.
        """
        # The API is called with a page number one higher than *page*
        elements = self._get_elements(self._call_api(
            programme_id, per_page, page + 1, series_id))
        for element in elements:
            episode = self._get_episode(element)
            episode_id = episode.get('id')
            if not episode_id:
                continue
            image = self._get_episode_image(episode)
            category = self._get_default(episode, 'labels', 'category')
            yield {
                '_type': 'url',
                'id': episode_id,
                'title': self._get_episode_field(episode, 'subtitle'),
                'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                # The image URL carries a '{recipe}' placeholder
                'thumbnail': image.replace('{recipe}', 'raw') if image else None,
                'description': self._get_description(episode),
                'categories': [category] if category else None,
                'series': self._get_episode_field(episode, 'title'),
                'ie_key': BBCCoUkIE.ie_key(),
            }

    def _real_extract(self, url):
        """Return the playlist for *url*.

        With a numeric ``page`` query parameter only that page is extracted;
        otherwise every page is fetched on demand. A ``page`` value that is
        not an integer is ignored instead of raising ValueError.
        """
        pid = self._match_id(url)
        qs = parse_qs(url)
        series_id = qs.get('seriesId', [None])[0]
        # int_or_none() turns a non-numeric value into None rather than raising
        page = int_or_none(qs.get('page', [None])[0])
        single_page = page is not None
        # NOTE(review): 36 is presumably the page size used by the website
        # for explicit pages - TODO confirm
        per_page = 36 if single_page else self._PAGE_SIZE
        fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id)
        if single_page:
            entries = fetch_page(page - 1)
        else:
            entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)
        # A one-item request is enough to read the playlist level metadata
        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
        return self.playlist_result(
            entries, pid, self._get_playlist_title(playlist_data),
            self._get_description(playlist_data))
1460
1461
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
    """Playlist extractor for ``/iplayer/episodes/<pid>`` pages."""

    IE_NAME = 'bbc.co.uk:iplayer:episodes'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
    _PAGE_SIZE = 100
    _DESCRIPTION_KEY = 'synopsis'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
        },
        'playlist_mincount': 8,
    }, {
        # all seasons
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 10,
    }, {
        # explicit season
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 5,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 37,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 1,
    }]

    def _get_episode_image(self, episode):
        """Return the episode's default image template, if any."""
        image = self._get_default(episode, 'image')
        return image

    def _get_episode_field(self, episode, field):
        """Return the 'default' variant of *field* from *episode*."""
        value = self._get_default(episode, field)
        return value

    @staticmethod
    def _get_elements(data):
        """Return the result list nested under ``entities`` in the programme data."""
        entities = data['entities']
        return entities['results']

    @staticmethod
    def _get_episode(element):
        """Return the element's ``episode`` mapping, or an empty dict when it is missing or falsy."""
        episode = element.get('episode')
        return episode if episode else {}

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """POST the programme query and return the ``data.programme`` object.

        *series_id*, when given, is sent as the ``sliceId`` variable.
        """
        variables = {'id': pid, 'page': page, 'perPage': per_page}
        if series_id:
            variables['sliceId'] = series_id
        # NOTE(review): the top-level 'id' looks like the identifier of a
        # server-side stored query - TODO confirm
        payload = json.dumps({
            'id': '5692d93d5aac8d796a0305e895e61551',
            'variables': variables,
        }).encode('utf-8')
        response = self._download_json(
            'https://graph.ibl.api.bbc.co.uk/', pid,
            headers={'Content-Type': 'application/json'}, data=payload)
        return response['data']['programme']

    @staticmethod
    def _get_playlist_data(data):
        """Return *data* unchanged; the programme object already holds the playlist metadata."""
        return data

    def _get_playlist_title(self, data):
        """Return the 'default' variant of the programme title."""
        title = self._get_default(data, 'title')
        return title
1549
1550
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
    """Playlist extractor for ``/iplayer/group/<pid>`` pages."""

    IE_NAME = 'bbc.co.uk:iplayer:group'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
    _PAGE_SIZE = 200
    _DESCRIPTION_KEY = 'synopses'
    _TESTS = [{
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 47,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 11,
    }]

    def _get_episode_image(self, episode):
        """Return the 'standard' entry of the episode's ``images`` mapping, if any."""
        image = self._get_default(episode, 'images', 'standard')
        return image

    def _get_episode_field(self, episode, field):
        """Return *field* read directly from the episode mapping."""
        value = episode.get(field)
        return value

    @staticmethod
    def _get_elements(data):
        """Return the list stored under ``elements``."""
        elements = data['elements']
        return elements

    @staticmethod
    def _get_episode(element):
        """Return *element* itself; here each element already is the episode."""
        return element

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Fetch one page of the group's episodes and return the ``group_episodes`` object.

        *series_id* is accepted because the base class always passes it, but
        it is not used by this request.
        """
        api_url = 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid
        paging = {
            'page': page,
            'per_page': per_page,
        }
        response = self._download_json(api_url, pid, query=paging)
        return response['group_episodes']

    @staticmethod
    def _get_playlist_data(data):
        """Return the ``group`` object that carries the playlist metadata."""
        group = data['group']
        return group

    def _get_playlist_title(self, data):
        """Return the group's ``title`` value."""
        title = data.get('title')
        return title
1613
1614
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for the episodes, broadcasts and clips listings of a programme."""

    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Turns a matched programme id into the URL of that programme
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    # The ``%s`` slot is filled with BBCCoUkIE._ID_REGEX by the base class
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return the ``(title, description)`` pair read from the page's Open Graph metadata.

        The title lookup is explicitly non-fatal.
        """
        return (
            self._og_search_title(webpage, fatal=False),
            self._og_search_description(webpage),
        )
1659         description = self._og_search_description(webpage)
1660         return title, description