]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/bbc.py
[extractors] Use new framework for existing embeds (#4307)
[yt-dlp.git] / yt_dlp / extractor / bbc.py
CommitLineData
1418a043 1import functools
254e64a2 2import itertools
1418a043 3import json
f0228f56 4import re
ac668111 5import urllib.error
6import xml.etree.ElementTree
082c6c86 7
f13b1e7d 8from .common import InfoExtractor
ac668111 9from ..compat import compat_HTTPError, compat_str, compat_urlparse
8683b4d8 10from ..utils import (
3721515b 11 ExtractorError,
1418a043 12 OnDemandPagedList,
97067db2 13 clean_html,
9fb64c04 14 dict_get,
9afa1770 15 float_or_none,
97067db2 16 get_element_by_class,
8683b4d8 17 int_or_none,
6d155707 18 js_to_json,
9afa1770
S
19 parse_duration,
20 parse_iso8601,
4dfbf869 21 parse_qs,
1bdae7d3 22 strip_or_none,
9fb64c04 23 try_get,
dab062fb 24 unescapeHTML,
1bdae7d3 25 unified_timestamp,
f0228f56 26 url_or_none,
97067db2
S
27 urlencode_postdata,
28 urljoin,
8683b4d8 29)
082c6c86 30
d12a1a47 31
f13b1e7d 32class BBCCoUkIE(InfoExtractor):
082c6c86 33 IE_NAME = 'bbc.co.uk'
2e3fd9ec 34 IE_DESC = 'BBC iPlayer'
50e93e03 35 _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
f20a11ed
S
36 _VALID_URL = r'''(?x)
37 https?://
38 (?:www\.)?bbc\.co\.uk/
39 (?:
40 programmes/(?!articles/)|
41 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
72d256c4 42 music/(?:clips|audiovideo/popular)[/#]|
d3d45e0a 43 radio/player/|
b72305f0 44 sounds/play/|
d3d45e0a 45 events/[^/]+/play/[^/]+/
f20a11ed 46 )
ded7511a 47 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
f20a11ed 48 ''' % _ID_REGEX
bfd973ec 49 _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']
082c6c86 50
97067db2
S
51 _LOGIN_URL = 'https://account.bbc.com/signin'
52 _NETRC_MACHINE = 'bbc'
53
29f7c58a 54 _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
55 _MEDIA_SETS = [
26ccc68b
S
56 # Provides HQ HLS streams with even better quality than pc mediaset but fails
57 # with geolocation in some cases even when it's not geo restricted at all (e.g.
d781e293 58 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
29f7c58a 59 'iptv-all',
60 'pc',
d12a1a47 61 ]
a8b081a0 62
e6174ee9
S
63 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
64
2e3fd9ec
S
65 _TESTS = [
66 {
f2d0fc68 67 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 68 'info_dict': {
f2d0fc68 69 'id': 'b039d07m',
b1ea6802 70 'ext': 'flv',
acc86c9a 71 'title': 'Kaleidoscope, Leonard Cohen',
c4914185 72 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
73 },
74 'params': {
b1ea6802 75 # rtmp download
2e3fd9ec
S
76 'skip_download': True,
77 }
082c6c86 78 },
2e3fd9ec
S
79 {
80 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
81 'info_dict': {
82 'id': 'b00yng1d',
83 'ext': 'flv',
84 'title': 'The Man in Black: Series 3: The Printed Name',
85 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
86 'duration': 1800,
87 },
88 'params': {
89 # rtmp download
90 'skip_download': True,
c7f0177f
S
91 },
92 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
93 },
94 {
95 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
96 'info_dict': {
97 'id': 'b00yng1d',
98 'ext': 'flv',
17968e44 99 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 100 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 101 'duration': 5100,
2e3fd9ec
S
102 },
103 'params': {
104 # rtmp download
105 'skip_download': True,
106 },
b1ea6802 107 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
108 },
109 {
110 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
111 'info_dict': {
112 'id': 'b03k3pb7',
113 'ext': 'flv',
114 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
115 'description': '2. Invasion',
116 'duration': 3600,
117 },
118 'params': {
119 # rtmp download
120 'skip_download': True,
121 },
b1ea6802 122 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
123 }, {
124 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
125 'info_dict': {
126 'id': 'b04v209v',
127 'ext': 'flv',
128 'title': 'Pete Tong, The Essential New Tune Special',
129 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
130 'duration': 10800,
131 },
132 'params': {
133 # rtmp download
134 'skip_download': True,
a3ef0e1c
YCH
135 },
136 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 137 }, {
5aa535c3 138 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
139 'note': 'Audio',
140 'info_dict': {
5aa535c3 141 'id': 'p022h44j',
b1ea6802 142 'ext': 'flv',
5aa535c3
S
143 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
144 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
145 'duration': 227,
c7e67594
S
146 },
147 'params': {
b1ea6802 148 # rtmp download
c7e67594
S
149 'skip_download': True,
150 }
151 }, {
152 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
153 'note': 'Video',
154 'info_dict': {
155 'id': 'p025c103',
b1ea6802 156 'ext': 'flv',
c7e67594
S
157 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
158 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
159 'duration': 226,
160 },
161 'params': {
b1ea6802 162 # rtmp download
c7e67594
S
163 'skip_download': True,
164 }
e68ae99a
S
165 }, {
166 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
167 'info_dict': {
168 'id': 'p02n76xf',
169 'ext': 'flv',
170 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
171 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
172 'duration': 3540,
173 },
174 'params': {
175 # rtmp download
176 'skip_download': True,
177 },
b1ea6802 178 'skip': 'geolocation',
25fa8d66
YCH
179 }, {
180 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
181 'info_dict': {
182 'id': 'b05zmgw1',
183 'ext': 'flv',
184 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
185 'title': 'Royal Academy Summer Exhibition',
186 'duration': 3540,
187 },
188 'params': {
189 # rtmp download
190 'skip_download': True,
191 },
b1ea6802 192 'skip': 'geolocation',
54914380
S
193 }, {
194 # iptv-all mediaset fails with geolocation however there is no geo restriction
195 # for this programme at all
5aa535c3 196 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 197 'info_dict': {
5aa535c3 198 'id': 'b06rkms3',
54914380 199 'ext': 'flv',
5aa535c3
S
200 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
201 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
202 },
203 'params': {
204 # rtmp download
205 'skip_download': True,
206 },
b1ea6802 207 'skip': 'Now it\'s really geo-restricted',
1ac6e794 208 }, {
067aa17e 209 # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
1ac6e794
S
210 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
211 'info_dict': {
212 'id': 'p028bfkj',
b1ea6802 213 'ext': 'flv',
1ac6e794
S
214 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
215 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
216 },
217 'params': {
b1ea6802 218 # rtmp download
1ac6e794
S
219 'skip_download': True,
220 },
b72305f0
J
221 }, {
222 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
223 'note': 'Audio',
224 'info_dict': {
225 'id': 'm0007jz9',
226 'ext': 'mp4',
227 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
228 'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
229 'duration': 9840,
230 },
231 'params': {
232 # rtmp download
233 'skip_download': True,
234 }
31763975
S
235 }, {
236 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
237 'only_matching': True,
c7e67594
S
238 }, {
239 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
240 'only_matching': True,
0692ef86
S
241 }, {
242 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
243 'only_matching': True,
f20a11ed
S
244 }, {
245 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
246 'only_matching': True,
72d256c4
S
247 }, {
248 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
249 'only_matching': True,
53647dfd
S
250 }, {
251 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
252 'only_matching': True,
6f356cbb
S
253 }, {
254 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
255 'only_matching': True,
256 }, {
257 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
258 'only_matching': True,
72d256c4 259 }]
2e3fd9ec 260
def _perform_login(self, username, password):
    """Sign in to the BBC account service with the given credentials.

    Fetches the sign-in page, submits its hidden form fields together with
    the username and password, and raises ExtractorError when the response
    URL still contains the sign-in URL (treated here as a failed login).
    """
    signin_url = self._LOGIN_URL
    signin_html = self._download_webpage(
        signin_url, None, 'Downloading signin page')

    # Start from the page's hidden inputs and add the credentials on top.
    credentials = self._hidden_inputs(signin_html)
    credentials['username'] = username
    credentials['password'] = password

    # The form action may be relative; fall back to the sign-in URL itself.
    form_action = self._search_regex(
        r'<form[^>]+action=(["\'])(?P<url>.+?)\1', signin_html,
        'post url', default=signin_url, group='url')
    submit_url = urljoin(signin_url, form_action)

    page, handle = self._download_webpage_handle(
        submit_url, None, 'Logging in',
        data=urlencode_postdata(credentials),
        headers={'Referer': signin_url})

    if signin_url not in handle.geturl():
        return

    # Still on the sign-in page: report the form message when there is one.
    message = clean_html(get_element_by_class('form-message', page))
    if not message:
        raise ExtractorError('Unable to log in')
    raise ExtractorError('Unable to login: %s' % message, expected=True)
286
d12a1a47
S
class MediaSelectionError(Exception):
    """Raised when a media selection response carries an error result.

    The error code taken from the response is available as ``id``.
    """

    def __init__(self, id):
        # Forward the code to Exception so that args, str() and repr()
        # carry it instead of being empty.
        super().__init__(id)
        self.id = id
290
2e3fd9ec
S
291 def _extract_asx_playlist(self, connection, programme_id):
292 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
293 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
294
2e3fd9ec 295 def _extract_items(self, playlist):
e6174ee9
S
296 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
297
2e3fd9ec 298 def _extract_medias(self, media_selection):
29f7c58a 299 error = media_selection.get('result')
300 if error:
301 raise BBCCoUkIE.MediaSelectionError(error)
302 return media_selection.get('media') or []
2e3fd9ec
S
303
304 def _extract_connections(self, media):
29f7c58a 305 return media.get('connection') or []
2e3fd9ec 306
def _get_subtitles(self, media, programme_id):
    """Return English TTML subtitles for a captions *media* entry.

    Walks the media's connections and uses the first one whose ``href``
    passes ``url_or_none`` and whose download yields an XML element.
    Returns ``{}`` when no connection qualifies.
    """
    for connection in self._extract_connections(media):
        cc_url = url_or_none(connection.get('href'))
        if not cc_url:
            continue
        # Non-fatal download: anything other than a parsed element means
        # this connection is unusable, so try the next one.
        captions = self._download_xml(
            cc_url, programme_id, 'Downloading captions', fatal=False)
        if isinstance(captions, xml.etree.ElementTree.Element):
            return {
                'en': [{
                    'url': connection.get('href'),
                    'ext': 'ttml',
                }],
            }
    return {}
082c6c86 325
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Re-raise a media selection error as an expected ExtractorError naming this extractor."""
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
330
c056efa2 331 def _download_media_selector(self, programme_id):
d12a1a47 332 last_exception = None
29f7c58a 333 for media_set in self._MEDIA_SETS:
d12a1a47
S
334 try:
335 return self._download_media_selector_url(
29f7c58a 336 self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
d12a1a47 337 except BBCCoUkIE.MediaSelectionError as e:
d781e293 338 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
d12a1a47
S
339 last_exception = e
340 continue
341 self._raise_extractor_error(e)
342 self._raise_extractor_error(last_exception)
9afa1770
S
343
344 def _download_media_selector_url(self, url, programme_id=None):
29f7c58a 345 media_selection = self._download_json(
346 url, programme_id, 'Downloading media selection JSON',
9283d4ea 347 expected_status=(403, 404))
9afa1770 348 return self._process_media_selector(media_selection, programme_id)
082c6c86 349
def _process_media_selector(self, media_selection, programme_id):
    """Turn a media selection response into ``(formats, subtitles)``.

    Each 'video'/'audio' media entry contributes formats built from its
    connections; a 'captions' entry supplies the subtitles. ``subtitles``
    stays ``None`` when no 'captions' entry is seen. Errors reported by the
    response itself surface through ``_extract_medias``.
    """
    formats = []
    subtitles = None
    # hrefs already handled, shared across all media entries so that a
    # connection repeating an earlier href is skipped.
    urls = []

    for media in self._extract_medias(media_selection):
        kind = media.get('kind')
        if kind in ('video', 'audio'):
            bitrate = int_or_none(media.get('bitrate'))
            encoding = media.get('encoding')
            width = int_or_none(media.get('width'))
            height = int_or_none(media.get('height'))
            file_size = int_or_none(media.get('media_file_size'))
            for connection in self._extract_connections(media):
                href = connection.get('href')
                if href in urls:
                    continue
                # Only truthy hrefs are remembered for de-duplication.
                if href:
                    urls.append(href)
                conn_kind = connection.get('kind')
                protocol = connection.get('protocol')
                supplier = connection.get('supplier')
                transfer_format = connection.get('transferFormat')
                # First available of supplier, connection kind, protocol.
                format_id = supplier or conn_kind or protocol
                # ASX playlist
                if supplier == 'asx':
                    for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
                        formats.append({
                            'url': ref,
                            'format_id': 'ref%s_%s' % (i, format_id),
                        })
                elif transfer_format == 'dash':
                    formats.extend(self._extract_mpd_formats(
                        href, programme_id, mpd_id=format_id, fatal=False))
                elif transfer_format == 'hls':
                    # TODO: let expected_status be passed into _extract_xxx_formats() instead
                    try:
                        fmts = self._extract_m3u8_formats(
                            href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                            m3u8_id=format_id, fatal=False)
                    except ExtractorError as e:
                        # An HTTP 403/404 just means no formats from this
                        # connection; anything else is re-raised.
                        if not (isinstance(e.exc_info[1], urllib.error.HTTPError)
                                and e.exc_info[1].code in (403, 404)):
                            raise
                        fmts = []
                    formats.extend(fmts)
                elif transfer_format == 'hds':
                    formats.extend(self._extract_f4m_formats(
                        href, programme_id, f4m_id=format_id, fatal=False))
                else:
                    # No recognised transfer format: build a single format
                    # from the connection and media fields.
                    if not supplier and bitrate:
                        format_id += '-%d' % bitrate
                    fmt = {
                        'format_id': format_id,
                        'filesize': file_size,
                    }
                    if kind == 'video':
                        fmt.update({
                            'width': width,
                            'height': height,
                            'tbr': bitrate,
                            'vcodec': encoding,
                        })
                    else:
                        fmt.update({
                            'abr': bitrate,
                            'acodec': encoding,
                            'vcodec': 'none',
                        })
                    if protocol in ('http', 'https'):
                        # Direct link
                        fmt.update({
                            'url': href,
                        })
                    elif protocol == 'rtmp':
                        application = connection.get('application', 'ondemand')
                        auth_string = connection.get('authString')
                        identifier = connection.get('identifier')
                        server = connection.get('server')
                        fmt.update({
                            'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                            'play_path': identifier,
                            'app': '%s?%s' % (application, auth_string),
                            'page_url': 'http://www.bbc.co.uk',
                            'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                            'rtmp_live': False,
                            'ext': 'flv',
                        })
                    else:
                        # Any other protocol: drop this connection without
                        # adding a format.
                        continue
                    formats.append(fmt)
        elif kind == 'captions':
            subtitles = self.extract_subtitles(media, programme_id)
    return formats, subtitles
2e3fd9ec 444
ae6986fb
S
def _download_playlist(self, playlist_id):
    """Resolve a programme through its ``playlist.json``.

    Returns ``(programme_id, title, description, duration, formats,
    subtitles)`` with formats and subtitles merged over every available
    version. When an ExtractorError caused by an HTTP 404 is raised while
    doing so, the legacy playlist is used instead; other errors propagate.
    """
    try:
        playlist = self._download_json(
            'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
            playlist_id, 'Downloading playlist JSON')
        formats, subtitles = [], {}

        # NOTE(review): programme_id, title, description and duration are
        # only bound inside these loops, so a playlist without versions or
        # programme items would fail at the return below - confirm whether
        # that can happen in practice.
        for version in playlist.get('allAvailableVersions', []):
            smp_config = version['smpConfig']
            title = smp_config['title']
            description = smp_config['summary']
            for item in smp_config['items']:
                if item['kind'] not in ('programme', 'radioProgramme'):
                    continue
                programme_id = item.get('vpid')
                duration = int_or_none(item.get('duration'))
                version_formats, version_subtitles = self._download_media_selector(programme_id)
                types = version['types']
                # Label every format with the version types and rank
                # audio-described versions lower.
                for fmt in version_formats:
                    fmt['format_note'] = ', '.join(types)
                    if any('AudioDescribed' in type_name for type_name in types):
                        fmt['language_preference'] = -10
                formats.extend(version_formats)
                for lang, tracks in (version_subtitles or {}).items():
                    subtitles.setdefault(lang, []).extend(tracks)

        return programme_id, title, description, duration, formats, subtitles
    except ExtractorError as ee:
        is_not_found = isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404
        if not is_not_found:
            raise

    # fallback to legacy playlist
    return self._process_legacy_playlist(playlist_id)
480
481 def _process_legacy_playlist_url(self, url, display_id):
482 playlist = self._download_legacy_playlist_url(url, display_id)
483 return self._extract_from_legacy_playlist(playlist, display_id)
484
485 def _process_legacy_playlist(self, playlist_id):
486 return self._process_legacy_playlist_url(
487 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
488
489 def _download_legacy_playlist_url(self, url, playlist_id=None):
490 return self._download_xml(
491 url, playlist_id, 'Downloading legacy playlist XML')
ae6986fb 492
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme data from a parsed legacy (EMP) playlist.

    Returns ``(programme_id, title, description, duration, formats,
    subtitles)`` for the programme items of *playlist*; when several
    programme items are present the values of the last one are returned.

    Raises an expected ExtractorError when the playlist contains a
    ``noItems`` element, with a message derived from its ``reason``.
    """
    ns = self._EMP_PLAYLIST_NS

    no_items = playlist.find('./{%s}noItems' % ns)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First of 'identifier' / 'group' that looks like a programme id.
        for attr in ('identifier', 'group'):
            value = element.get(attr)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value
        return None

    def get_programme_id(item):
        # Prefer an id carried by the item itself. This lookup used to be
        # performed and its result thrown away, so only the mediator child
        # was ever consulted.
        programme_id = get_from_attributes(item)
        if programme_id:
            return programme_id
        mediator = item.find('./{%s}mediator' % ns)
        if mediator is not None:
            return get_from_attributes(mediator)
        return None

    # NOTE(review): the returned names are only bound inside this loop, so
    # a playlist without any programme item would fail at the return below
    # - confirm whether that can happen in practice.
    for item in self._extract_items(playlist):
        kind = item.get('kind')
        if kind not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % ns).text
        description_el = playlist.find('./{%s}summary' % ns)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No usable id: treat the item itself as the media selection
            # and fall back to the playlist id.
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

    return programme_id, title, description, duration, formats, subtitles
536
c056efa2
S
537 def _real_extract(self, url):
538 group_id = self._match_id(url)
539
540 webpage = self._download_webpage(url, group_id, 'Downloading video page')
541
b2ed954f 542 error = self._search_regex(
29f7c58a 543 r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
b2ed954f
S
544 webpage, 'error', default=None)
545 if error:
546 raise ExtractorError(error, expected=True)
547
8683b4d8 548 programme_id = None
679bacf0 549 duration = None
8683b4d8
S
550
551 tviplayer = self._search_regex(
552 r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
553 webpage, 'player', default=None)
554
555 if tviplayer:
556 player = self._parse_json(tviplayer, group_id).get('player', {})
557 duration = int_or_none(player.get('duration'))
558 programme_id = player.get('vpid')
559
560 if not programme_id:
561 programme_id = self._search_regex(
22d7368d 562 r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
8683b4d8 563
c056efa2 564 if programme_id:
c056efa2 565 formats, subtitles = self._download_media_selector(programme_id)
88fb59d9 566 title = self._og_search_title(webpage, default=None) or self._html_search_regex(
50e989e2
S
567 (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
568 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
8683b4d8 569 description = self._search_regex(
a8534274
S
570 (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
571 r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
679bacf0
YCH
572 webpage, 'description', default=None)
573 if not description:
574 description = self._html_search_meta('description', webpage)
c056efa2 575 else:
ae6986fb 576 programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
2e3fd9ec 577
082c6c86
S
578 self._sort_formats(formats)
579
580 return {
2e3fd9ec 581 'id': programme_id,
082c6c86
S
582 'title': title,
583 'description': description,
650cfd0c 584 'thumbnail': self._og_search_thumbnail(webpage, default=None),
082c6c86
S
585 'duration': duration,
586 'formats': formats,
2e3fd9ec 587 'subtitles': subtitles,
5f6a1245 588 }
10273d6e 589
590
9afa1770
S
591class BBCIE(BBCCoUkIE):
592 IE_NAME = 'bbc'
593 IE_DESC = 'BBC'
594 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 595
29f7c58a 596 _MEDIA_SETS = [
29f7c58a 597 'pc',
2d997542 598 'mobile-tablet-main',
d12a1a47 599 ]
10273d6e 600
601 _TESTS = [{
6a747190 602 # article with multiple videos embedded with data-playable containing vpids
10273d6e 603 'url': 'http://www.bbc.com/news/world-europe-32668511',
604 'info_dict': {
605 'id': 'world-europe-32668511',
acc86c9a 606 'title': 'Russia stages massive WW2 parade',
9afa1770 607 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 608 },
609 'playlist_count': 2,
a3bfddfa 610 }, {
6a747190 611 # article with multiple videos embedded with data-playable (more videos)
10273d6e 612 'url': 'http://www.bbc.com/news/business-28299555',
613 'info_dict': {
614 'id': 'business-28299555',
615 'title': 'Farnborough Airshow: Video highlights',
9afa1770 616 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 617 },
618 'playlist_count': 9,
9afa1770 619 'skip': 'Save time',
88ed52ae
S
620 }, {
621 # article with multiple videos embedded with `new SMP()`
6a747190 622 # broken
88ed52ae
S
623 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
624 'info_dict': {
625 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 626 'title': 'BUGGER',
88ed52ae
S
627 },
628 'playlist_count': 18,
a3bfddfa 629 }, {
6a747190 630 # single video embedded with data-playable containing vpid
10273d6e 631 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 632 'info_dict': {
633 'id': 'p02mprgb',
55ebae26 634 'ext': 'mp4',
10273d6e 635 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 636 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 637 'duration': 47,
9afa1770 638 'timestamp': 1427219242,
da92eeae 639 'upload_date': '20150324',
10273d6e 640 },
641 'params': {
9afa1770 642 # rtmp download
10273d6e 643 'skip_download': True,
644 }
a3bfddfa 645 }, {
6a747190
S
646 # article with single video embedded with data-playable containing XML playlist
647 # with direct video links as progressiveDownloadUrl (for now these are extracted)
648 # and playlist with f4m and m3u8 as streamingUrl
de939d89 649 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 650 'info_dict': {
9afa1770 651 'id': '150615_telabyad_kentin_cogu',
de939d89 652 'ext': 'mp4',
ad152e2d 653 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 654 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 655 'timestamp': 1434397334,
da92eeae 656 'upload_date': '20150615',
de939d89 657 },
658 'params': {
659 'skip_download': True,
660 }
c936d8cc 661 }, {
6a747190 662 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 663 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 664 'info_dict': {
9afa1770 665 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 666 'ext': 'mp4',
9afa1770 667 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 668 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 669 'timestamp': 1434713142,
da92eeae 670 'upload_date': '20150619',
de939d89 671 },
672 'params': {
673 'skip_download': True,
674 }
a346b1ff
S
675 }, {
676 # single video from video playlist embedded with vxp-playlist-data JSON
677 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
678 'info_dict': {
679 'id': 'p02w6qjc',
55ebae26 680 'ext': 'mp4',
a346b1ff
S
681 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
682 'duration': 56,
0bc4ee60 683 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
684 },
685 'params': {
686 'skip_download': True,
687 }
9afa1770
S
688 }, {
689 # single video story with digitalData
690 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
691 'info_dict': {
692 'id': 'p02q6gc4',
693 'ext': 'flv',
694 'title': 'Sri Lanka’s spicy secret',
695 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
696 'timestamp': 1437674293,
697 'upload_date': '20150723',
698 },
699 'params': {
700 # rtmp download
701 'skip_download': True,
702 }
703 }, {
704 # single video story without digitalData
705 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
706 'info_dict': {
707 'id': 'p018zqqg',
55ebae26 708 'ext': 'mp4',
9afa1770
S
709 'title': 'Hyundai Santa Fe Sport: Rock star',
710 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
711 'timestamp': 1415867444,
712 'upload_date': '20141113',
9afa1770
S
713 },
714 'params': {
715 # rtmp download
716 'skip_download': True,
717 }
9fb64c04
S
718 }, {
719 # single video embedded with Morph
720 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
721 'info_dict': {
722 'id': 'p041vhd0',
723 'ext': 'mp4',
724 'title': "Nigeria v Japan - Men's First Round",
725 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
726 'duration': 7980,
727 'uploader': 'BBC Sport',
728 'uploader_id': 'bbc_sport',
729 },
730 'params': {
731 # m3u8 download
732 'skip_download': True,
9fb64c04
S
733 },
734 'skip': 'Georestricted to UK',
9afa1770 735 }, {
6a747190 736 # single video with playlist.sxml URL in playlist param
9afa1770
S
737 'url': 'http://www.bbc.com/sport/0/football/33653409',
738 'info_dict': {
739 'id': 'p02xycnp',
55ebae26 740 'ext': 'mp4',
9afa1770 741 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 742 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
743 'duration': 140,
744 },
745 'params': {
746 # rtmp download
747 'skip_download': True,
748 }
b5d48cb1 749 }, {
6a747190 750 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
751 'url': 'http://www.bbc.com/sport/0/football/34475836',
752 'info_dict': {
753 'id': '34475836',
450b233c 754 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 755 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
756 },
757 'playlist_count': 3,
450b233c
S
758 }, {
759 # school report article with single video
760 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
761 'info_dict': {
762 'id': '35744779',
763 'title': 'School which breaks down barriers in Jerusalem',
764 },
765 'playlist_count': 1,
9afa1770
S
766 }, {
767 # single video with playlist URL from weather section
768 'url': 'http://www.bbc.com/weather/features/33601775',
769 'only_matching': True,
770 }, {
771 # custom redirection to www.bbc.com
1bdae7d3 772 # also, video with window.__INITIAL_DATA__
9afa1770 773 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
1bdae7d3 774 'info_dict': {
775 'id': 'p02xzws1',
776 'ext': 'mp4',
777 'title': "Pluto may have 'nitrogen glaciers'",
778 'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
779 'thumbnail': r're:https?://.+/.+\.jpg',
780 'timestamp': 1437785037,
781 'upload_date': '20150725',
782 },
50e93e03 783 }, {
784 # video with window.__INITIAL_DATA__ and value as JSON string
785 'url': 'https://www.bbc.com/news/av/world-europe-59468682',
786 'info_dict': {
787 'id': 'p0b71qth',
788 'ext': 'mp4',
789 'title': 'Why France is making this woman a national hero',
790 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
791 'thumbnail': r're:https?://.+/.+\.jpg',
792 'timestamp': 1638230731,
793 'upload_date': '20211130',
794 },
a1cf3e38
S
795 }, {
796 # single video article embedded with data-media-vpid
797 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
798 'only_matching': True,
6d155707 799 }, {
50e93e03 800 # bbcthreeConfig
6d155707
S
801 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
802 'info_dict': {
803 'id': 'p06556y7',
804 'ext': 'mp4',
50e93e03 805 'title': 'Things Not To Say to people that live on council estates',
806 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
807 'duration': 360,
808 'thumbnail': r're:https?://.+/.+\.jpg',
6d155707 809 },
b96b4be4
RA
810 }, {
811 # window.__PRELOADED_STATE__
812 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
813 'info_dict': {
814 'id': 'b0b9z4vz',
815 'ext': 'mp4',
816 'title': 'Prom 6: An American in Paris and Turangalila',
817 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
818 'uploader': 'Radio 3',
819 'uploader_id': 'bbc_radio_three',
820 },
373941c5
S
821 }, {
822 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
823 'info_dict': {
824 'id': 'p06w9tws',
825 'ext': 'mp4',
826 'title': 'md5:2fabf12a726603193a2879a055f72514',
827 'description': 'Learn English words and phrases from this story',
828 },
829 'add_ie': [BBCCoUkIE.ie_key()],
3721515b 830 }, {
831 # BBC Reel
832 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
833 'info_dict': {
834 'id': 'p07c6sb9',
835 'ext': 'mp4',
836 'title': 'How positive thinking is harming your happiness',
837 'alt_title': 'The downsides of positive thinking',
838 'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
839 'duration': 235,
840 'thumbnail': r're:https?://.+/p07c9dsr.jpg',
841 'upload_date': '20190604',
842 'categories': ['Psychology'],
843 },
10273d6e 844 }]
845
9afa1770
S
846 @classmethod
847 def suitable(cls, url):
1418a043 848 EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
ded7511a
S
849 return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
850 else super(BBCIE, cls).suitable(url))
9afa1770
S
851
852 def _extract_from_media_meta(self, media_meta, video_id):
853 # Direct links to media in media metadata (e.g.
854 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
855 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
856 source_files = media_meta.get('sourceFiles')
857 if source_files:
858 return [{
859 'url': f['url'],
860 'format_id': format_id,
861 'ext': f.get('encoding'),
862 'tbr': float_or_none(f.get('bitrate'), 1000),
863 'filesize': int_or_none(f.get('filesize')),
864 } for format_id, f in source_files.items() if f.get('url')], []
865
866 programme_id = media_meta.get('externalId')
867 if programme_id:
868 return self._download_media_selector(programme_id)
869
870 # Process playlist.sxml as legacy playlist
871 href = media_meta.get('href')
872 if href:
873 playlist = self._download_legacy_playlist_url(href)
874 _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
875 return formats, subtitles
876
877 return [], []
878
baf39a1a
S
879 def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
880 programme_id, title, description, duration, formats, subtitles = \
881 self._process_legacy_playlist_url(url, playlist_id)
882 self._sort_formats(formats)
883 return {
884 'id': programme_id,
885 'title': title,
886 'description': description,
887 'duration': duration,
888 'timestamp': timestamp,
889 'formats': formats,
890 'subtitles': subtitles,
891 }
892
10273d6e 893 def _real_extract(self, url):
9afa1770
S
894 playlist_id = self._match_id(url)
895
896 webpage = self._download_webpage(url, playlist_id)
897
522f6c06 898 json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
350e02d4 899 timestamp = json_ld_info.get('timestamp')
0e832c2c 900
350e02d4 901 playlist_title = json_ld_info.get('title')
0e832c2c 902 if not playlist_title:
04f3fd2c 903 playlist_title = (self._og_search_title(webpage, default=None)
904 or self._html_extract_title(webpage, 'playlist title', default=None))
0e832c2c
S
905 if playlist_title:
906 playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
907
908 playlist_description = json_ld_info.get(
909 'description') or self._og_search_description(webpage, default=None)
ae8bdfd1
S
910
911 if not timestamp:
912 timestamp = parse_iso8601(self._search_regex(
913 [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
914 r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
6f789365 915 r'"datePublished":\s*"([^"]+)'],
ae8bdfd1 916 webpage, 'date', default=None))
9afa1770 917
78f9d843
S
918 entries = []
919
de665713
S
920 # article with multiple videos embedded with playlist.sxml (e.g.
921 # http://www.bbc.com/sport/0/football/34475836)
922 playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
222e11d4 923 playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
de665713 924 if playlists:
baf39a1a
S
925 entries = [
926 self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
927 for playlist_url in playlists]
de939d89 928
78f9d843
S
929 # news article with multiple videos embedded with data-playable
930 data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
931 if data_playables:
932 for _, data_playable_json in data_playables:
933 data_playable = self._parse_json(
934 unescapeHTML(data_playable_json), playlist_id, fatal=False)
935 if not data_playable:
936 continue
baf39a1a
S
937 settings = data_playable.get('settings', {})
938 if settings:
78f9d843
S
939 # data-playable with video vpid in settings.playlistObject.items (e.g.
940 # http://www.bbc.com/news/world-us-canada-34473351)
baf39a1a
S
941 playlist_object = settings.get('playlistObject', {})
942 if playlist_object:
943 items = playlist_object.get('items')
944 if items and isinstance(items, list):
78f9d843
S
945 title = playlist_object['title']
946 description = playlist_object.get('summary')
baf39a1a
S
947 duration = int_or_none(items[0].get('duration'))
948 programme_id = items[0].get('vpid')
78f9d843
S
949 formats, subtitles = self._download_media_selector(programme_id)
950 self._sort_formats(formats)
951 entries.append({
952 'id': programme_id,
953 'title': title,
954 'description': description,
955 'timestamp': timestamp,
956 'duration': duration,
957 'formats': formats,
958 'subtitles': subtitles,
959 })
960 else:
961 # data-playable without vpid but with a playlist.sxml URLs
962 # in otherSettings.playlist (e.g.
963 # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
964 playlist = data_playable.get('otherSettings', {}).get('playlist', {})
965 if playlist:
a7e5f274
RA
966 entry = None
967 for key in ('streaming', 'progressiveDownload'):
05087d1b
S
968 playlist_url = playlist.get('%sUrl' % key)
969 if not playlist_url:
970 continue
971 try:
a7e5f274
RA
972 info = self._extract_from_playlist_sxml(
973 playlist_url, playlist_id, timestamp)
974 if not entry:
975 entry = info
976 else:
977 entry['title'] = info['title']
978 entry['formats'].extend(info['formats'])
3721515b 979 except ExtractorError as e:
05087d1b
S
980 # Some playlist URL may fail with 500, at the same time
981 # the other one may work fine (e.g.
982 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
983 if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
984 continue
985 raise
a7e5f274
RA
986 if entry:
987 self._sort_formats(entry['formats'])
988 entries.append(entry)
78f9d843
S
989
990 if entries:
78f9d843
S
991 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
992
373941c5
S
993 # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
994 group_id = self._search_regex(
995 r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
996 webpage, 'group id', default=None)
38d70284 997 if group_id:
373941c5
S
998 return self.url_result(
999 'https://www.bbc.co.uk/programmes/%s' % group_id,
1000 ie=BBCCoUkIE.ie_key())
1001
78f9d843
S
1002 # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
1003 programme_id = self._search_regex(
a1cf3e38 1004 [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
22d7368d
S
1005 r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
1006 r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
78f9d843 1007 webpage, 'vpid', default=None)
dab062fb 1008
9afa1770
S
1009 if programme_id:
1010 formats, subtitles = self._download_media_selector(programme_id)
1011 self._sort_formats(formats)
1012 # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
1013 digital_data = self._parse_json(
1014 self._search_regex(
1015 r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
1016 programme_id, fatal=False)
1017 page_info = digital_data.get('page', {}).get('pageInfo', {})
1018 title = page_info.get('pageName') or self._og_search_title(webpage)
1019 description = page_info.get('description') or self._og_search_description(webpage)
1020 timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
1021 return {
1022 'id': programme_id,
1023 'title': title,
1024 'description': description,
1025 'timestamp': timestamp,
1026 'formats': formats,
1027 'subtitles': subtitles,
1028 }
a3bfddfa 1029
3721515b 1030 # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
1031 initial_data = self._parse_json(self._html_search_regex(
1032 r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
1033 webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
1034 if initial_data:
1035 init_data = try_get(
1036 initial_data, lambda x: x['initData']['items'][0], dict) or {}
1037 smp_data = init_data.get('smpData') or {}
1038 clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
1039 version_id = clip_data.get('versionID')
1040 if version_id:
1041 title = smp_data['title']
1042 formats, subtitles = self._download_media_selector(version_id)
1043 self._sort_formats(formats)
1044 image_url = smp_data.get('holdingImageURL')
1045 display_date = init_data.get('displayDate')
1046 topic_title = init_data.get('topicTitle')
1047
1048 return {
1049 'id': version_id,
1050 'title': title,
1051 'formats': formats,
1052 'alt_title': init_data.get('shortTitle'),
1053 'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
1054 'description': smp_data.get('summary') or init_data.get('shortSummary'),
1055 'upload_date': display_date.replace('-', '') if display_date else None,
1056 'subtitles': subtitles,
1057 'duration': int_or_none(clip_data.get('duration')),
1058 'categories': [topic_title] if topic_title else None,
1059 }
1060
9fb64c04
S
1061 # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
1062 # There are several setPayload calls may be present but the video
1063 # seems to be always related to the first one
1064 morph_payload = self._parse_json(
1065 self._search_regex(
1066 r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
1067 webpage, 'morph payload', default='{}'),
1068 playlist_id, fatal=False)
1069 if morph_payload:
1070 components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
1071 for component in components:
1072 if not isinstance(component, dict):
1073 continue
1074 lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
1075 if not lead_media:
1076 continue
1077 identifiers = lead_media.get('identifiers')
1078 if not identifiers or not isinstance(identifiers, dict):
1079 continue
1080 programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
1081 if not programme_id:
1082 continue
1083 title = lead_media.get('title') or self._og_search_title(webpage)
1084 formats, subtitles = self._download_media_selector(programme_id)
1085 self._sort_formats(formats)
1086 description = lead_media.get('summary')
1087 uploader = lead_media.get('masterBrand')
1088 uploader_id = lead_media.get('mid')
1089 duration = None
1090 duration_d = lead_media.get('duration')
1091 if isinstance(duration_d, dict):
1092 duration = parse_duration(dict_get(
1093 duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
1094 return {
1095 'id': programme_id,
1096 'title': title,
1097 'description': description,
1098 'duration': duration,
1099 'uploader': uploader,
1100 'uploader_id': uploader_id,
1101 'formats': formats,
1102 'subtitles': subtitles,
1103 }
1104
b96b4be4
RA
1105 preload_state = self._parse_json(self._search_regex(
1106 r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
1107 'preload state', default='{}'), playlist_id, fatal=False)
1108 if preload_state:
1109 current_programme = preload_state.get('programmes', {}).get('current') or {}
1110 programme_id = current_programme.get('id')
1111 if current_programme and programme_id and current_programme.get('type') == 'playable_item':
1112 title = current_programme.get('titles', {}).get('tertiary') or playlist_title
1113 formats, subtitles = self._download_media_selector(programme_id)
1114 self._sort_formats(formats)
1115 synopses = current_programme.get('synopses') or {}
1116 network = current_programme.get('network') or {}
1117 duration = int_or_none(
1118 current_programme.get('duration', {}).get('value'))
1119 thumbnail = None
1120 image_url = current_programme.get('image_url')
1121 if image_url:
3721515b 1122 thumbnail = image_url.replace('{recipe}', 'raw')
b96b4be4
RA
1123 return {
1124 'id': programme_id,
1125 'title': title,
1126 'description': dict_get(synopses, ('long', 'medium', 'short')),
1127 'thumbnail': thumbnail,
1128 'duration': duration,
1129 'uploader': network.get('short_title'),
1130 'uploader_id': network.get('id'),
1131 'formats': formats,
1132 'subtitles': subtitles,
1133 }
1134
6d155707
S
1135 bbc3_config = self._parse_json(
1136 self._search_regex(
1137 r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
1138 'bbcthree config', default='{}'),
38d70284 1139 playlist_id, transform_source=js_to_json, fatal=False) or {}
1140 payload = bbc3_config.get('payload') or {}
1141 if payload:
1142 clip = payload.get('currentClip') or {}
1143 clip_vpid = clip.get('vpid')
1144 clip_title = clip.get('title')
1145 if clip_vpid and clip_title:
1146 formats, subtitles = self._download_media_selector(clip_vpid)
1147 self._sort_formats(formats)
1148 return {
1149 'id': clip_vpid,
1150 'title': clip_title,
1151 'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
1152 'description': clip.get('description'),
1153 'duration': parse_duration(clip.get('duration')),
1154 'formats': formats,
1155 'subtitles': subtitles,
1156 }
6d155707 1157 bbc3_playlist = try_get(
38d70284 1158 payload, lambda x: x['content']['bbcMedia']['playlist'],
6d155707
S
1159 dict)
1160 if bbc3_playlist:
1161 playlist_title = bbc3_playlist.get('title') or playlist_title
1162 thumbnail = bbc3_playlist.get('holdingImageURL')
1163 entries = []
1164 for bbc3_item in bbc3_playlist['items']:
1165 programme_id = bbc3_item.get('versionID')
1166 if not programme_id:
1167 continue
1168 formats, subtitles = self._download_media_selector(programme_id)
1169 self._sort_formats(formats)
1170 entries.append({
1171 'id': programme_id,
1172 'title': playlist_title,
1173 'thumbnail': thumbnail,
1174 'timestamp': timestamp,
1175 'formats': formats,
1176 'subtitles': subtitles,
1177 })
1178 return self.playlist_result(
1179 entries, playlist_id, playlist_title, playlist_description)
1180
50e93e03 1181 initial_data = self._search_regex(
1182 r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
1183 'quoted preload state', default=None)
1184 if initial_data is None:
1185 initial_data = self._search_regex(
1186 r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
1187 'preload state', default={})
1188 else:
1189 initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
1190 initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
38d70284 1191 if initial_data:
1192 def parse_media(media):
1193 if not media:
1194 return
1195 for item in (try_get(media, lambda x: x['media']['items'], list) or []):
1196 item_id = item.get('id')
1197 item_title = item.get('title')
1198 if not (item_id and item_title):
1199 continue
1200 formats, subtitles = self._download_media_selector(item_id)
1201 self._sort_formats(formats)
1bdae7d3 1202 item_desc = None
1203 blocks = try_get(media, lambda x: x['summary']['blocks'], list)
1204 if blocks:
1205 summary = []
1206 for block in blocks:
1207 text = try_get(block, lambda x: x['model']['text'], compat_str)
1208 if text:
1209 summary.append(text)
1210 if summary:
1211 item_desc = '\n\n'.join(summary)
1212 item_time = None
1213 for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
1214 if try_get(meta, lambda x: x['label']) == 'Published':
1215 item_time = unified_timestamp(meta.get('timestamp'))
1216 break
38d70284 1217 entries.append({
1218 'id': item_id,
1219 'title': item_title,
1220 'thumbnail': item.get('holdingImageUrl'),
1221 'formats': formats,
1222 'subtitles': subtitles,
1bdae7d3 1223 'timestamp': item_time,
1224 'description': strip_or_none(item_desc),
38d70284 1225 })
1226 for resp in (initial_data.get('data') or {}).values():
1227 name = resp.get('name')
1228 if name == 'media-experience':
1229 parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
1230 elif name == 'article':
50e93e03 1231 for block in (try_get(resp,
1232 (lambda x: x['data']['blocks'],
1233 lambda x: x['data']['content']['model']['blocks'],),
1234 list) or []):
edebb651 1235 if block.get('type') not in ['media', 'video']:
38d70284 1236 continue
1237 parse_media(block.get('model'))
1238 return self.playlist_result(
1239 entries, playlist_id, playlist_title, playlist_description)
1240
88ed52ae
S
1241 def extract_all(pattern):
1242 return list(filter(None, map(
1243 lambda s: self._parse_json(s, playlist_id, fatal=False),
1244 re.findall(pattern, webpage))))
1245
1246 # Multiple video article (e.g.
1247 # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
22d7368d 1248 EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
88ed52ae
S
1249 entries = []
1250 for match in extract_all(r'new\s+SMP\(({.+?})\)'):
1251 embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
1252 if embed_url and re.match(EMBED_URL, embed_url):
1253 entries.append(embed_url)
1254 entries.extend(re.findall(
1255 r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
1256 if entries:
1257 return self.playlist_result(
aaa42cf0 1258 [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
88ed52ae 1259 playlist_id, playlist_title, playlist_description)
9afa1770
S
1260
1261 # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
88ed52ae 1262 medias = extract_all(r"data-media-meta='({[^']+})'")
9afa1770
S
1263
1264 if not medias:
1265 # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
a346b1ff
S
1266 media_asset = self._search_regex(
1267 r'mediaAssetPage\.init\(\s*({.+?}), "/',
1268 webpage, 'media asset', default=None)
1269 if media_asset:
1270 media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
1271 medias = []
1272 for video in media_asset_page.get('videos', {}).values():
1273 medias.extend(video.values())
1274
1275 if not medias:
1276 # Multiple video playlist with single `now playing` entry (e.g.
1277 # http://www.bbc.com/news/video_and_audio/must_see/33767813)
1278 vxp_playlist = self._parse_json(
9afa1770 1279 self._search_regex(
a346b1ff
S
1280 r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
1281 webpage, 'playlist data'),
9afa1770 1282 playlist_id)
a346b1ff
S
1283 playlist_medias = []
1284 for item in vxp_playlist:
1285 media = item.get('media')
1286 if not media:
1287 continue
1288 playlist_medias.append(media)
1289 # Download single video if found media with asset id matching the video id from URL
1290 if item.get('advert', {}).get('assetId') == playlist_id:
1291 medias = [media]
1292 break
1293 # Fallback to the whole playlist
1294 if not medias:
1295 medias = playlist_medias
9afa1770
S
1296
1297 entries = []
1298 for num, media_meta in enumerate(medias, start=1):
1299 formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
a06916d9 1300 if not formats and not self.get_param('ignore_no_formats'):
9afa1770 1301 continue
10273d6e 1302 self._sort_formats(formats)
1303
9afa1770
S
1304 video_id = media_meta.get('externalId')
1305 if not video_id:
1306 video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
1307
1308 title = media_meta.get('caption')
1309 if not title:
1310 title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
1311
1312 duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
da92eeae 1313
9afa1770
S
1314 images = []
1315 for image in media_meta.get('images', {}).values():
1316 images.extend(image.values())
1317 if 'image' in media_meta:
1318 images.append(media_meta['image'])
1319
1320 thumbnails = [{
1321 'url': image.get('href'),
1322 'width': int_or_none(image.get('width')),
1323 'height': int_or_none(image.get('height')),
1324 } for image in images]
1325
1326 entries.append({
1327 'id': video_id,
10273d6e 1328 'title': title,
9afa1770 1329 'thumbnails': thumbnails,
10273d6e 1330 'duration': duration,
9afa1770 1331 'timestamp': timestamp,
10273d6e 1332 'formats': formats,
1333 'subtitles': subtitles,
a3bfddfa 1334 })
10273d6e 1335
9afa1770 1336 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1337
1338
class BBCCoUkArticleIE(InfoExtractor):
    """Extractor for bbc.co.uk programme article pages.

    The article is returned as a playlist whose entries are the resources
    of the ``typeof="Clip"`` elements found on the page.
    """
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Download the article page and return its clips as a playlist."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # The description search may come back empty; strip_or_none keeps a
        # missing description from raising AttributeError on .strip()
        description = strip_or_none(self._og_search_description(webpage))

        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1367
1368
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared logic for paginated bbc.co.uk playlist pages.

    This class reads ``_VIDEO_ID_TEMPLATE`` and ``_URL_TEMPLATE`` and calls
    ``_extract_title_and_description``; none of them is defined here, so
    subclasses are expected to supply them.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a url_result for every video id found on the playlist pages.

        When ``url`` carries an explicit ``page`` query parameter only the
        already downloaded ``webpage`` is scanned; otherwise the "next"
        pagination link is followed until there is none left.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # ``webpage`` is page 1, so the first page fetched here is page 2
        for page_num in itertools.count(2):
            for video_id in re.findall(
                    self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # Pass note/errnote by keyword: the bare page number used to be
            # passed positionally and landed in the errnote slot
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                note='Downloading page %d' % page_num,
                errnote='Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Return the playlist described by the page at ``url``."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
ded7511a
S
1399
1400
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
    """Common paging logic for the iPlayer playlist extractors.

    This class reads ``_PAGE_SIZE`` and ``_DESCRIPTION_KEY`` and calls
    ``_call_api``, ``_get_elements``, ``_get_episode``, ``_get_episode_image``,
    ``_get_episode_field``, ``_get_playlist_data`` and ``_get_playlist_title``;
    none of them is defined here, so subclasses are expected to supply them.
    """
    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    @staticmethod
    def _get_default(episode, key, default_key='default'):
        """Return ``episode[key][default_key]`` via ``try_get``."""
        def lookup(data):
            return data[key][default_key]
        return try_get(episode, lookup)

    def _get_description(self, data):
        """Pick the first available of the large/medium/small synopsis texts."""
        synopses = data.get(self._DESCRIPTION_KEY)
        if not synopses:
            synopses = {}
        return dict_get(synopses, ('large', 'medium', 'small'))

    def _fetch_page(self, programme_id, per_page, series_id, page):
        """Yield url entries for the zero-based ``page`` of a programme."""
        # The API is called with ``page + 1``, i.e. it counts pages from one
        api_response = self._call_api(programme_id, per_page, page + 1, series_id)
        for element in self._get_elements(api_response):
            episode = self._get_episode(element)
            episode_id = episode.get('id')
            if not episode_id:
                continue
            image = self._get_episode_image(episode)
            thumbnail = image.replace('{recipe}', 'raw') if image else None
            category = self._get_default(episode, 'labels', 'category')
            yield {
                '_type': 'url',
                'id': episode_id,
                'title': self._get_episode_field(episode, 'subtitle'),
                'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                'thumbnail': thumbnail,
                'description': self._get_description(episode),
                'categories': [category] if category else None,
                'series': self._get_episode_field(episode, 'title'),
                'ie_key': BBCCoUkIE.ie_key(),
            }

    def _real_extract(self, url):
        """Return the playlist for ``url``, honouring ``seriesId`` and ``page``."""
        pid = self._match_id(url)
        query = parse_qs(url)
        series_id = query.get('seriesId', [None])[0]
        page = query.get('page', [None])[0]
        if page:
            # An explicit page is fetched on its own, 36 items per page
            entries = self._fetch_page(pid, 36, series_id, int(page) - 1)
        else:
            entries = OnDemandPagedList(
                functools.partial(self._fetch_page, pid, self._PAGE_SIZE, series_id),
                self._PAGE_SIZE)
        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
        playlist_title = self._get_playlist_title(playlist_data)
        playlist_description = self._get_description(playlist_data)
        return self.playlist_result(entries, pid, playlist_title, playlist_description)
1449
1450
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/episodes/`` listings, read from the graph.ibl.api endpoint."""
    IE_NAME = 'bbc.co.uk:iplayer:episodes'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
        },
        'playlist_mincount': 8,
    }, {
        # all seasons
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 10,
    }, {
        # explicit season
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 5,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 37,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 1,
    }]
    _PAGE_SIZE = 100
    _DESCRIPTION_KEY = 'synopsis'

    def _get_episode_image(self, episode):
        """Return the episode's ``image`` default value."""
        return self._get_default(episode, 'image')

    def _get_episode_field(self, episode, field):
        """Return the default value stored under ``field``."""
        return self._get_default(episode, field)

    @staticmethod
    def _get_elements(data):
        """Return the result list nested under ``entities``."""
        entities = data['entities']
        return entities['results']

    @staticmethod
    def _get_episode(element):
        """Return the element's ``episode`` dict, or an empty dict."""
        episode = element.get('episode')
        return episode if episode else {}

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """POST the query for ``pid`` and return the ``programme`` payload."""
        variables = {'id': pid, 'page': page, 'perPage': per_page}
        if series_id:
            # Restrict the listing to one series
            variables['sliceId'] = series_id
        request_body = json.dumps({
            'id': '5692d93d5aac8d796a0305e895e61551',
            'variables': variables,
        }).encode('utf-8')
        response = self._download_json(
            'https://graph.ibl.api.bbc.co.uk/', pid,
            headers={'Content-Type': 'application/json'},
            data=request_body)
        return response['data']['programme']

    @staticmethod
    def _get_playlist_data(data):
        """The API payload is used as the playlist data as-is."""
        return data

    def _get_playlist_title(self, data):
        """Return the playlist's default title."""
        return self._get_default(data, 'title')
1539
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/group/`` listings, read from the ibl v1 groups endpoint."""
    IE_NAME = 'bbc.co.uk:iplayer:group'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
    _TESTS = [{
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 47,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 11,
    }]
    _PAGE_SIZE = 200
    _DESCRIPTION_KEY = 'synopses'

    def _get_episode_image(self, episode):
        """Return the ``standard`` entry of the episode's ``images``."""
        return self._get_default(episode, 'images', 'standard')

    def _get_episode_field(self, episode, field):
        """Group episodes store their fields directly on the episode dict."""
        return episode.get(field)

    @staticmethod
    def _get_elements(data):
        """Return the ``elements`` list of the API payload."""
        return data['elements']

    @staticmethod
    def _get_episode(element):
        """Each element already is the episode."""
        return element

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Fetch one page of group episodes; ``series_id`` is not used here."""
        api_url = 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid
        paging = {
            'page': page,
            'per_page': per_page,
        }
        response = self._download_json(api_url, pid, query=paging)
        return response['group_episodes']

    @staticmethod
    def _get_playlist_data(data):
        """Return the ``group`` part of the API payload."""
        return data['group']

    def _get_playlist_title(self, data):
        """Return the group's ``title``."""
        return data.get('title')
ded7511a
S
1602
1603
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Episode, broadcast and clip listings under bbc.co.uk/programmes."""
    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return the page's OpenGraph ``(title, description)`` pair."""
        og_title = self._og_search_title(webpage, fatal=False)
        og_description = self._og_search_description(webpage)
        return og_title, og_description