]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/bbc.py
[extractor] Deprecate `_sort_formats`
[yt-dlp.git] / yt_dlp / extractor / bbc.py
CommitLineData
1418a043 1import functools
254e64a2 2import itertools
1418a043 3import json
f0228f56 4import re
ac668111 5import urllib.error
6import xml.etree.ElementTree
082c6c86 7
f13b1e7d 8from .common import InfoExtractor
ac668111 9from ..compat import compat_HTTPError, compat_str, compat_urlparse
8683b4d8 10from ..utils import (
3721515b 11 ExtractorError,
1418a043 12 OnDemandPagedList,
97067db2 13 clean_html,
9fb64c04 14 dict_get,
9afa1770 15 float_or_none,
97067db2 16 get_element_by_class,
8683b4d8 17 int_or_none,
6d155707 18 js_to_json,
9afa1770
S
19 parse_duration,
20 parse_iso8601,
4dfbf869 21 parse_qs,
1bdae7d3 22 strip_or_none,
9fb64c04 23 try_get,
dab062fb 24 unescapeHTML,
1bdae7d3 25 unified_timestamp,
f0228f56 26 url_or_none,
97067db2
S
27 urlencode_postdata,
28 urljoin,
8683b4d8 29)
082c6c86 30
d12a1a47 31
f13b1e7d 32class BBCCoUkIE(InfoExtractor):
082c6c86 33 IE_NAME = 'bbc.co.uk'
2e3fd9ec 34 IE_DESC = 'BBC iPlayer'
50e93e03 35 _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
f20a11ed
S
36 _VALID_URL = r'''(?x)
37 https?://
38 (?:www\.)?bbc\.co\.uk/
39 (?:
40 programmes/(?!articles/)|
41 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
72d256c4 42 music/(?:clips|audiovideo/popular)[/#]|
d3d45e0a 43 radio/player/|
b72305f0 44 sounds/play/|
d3d45e0a 45 events/[^/]+/play/[^/]+/
f20a11ed 46 )
ded7511a 47 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
f20a11ed 48 ''' % _ID_REGEX
bfd973ec 49 _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']
082c6c86 50
97067db2
S
51 _LOGIN_URL = 'https://account.bbc.com/signin'
52 _NETRC_MACHINE = 'bbc'
53
29f7c58a 54 _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
55 _MEDIA_SETS = [
26ccc68b
S
56 # Provides HQ HLS streams with even better quality than pc mediaset but fails
57 # with geolocation in some cases even when it's not geo restricted at all (e.g.
d781e293 58 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
29f7c58a 59 'iptv-all',
60 'pc',
d12a1a47 61 ]
a8b081a0 62
e6174ee9
S
63 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
64
2e3fd9ec
S
65 _TESTS = [
66 {
f2d0fc68 67 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 68 'info_dict': {
f2d0fc68 69 'id': 'b039d07m',
b1ea6802 70 'ext': 'flv',
acc86c9a 71 'title': 'Kaleidoscope, Leonard Cohen',
c4914185 72 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
73 },
74 'params': {
b1ea6802 75 # rtmp download
2e3fd9ec
S
76 'skip_download': True,
77 }
082c6c86 78 },
2e3fd9ec
S
79 {
80 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
81 'info_dict': {
82 'id': 'b00yng1d',
83 'ext': 'flv',
84 'title': 'The Man in Black: Series 3: The Printed Name',
85 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
86 'duration': 1800,
87 },
88 'params': {
89 # rtmp download
90 'skip_download': True,
c7f0177f
S
91 },
92 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
93 },
94 {
95 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
96 'info_dict': {
97 'id': 'b00yng1d',
98 'ext': 'flv',
17968e44 99 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 100 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 101 'duration': 5100,
2e3fd9ec
S
102 },
103 'params': {
104 # rtmp download
105 'skip_download': True,
106 },
b1ea6802 107 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
108 },
109 {
110 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
111 'info_dict': {
112 'id': 'b03k3pb7',
113 'ext': 'flv',
114 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
115 'description': '2. Invasion',
116 'duration': 3600,
117 },
118 'params': {
119 # rtmp download
120 'skip_download': True,
121 },
b1ea6802 122 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
123 }, {
124 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
125 'info_dict': {
126 'id': 'b04v209v',
127 'ext': 'flv',
128 'title': 'Pete Tong, The Essential New Tune Special',
129 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
130 'duration': 10800,
131 },
132 'params': {
133 # rtmp download
134 'skip_download': True,
a3ef0e1c
YCH
135 },
136 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 137 }, {
5aa535c3 138 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
139 'note': 'Audio',
140 'info_dict': {
5aa535c3 141 'id': 'p022h44j',
b1ea6802 142 'ext': 'flv',
5aa535c3
S
143 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
144 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
145 'duration': 227,
c7e67594
S
146 },
147 'params': {
b1ea6802 148 # rtmp download
c7e67594
S
149 'skip_download': True,
150 }
151 }, {
152 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
153 'note': 'Video',
154 'info_dict': {
155 'id': 'p025c103',
b1ea6802 156 'ext': 'flv',
c7e67594
S
157 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
158 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
159 'duration': 226,
160 },
161 'params': {
b1ea6802 162 # rtmp download
c7e67594
S
163 'skip_download': True,
164 }
e68ae99a
S
165 }, {
166 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
167 'info_dict': {
168 'id': 'p02n76xf',
169 'ext': 'flv',
170 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
171 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
172 'duration': 3540,
173 },
174 'params': {
175 # rtmp download
176 'skip_download': True,
177 },
b1ea6802 178 'skip': 'geolocation',
25fa8d66
YCH
179 }, {
180 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
181 'info_dict': {
182 'id': 'b05zmgw1',
183 'ext': 'flv',
184 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
185 'title': 'Royal Academy Summer Exhibition',
186 'duration': 3540,
187 },
188 'params': {
189 # rtmp download
190 'skip_download': True,
191 },
b1ea6802 192 'skip': 'geolocation',
54914380
S
193 }, {
194 # iptv-all mediaset fails with geolocation however there is no geo restriction
195 # for this programme at all
5aa535c3 196 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 197 'info_dict': {
5aa535c3 198 'id': 'b06rkms3',
54914380 199 'ext': 'flv',
5aa535c3
S
200 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
201 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
202 },
203 'params': {
204 # rtmp download
205 'skip_download': True,
206 },
b1ea6802 207 'skip': 'Now it\'s really geo-restricted',
1ac6e794 208 }, {
067aa17e 209 # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
1ac6e794
S
210 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
211 'info_dict': {
212 'id': 'p028bfkj',
b1ea6802 213 'ext': 'flv',
1ac6e794
S
214 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
215 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
216 },
217 'params': {
b1ea6802 218 # rtmp download
1ac6e794
S
219 'skip_download': True,
220 },
b72305f0
J
221 }, {
222 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
223 'note': 'Audio',
224 'info_dict': {
225 'id': 'm0007jz9',
226 'ext': 'mp4',
227 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
228 'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
229 'duration': 9840,
230 },
231 'params': {
232 # rtmp download
233 'skip_download': True,
234 }
31763975
S
235 }, {
236 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
237 'only_matching': True,
c7e67594
S
238 }, {
239 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
240 'only_matching': True,
0692ef86
S
241 }, {
242 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
243 'only_matching': True,
f20a11ed
S
244 }, {
245 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
246 'only_matching': True,
72d256c4
S
247 }, {
248 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
249 'only_matching': True,
53647dfd
S
250 }, {
251 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
252 'only_matching': True,
6f356cbb
S
253 }, {
254 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
255 'only_matching': True,
256 }, {
257 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
258 'only_matching': True,
72d256c4 259 }]
2e3fd9ec 260
def _perform_login(self, username, password):
    """Sign in to the BBC account service with the given credentials.

    Downloads the sign-in page, posts the fields returned by
    ``_hidden_inputs`` together with the username and password, and raises
    ExtractorError when the response URL still contains the sign-in URL.
    """
    signin_page = self._download_webpage(
        self._LOGIN_URL, None, 'Downloading signin page')

    # Fields collected from the page are sent back alongside the credentials.
    form_data = self._hidden_inputs(signin_page)
    form_data['username'] = username
    form_data['password'] = password

    # The form action is resolved against the sign-in URL; the sign-in URL
    # itself is the fallback when no action attribute is found.
    form_action = self._search_regex(
        r'<form[^>]+action=(["\'])(?P<url>.+?)\1', signin_page,
        'post url', default=self._LOGIN_URL, group='url')
    submit_url = urljoin(self._LOGIN_URL, form_action)

    response, urlh = self._download_webpage_handle(
        submit_url, None, 'Logging in', data=urlencode_postdata(form_data),
        headers={'Referer': self._LOGIN_URL})

    if self._LOGIN_URL not in urlh.geturl():
        return

    # Still on the sign-in URL: report the page's form message if there is one.
    error = clean_html(get_element_by_class('form-message', response))
    if not error:
        raise ExtractorError('Unable to log in')
    raise ExtractorError(
        'Unable to login: %s' % error, expected=True)
286
d12a1a47
S
class MediaSelectionError(Exception):
    """Error carrying a media selection ``result`` code in its ``id`` attribute."""

    def __init__(self, id):
        # BaseException already records the constructor argument in ``args``;
        # passing it on explicitly keeps that visible without changing it.
        super().__init__(id)
        self.id = id
290
2e3fd9ec
S
def _extract_asx_playlist(self, connection, programme_id):
    """Download the ASX playlist at the connection's ``href`` and return the ``href`` of every ``Entry/ref``."""
    asx_doc = self._download_xml(
        connection.get('href'), programme_id, 'Downloading ASX playlist')
    hrefs = []
    for ref_el in asx_doc.findall('./Entry/ref'):
        hrefs.append(ref_el.get('href'))
    return hrefs
294
2e3fd9ec 295 def _extract_items(self, playlist):
e6174ee9
S
296 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
297
2e3fd9ec 298 def _extract_medias(self, media_selection):
29f7c58a 299 error = media_selection.get('result')
300 if error:
301 raise BBCCoUkIE.MediaSelectionError(error)
302 return media_selection.get('media') or []
2e3fd9ec
S
303
304 def _extract_connections(self, media):
29f7c58a 305 return media.get('connection') or []
2e3fd9ec 306
def _get_subtitles(self, media, programme_id):
    """Build the subtitles dict from the first connection whose captions download as XML.

    Returns ``{'en': [{'url': ..., 'ext': 'ttml'}]}`` for that connection,
    or an empty dict when no connection yields an XML element.
    """
    for connection in self._extract_connections(media):
        captions_url = url_or_none(connection.get('href'))
        if not captions_url:
            continue
        # The download is non-fatal; any result that is not a parsed
        # element means this connection is skipped.
        captions_doc = self._download_xml(
            captions_url, programme_id, 'Downloading captions', fatal=False)
        if isinstance(captions_doc, xml.etree.ElementTree.Element):
            return {
                'en': [
                    {
                        'url': connection.get('href'),
                        'ext': 'ttml',
                    },
                ],
            }
    return {}
082c6c86 325
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Raise an expected ExtractorError that reports the ``id`` of *media_selection_error*."""
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
330
def _download_media_selector(self, programme_id):
    """Query each media set in ``_MEDIA_SETS`` in order and return the first successful result.

    A media selection error whose id is ``notukerror``, ``geolocation`` or
    ``selectionunavailable`` moves on to the next media set; any other id is
    raised immediately as an ExtractorError.  When every media set fails
    with one of those ids, the last such error is raised.
    """
    retryable_ids = ('notukerror', 'geolocation', 'selectionunavailable')
    last_error = None
    for media_set in self._MEDIA_SETS:
        try:
            selector_url = self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id)
            return self._download_media_selector_url(selector_url, programme_id)
        except BBCCoUkIE.MediaSelectionError as err:
            if err.id in retryable_ids:
                # Remember it and try the next media set.
                last_error = err
            else:
                self._raise_extractor_error(err)
    self._raise_extractor_error(last_error)
9afa1770
S
343
def _download_media_selector_url(self, url, programme_id=None):
    """Download the media selection JSON from *url* and return it processed by ``_process_media_selector``.

    Statuses 403 and 404 are passed to the download as expected statuses.
    """
    return self._process_media_selector(
        self._download_json(
            url, programme_id, 'Downloading media selection JSON',
            expected_status=(403, 404)),
        programme_id)
082c6c86 349
def _process_media_selector(self, media_selection, programme_id):
    """Convert a media selection into ``(formats, subtitles)``.

    Every ``video``/``audio`` media entry contributes formats for each of
    its connections, chosen by the connection's supplier, transfer format
    and protocol.  A ``captions`` media entry provides the subtitles.

    Returns a tuple ``(formats, subtitles)``; ``subtitles`` is None when the
    selection contains no ``captions`` media.  Propagates the error raised
    by ``_extract_medias`` when the selection carries a ``result`` code.
    """
    formats = []
    subtitles = None
    # hrefs already handled, so the same URL is not processed twice
    urls = []

    for media in self._extract_medias(media_selection):
        kind = media.get('kind')
        if kind in ('video', 'audio'):
            bitrate = int_or_none(media.get('bitrate'))
            encoding = media.get('encoding')
            width = int_or_none(media.get('width'))
            height = int_or_none(media.get('height'))
            file_size = int_or_none(media.get('media_file_size'))
            for connection in self._extract_connections(media):
                href = connection.get('href')
                if href in urls:
                    continue
                if href:
                    urls.append(href)
                conn_kind = connection.get('kind')
                protocol = connection.get('protocol')
                supplier = connection.get('supplier')
                transfer_format = connection.get('transferFormat')
                format_id = supplier or conn_kind or protocol
                # ASX playlist
                if supplier == 'asx':
                    for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
                        formats.append({
                            'url': ref,
                            'format_id': 'ref%s_%s' % (i, format_id),
                        })
                elif transfer_format == 'dash':
                    formats.extend(self._extract_mpd_formats(
                        href, programme_id, mpd_id=format_id, fatal=False))
                elif transfer_format == 'hls':
                    # TODO: let expected_status be passed into _extract_xxx_formats() instead
                    try:
                        fmts = self._extract_m3u8_formats(
                            href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                            m3u8_id=format_id, fatal=False)
                    except ExtractorError as e:
                        if not (isinstance(e.exc_info[1], urllib.error.HTTPError)
                                and e.exc_info[1].code in (403, 404)):
                            raise
                        fmts = []
                    formats.extend(fmts)
                elif transfer_format == 'hds':
                    formats.extend(self._extract_f4m_formats(
                        href, programme_id, f4m_id=format_id, fatal=False))
                else:
                    # format_id is None when supplier, kind and protocol are
                    # all missing; such a connection is skipped below (its
                    # protocol is not recognised), so it must not fail here
                    # with a TypeError on ``None += str``.
                    if format_id and not supplier and bitrate:
                        format_id += '-%d' % bitrate
                    fmt = {
                        'format_id': format_id,
                        'filesize': file_size,
                    }
                    if kind == 'video':
                        fmt.update({
                            'width': width,
                            'height': height,
                            'tbr': bitrate,
                            'vcodec': encoding,
                        })
                    else:
                        fmt.update({
                            'abr': bitrate,
                            'acodec': encoding,
                            'vcodec': 'none',
                        })
                    if protocol in ('http', 'https'):
                        # Direct link
                        fmt.update({
                            'url': href,
                        })
                    elif protocol == 'rtmp':
                        application = connection.get('application', 'ondemand')
                        auth_string = connection.get('authString')
                        identifier = connection.get('identifier')
                        server = connection.get('server')
                        fmt.update({
                            'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                            'play_path': identifier,
                            'app': '%s?%s' % (application, auth_string),
                            'page_url': 'http://www.bbc.co.uk',
                            'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                            'rtmp_live': False,
                            'ext': 'flv',
                        })
                    else:
                        # Any other protocol is not turned into a format.
                        continue
                    formats.append(fmt)
        elif kind == 'captions':
            subtitles = self.extract_subtitles(media, programme_id)
    return formats, subtitles
2e3fd9ec 444
ae6986fb
S
def _download_playlist(self, playlist_id):
    """Resolve a playlist id to ``(programme_id, title, description, duration, formats, subtitles)``.

    Downloads the JSON playlist and merges the formats and subtitles of
    every ``programme``/``radioProgramme`` item across all available
    versions; the scalar fields are those of the last item processed.
    When the JSON request fails with HTTP 404, falls back to the legacy
    playlist.

    Raises ExtractorError when the JSON playlist has no programme item
    (this used to surface as an UnboundLocalError at the return statement).
    """
    try:
        playlist = self._download_json(
            'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
            playlist_id, 'Downloading playlist JSON')
        formats = []
        subtitles = {}
        found_programme = False

        for version in playlist.get('allAvailableVersions', []):
            smp_config = version['smpConfig']
            title = smp_config['title']
            description = smp_config['summary']
            for item in smp_config['items']:
                kind = item['kind']
                if kind not in ('programme', 'radioProgramme'):
                    continue
                programme_id = item.get('vpid')
                found_programme = True
                duration = int_or_none(item.get('duration'))
                version_formats, version_subtitles = self._download_media_selector(programme_id)
                version_types = version['types']
                for f in version_formats:
                    f['format_note'] = ', '.join(version_types)
                    if any('AudioDescribed' in x for x in version_types):
                        f['language_preference'] = -10
                formats += version_formats
                for tag, subformats in (version_subtitles or {}).items():
                    subtitles.setdefault(tag, []).extend(subformats)

        if not found_programme:
            # Raised inside the try on purpose: it has no HTTP cause, so the
            # handler below re-raises it unchanged.
            raise ExtractorError(
                'No playable programme found in playlist %s' % playlist_id)

        return programme_id, title, description, duration, formats, subtitles
    except ExtractorError as ee:
        if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
            raise

    # fallback to legacy playlist
    return self._process_legacy_playlist(playlist_id)
480
def _process_legacy_playlist_url(self, url, display_id):
    """Download the legacy playlist at *url* and return what ``_extract_from_legacy_playlist`` extracts from it."""
    return self._extract_from_legacy_playlist(
        self._download_legacy_playlist_url(url, display_id), display_id)
484
def _process_legacy_playlist(self, playlist_id):
    """Process the legacy iPlayer playlist for *playlist_id* via ``_process_legacy_playlist_url``."""
    legacy_url = 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id
    return self._process_legacy_playlist_url(legacy_url, playlist_id)
488
def _download_legacy_playlist_url(self, url, playlist_id=None):
    """Download *url* as XML and return the result of ``_download_xml``."""
    legacy_playlist = self._download_xml(
        url, playlist_id, 'Downloading legacy playlist XML')
    return legacy_playlist
ae6986fb 492
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract ``(programme_id, title, description, duration, formats, subtitles)`` from a legacy XML playlist.

    Raises an expected ExtractorError when the playlist has a ``noItems``
    element, with a message derived from its ``reason`` attribute.  Every
    ``programme``/``radioProgramme`` item is processed and the values of
    the last one are returned.  Raises ExtractorError when there is no
    such item (this used to surface as an UnboundLocalError).
    """
    no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First of the 'identifier'/'group' attributes that looks like a pid.
        for p in ('identifier', 'group'):
            value = element.get(p)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value
        return None

    def get_programme_id(item):
        # Prefer the item's own attributes.  Previously this lookup was
        # performed but its result was discarded, so only the mediator
        # child was ever consulted.
        own_id = get_from_attributes(item)
        if own_id:
            return own_id
        mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
        if mediator is not None:
            return get_from_attributes(mediator)
        return None

    found_programme = False
    for item in self._extract_items(playlist):
        kind = item.get('kind')
        if kind not in ('programme', 'radioProgramme'):
            continue
        found_programme = True
        title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
        description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No programme id: the item itself is passed on as the media selection.
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

    if not found_programme:
        raise ExtractorError(
            'No programme found in legacy playlist %s' % playlist_id)

    return programme_id, title, description, duration, formats, subtitles
536
c056efa2
S
def _real_extract(self, url):
    """Extract the info dict for a bbc.co.uk programme page.

    Raises an expected ExtractorError when the page shows a player message.
    The programme id is taken from the ``mediator.bind`` player JSON or from
    a ``"vpid"`` value in the page; when neither is found, everything comes
    from ``_download_playlist`` for the id matched from the URL.
    """
    group_id = self._match_id(url)
    webpage = self._download_webpage(url, group_id, 'Downloading video page')

    page_error = self._search_regex(
        r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
        webpage, 'error', default=None)
    if page_error:
        raise ExtractorError(page_error, expected=True)

    programme_id = duration = None

    player_json = self._search_regex(
        r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
        webpage, 'player', default=None)
    if player_json:
        player = self._parse_json(player_json, group_id).get('player', {})
        duration = int_or_none(player.get('duration'))
        programme_id = player.get('vpid')

    if not programme_id:
        programme_id = self._search_regex(
            r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)

    if not programme_id:
        # Nothing usable in the page itself: resolve through the playlist.
        programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
    else:
        formats, subtitles = self._download_media_selector(programme_id)
        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
            (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
             r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
        # Page markup first, then the description meta tag.
        description = self._search_regex(
            (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
             r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
            webpage, 'description', default=None) or self._html_search_meta('description', webpage)

    info = {
        'id': programme_id,
        'title': title,
        'description': description,
        'thumbnail': self._og_search_thumbnail(webpage, default=None),
        'duration': duration,
        'formats': formats,
        'subtitles': subtitles,
    }
    return info
10273d6e 587
588
6368e2e6 589class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
9afa1770
S
590 IE_NAME = 'bbc'
591 IE_DESC = 'BBC'
ed13a772 592 _VALID_URL = r'''(?x)
593 https?://(?:www\.)?(?:
594 bbc\.(?:com|co\.uk)|
595 bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd\.onion|
596 bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad\.onion
597 )/(?:[^/]+/)+(?P<id>[^/#?]+)'''
10273d6e 598
29f7c58a 599 _MEDIA_SETS = [
29f7c58a 600 'pc',
2d997542 601 'mobile-tablet-main',
d12a1a47 602 ]
10273d6e 603
604 _TESTS = [{
6a747190 605 # article with multiple videos embedded with data-playable containing vpids
10273d6e 606 'url': 'http://www.bbc.com/news/world-europe-32668511',
607 'info_dict': {
608 'id': 'world-europe-32668511',
acc86c9a 609 'title': 'Russia stages massive WW2 parade',
9afa1770 610 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 611 },
612 'playlist_count': 2,
a3bfddfa 613 }, {
6a747190 614 # article with multiple videos embedded with data-playable (more videos)
10273d6e 615 'url': 'http://www.bbc.com/news/business-28299555',
616 'info_dict': {
617 'id': 'business-28299555',
618 'title': 'Farnborough Airshow: Video highlights',
9afa1770 619 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 620 },
621 'playlist_count': 9,
9afa1770 622 'skip': 'Save time',
88ed52ae
S
623 }, {
624 # article with multiple videos embedded with `new SMP()`
6a747190 625 # broken
88ed52ae
S
626 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
627 'info_dict': {
628 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 629 'title': 'BUGGER',
88ed52ae
S
630 },
631 'playlist_count': 18,
a3bfddfa 632 }, {
6a747190 633 # single video embedded with data-playable containing vpid
10273d6e 634 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 635 'info_dict': {
636 'id': 'p02mprgb',
55ebae26 637 'ext': 'mp4',
10273d6e 638 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 639 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 640 'duration': 47,
9afa1770 641 'timestamp': 1427219242,
da92eeae 642 'upload_date': '20150324',
10273d6e 643 },
644 'params': {
9afa1770 645 # rtmp download
10273d6e 646 'skip_download': True,
647 }
a3bfddfa 648 }, {
6a747190
S
649 # article with single video embedded with data-playable containing XML playlist
650 # with direct video links as progressiveDownloadUrl (for now these are extracted)
651 # and playlist with f4m and m3u8 as streamingUrl
de939d89 652 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 653 'info_dict': {
9afa1770 654 'id': '150615_telabyad_kentin_cogu',
de939d89 655 'ext': 'mp4',
ad152e2d 656 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 657 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 658 'timestamp': 1434397334,
da92eeae 659 'upload_date': '20150615',
de939d89 660 },
661 'params': {
662 'skip_download': True,
663 }
c936d8cc 664 }, {
6a747190 665 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 666 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 667 'info_dict': {
9afa1770 668 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 669 'ext': 'mp4',
9afa1770 670 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 671 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 672 'timestamp': 1434713142,
da92eeae 673 'upload_date': '20150619',
de939d89 674 },
675 'params': {
676 'skip_download': True,
677 }
a346b1ff
S
678 }, {
679 # single video from video playlist embedded with vxp-playlist-data JSON
680 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
681 'info_dict': {
682 'id': 'p02w6qjc',
55ebae26 683 'ext': 'mp4',
a346b1ff
S
684 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
685 'duration': 56,
0bc4ee60 686 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
687 },
688 'params': {
689 'skip_download': True,
690 }
9afa1770
S
691 }, {
692 # single video story with digitalData
693 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
694 'info_dict': {
695 'id': 'p02q6gc4',
696 'ext': 'flv',
697 'title': 'Sri Lanka’s spicy secret',
698 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
699 'timestamp': 1437674293,
700 'upload_date': '20150723',
701 },
702 'params': {
703 # rtmp download
704 'skip_download': True,
705 }
706 }, {
707 # single video story without digitalData
708 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
709 'info_dict': {
710 'id': 'p018zqqg',
55ebae26 711 'ext': 'mp4',
9afa1770
S
712 'title': 'Hyundai Santa Fe Sport: Rock star',
713 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
714 'timestamp': 1415867444,
715 'upload_date': '20141113',
9afa1770
S
716 },
717 'params': {
718 # rtmp download
719 'skip_download': True,
720 }
9fb64c04
S
721 }, {
722 # single video embedded with Morph
723 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
724 'info_dict': {
725 'id': 'p041vhd0',
726 'ext': 'mp4',
727 'title': "Nigeria v Japan - Men's First Round",
728 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
729 'duration': 7980,
730 'uploader': 'BBC Sport',
731 'uploader_id': 'bbc_sport',
732 },
733 'params': {
734 # m3u8 download
735 'skip_download': True,
9fb64c04
S
736 },
737 'skip': 'Georestricted to UK',
9afa1770 738 }, {
6a747190 739 # single video with playlist.sxml URL in playlist param
9afa1770
S
740 'url': 'http://www.bbc.com/sport/0/football/33653409',
741 'info_dict': {
742 'id': 'p02xycnp',
55ebae26 743 'ext': 'mp4',
9afa1770 744 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 745 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
746 'duration': 140,
747 },
748 'params': {
749 # rtmp download
750 'skip_download': True,
751 }
b5d48cb1 752 }, {
6a747190 753 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
754 'url': 'http://www.bbc.com/sport/0/football/34475836',
755 'info_dict': {
756 'id': '34475836',
450b233c 757 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 758 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
759 },
760 'playlist_count': 3,
450b233c
S
761 }, {
762 # school report article with single video
763 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
764 'info_dict': {
765 'id': '35744779',
766 'title': 'School which breaks down barriers in Jerusalem',
767 },
768 'playlist_count': 1,
9afa1770
S
769 }, {
770 # single video with playlist URL from weather section
771 'url': 'http://www.bbc.com/weather/features/33601775',
772 'only_matching': True,
773 }, {
774 # custom redirection to www.bbc.com
1bdae7d3 775 # also, video with window.__INITIAL_DATA__
9afa1770 776 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
1bdae7d3 777 'info_dict': {
778 'id': 'p02xzws1',
779 'ext': 'mp4',
780 'title': "Pluto may have 'nitrogen glaciers'",
781 'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
782 'thumbnail': r're:https?://.+/.+\.jpg',
783 'timestamp': 1437785037,
784 'upload_date': '20150725',
785 },
50e93e03 786 }, {
787 # video with window.__INITIAL_DATA__ and value as JSON string
788 'url': 'https://www.bbc.com/news/av/world-europe-59468682',
789 'info_dict': {
790 'id': 'p0b71qth',
791 'ext': 'mp4',
792 'title': 'Why France is making this woman a national hero',
793 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
794 'thumbnail': r're:https?://.+/.+\.jpg',
795 'timestamp': 1638230731,
796 'upload_date': '20211130',
797 },
a1cf3e38
S
798 }, {
799 # single video article embedded with data-media-vpid
800 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
801 'only_matching': True,
6d155707 802 }, {
50e93e03 803 # bbcthreeConfig
6d155707
S
804 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
805 'info_dict': {
806 'id': 'p06556y7',
807 'ext': 'mp4',
50e93e03 808 'title': 'Things Not To Say to people that live on council estates',
809 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
810 'duration': 360,
811 'thumbnail': r're:https?://.+/.+\.jpg',
6d155707 812 },
b96b4be4
RA
813 }, {
814 # window.__PRELOADED_STATE__
815 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
816 'info_dict': {
817 'id': 'b0b9z4vz',
818 'ext': 'mp4',
819 'title': 'Prom 6: An American in Paris and Turangalila',
820 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
821 'uploader': 'Radio 3',
822 'uploader_id': 'bbc_radio_three',
823 },
373941c5
S
824 }, {
825 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
826 'info_dict': {
827 'id': 'p06w9tws',
828 'ext': 'mp4',
829 'title': 'md5:2fabf12a726603193a2879a055f72514',
830 'description': 'Learn English words and phrases from this story',
831 },
832 'add_ie': [BBCCoUkIE.ie_key()],
3721515b 833 }, {
834 # BBC Reel
835 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
836 'info_dict': {
837 'id': 'p07c6sb9',
838 'ext': 'mp4',
839 'title': 'How positive thinking is harming your happiness',
840 'alt_title': 'The downsides of positive thinking',
841 'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
842 'duration': 235,
843 'thumbnail': r're:https?://.+/p07c9dsr.jpg',
844 'upload_date': '20190604',
845 'categories': ['Psychology'],
846 },
ed13a772 847 }, { # onion routes
848 'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
849 'only_matching': True,
850 }, {
851 'url': 'https://www.bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad.onion/sport/av/football/63195681',
852 'only_matching': True,
10273d6e 853 }]
854
9afa1770
S
@classmethod
def suitable(cls, url):
    """Return whether this generic BBC extractor should handle *url*.

    URLs that any of the more specific BBC extractors listed below accept
    are rejected here; everything else is decided by the inherited
    ``suitable`` check.
    """
    specialised_extractors = (
        BBCCoUkIE,
        BBCCoUkArticleIE,
        BBCCoUkIPlayerEpisodesIE,
        BBCCoUkIPlayerGroupIE,
        BBCCoUkPlaylistIE,
    )
    for extractor in specialised_extractors:
        if extractor.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
860
861 def _extract_from_media_meta(self, media_meta, video_id):
862 # Direct links to media in media metadata (e.g.
863 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
864 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
865 source_files = media_meta.get('sourceFiles')
866 if source_files:
867 return [{
868 'url': f['url'],
869 'format_id': format_id,
870 'ext': f.get('encoding'),
871 'tbr': float_or_none(f.get('bitrate'), 1000),
872 'filesize': int_or_none(f.get('filesize')),
873 } for format_id, f in source_files.items() if f.get('url')], []
874
875 programme_id = media_meta.get('externalId')
876 if programme_id:
877 return self._download_media_selector(programme_id)
878
879 # Process playlist.sxml as legacy playlist
880 href = media_meta.get('href')
881 if href:
882 playlist = self._download_legacy_playlist_url(href)
883 _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
884 return formats, subtitles
885
886 return [], []
887
baf39a1a
S
888 def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
889 programme_id, title, description, duration, formats, subtitles = \
890 self._process_legacy_playlist_url(url, playlist_id)
baf39a1a
S
891 return {
892 'id': programme_id,
893 'title': title,
894 'description': description,
895 'duration': duration,
896 'timestamp': timestamp,
897 'formats': formats,
898 'subtitles': subtitles,
899 }
900
def _real_extract(self, url):
    """Extract the media embedded in a generic BBC page.

    The page is downloaded once and then probed for a series of embed
    styles, in this order; the first one that yields media is returned:

    1. ``playlist.sxml`` params and ``data-playable`` attributes
    2. a ``data-pid`` video group (delegated to ``BBCCoUkIE``)
    3. a single vpid in the markup
    4. BBC Reel ``initial-data`` script
    5. ``Morph.setPayload`` lead media
    6. ``window.__PRELOADED_STATE__``
    7. ``bbcthreeConfig``
    8. ``window.__INITIAL_DATA__``
    9. ``new SMP(...)`` / ``setPlaylist(...)`` embed URLs
    10. ``data-media-meta`` / ``mediaAssetPage.init`` / ``vxp-playlist-data``

    Returns either a single-video info dict, a URL result or a playlist
    result, depending on which embed style matched.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title') or re.sub(
        r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
    if data_playables:
        for _, data_playable_json in data_playables:
            data_playable = self._parse_json(
                unescapeHTML(data_playable_json), playlist_id, fatal=False)
            if not data_playable:
                continue
            settings = data_playable.get('settings', {})
            if settings:
                # data-playable with video vpid in settings.playlistObject.items (e.g.
                # http://www.bbc.com/news/world-us-canada-34473351)
                playlist_object = settings.get('playlistObject', {})
                if playlist_object:
                    items = playlist_object.get('items')
                    if items and isinstance(items, list):
                        title = playlist_object['title']
                        description = playlist_object.get('summary')
                        duration = int_or_none(items[0].get('duration'))
                        programme_id = items[0].get('vpid')
                        formats, subtitles = self._download_media_selector(programme_id)
                        entries.append({
                            'id': programme_id,
                            'title': title,
                            'description': description,
                            'timestamp': timestamp,
                            'duration': duration,
                            'formats': formats,
                            'subtitles': subtitles,
                        })
            else:
                # data-playable without vpid but with a playlist.sxml URLs
                # in otherSettings.playlist (e.g.
                # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
                playlist = data_playable.get('otherSettings', {}).get('playlist', {})
                if playlist:
                    entry = None
                    for key in ('streaming', 'progressiveDownload'):
                        playlist_url = playlist.get('%sUrl' % key)
                        if not playlist_url:
                            continue
                        try:
                            info = self._extract_from_playlist_sxml(
                                playlist_url, playlist_id, timestamp)
                            if not entry:
                                entry = info
                            else:
                                # Merge the second playlist into the first entry
                                entry['title'] = info['title']
                                entry['formats'].extend(info['formats'])
                        except ExtractorError as e:
                            # Some playlist URL may fail with 500, at the same time
                            # the other one may work fine (e.g.
                            # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
                                continue
                            raise
                    if entry:
                        entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
    group_id = self._search_regex(
        r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
        webpage, 'group id', default=None)
    if group_id:
        return self.url_result(
            'https://www.bbc.co.uk/programmes/%s' % group_id,
            ie=BBCCoUkIE.ie_key())

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # The parse is non-fatal, so fall back to an empty dict when the
        # matched text turns out not to be valid JSON.
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = digital_data.get('page', {}).get('pageInfo', {})
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
    initial_data = self._parse_json(self._html_search_regex(
        r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
        webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
    if initial_data:
        init_data = try_get(
            initial_data, lambda x: x['initData']['items'][0], dict) or {}
        smp_data = init_data.get('smpData') or {}
        clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
        version_id = clip_data.get('versionID')
        if version_id:
            title = smp_data['title']
            formats, subtitles = self._download_media_selector(version_id)
            image_url = smp_data.get('holdingImageURL')
            display_date = init_data.get('displayDate')
            topic_title = init_data.get('topicTitle')

            return {
                'id': version_id,
                'title': title,
                'formats': formats,
                'alt_title': init_data.get('shortTitle'),
                'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
                'description': smp_data.get('summary') or init_data.get('shortSummary'),
                'upload_date': display_date.replace('-', '') if display_date else None,
                'subtitles': subtitles,
                'duration': int_or_none(clip_data.get('duration')),
                'categories': [topic_title] if topic_title else None,
            }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # There are several setPayload calls may be present but the video
    # seems to be always related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    preload_state = self._parse_json(self._search_regex(
        r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
        'preload state', default='{}'), playlist_id, fatal=False)
    if preload_state:
        current_programme = preload_state.get('programmes', {}).get('current') or {}
        programme_id = current_programme.get('id')
        if current_programme and programme_id and current_programme.get('type') == 'playable_item':
            title = current_programme.get('titles', {}).get('tertiary') or playlist_title
            formats, subtitles = self._download_media_selector(programme_id)
            synopses = current_programme.get('synopses') or {}
            network = current_programme.get('network') or {}
            duration = int_or_none(
                current_programme.get('duration', {}).get('value'))
            thumbnail = None
            image_url = current_programme.get('image_url')
            if image_url:
                thumbnail = image_url.replace('{recipe}', 'raw')
            return {
                'id': programme_id,
                'title': title,
                'description': dict_get(synopses, ('long', 'medium', 'short')),
                'thumbnail': thumbnail,
                'duration': duration,
                'uploader': network.get('short_title'),
                'uploader_id': network.get('id'),
                'formats': formats,
                'subtitles': subtitles,
            }

    bbc3_config = self._parse_json(
        self._search_regex(
            r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
            'bbcthree config', default='{}'),
        playlist_id, transform_source=js_to_json, fatal=False) or {}
    payload = bbc3_config.get('payload') or {}
    if payload:
        clip = payload.get('currentClip') or {}
        clip_vpid = clip.get('vpid')
        clip_title = clip.get('title')
        if clip_vpid and clip_title:
            formats, subtitles = self._download_media_selector(clip_vpid)
            return {
                'id': clip_vpid,
                'title': clip_title,
                'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
                'description': clip.get('description'),
                'duration': parse_duration(clip.get('duration')),
                'formats': formats,
                'subtitles': subtitles,
            }
        bbc3_playlist = try_get(
            payload, lambda x: x['content']['bbcMedia']['playlist'],
            dict)
        if bbc3_playlist:
            playlist_title = bbc3_playlist.get('title') or playlist_title
            thumbnail = bbc3_playlist.get('holdingImageURL')
            entries = []
            for bbc3_item in bbc3_playlist['items']:
                programme_id = bbc3_item.get('versionID')
                if not programme_id:
                    continue
                formats, subtitles = self._download_media_selector(programme_id)
                entries.append({
                    'id': programme_id,
                    'title': playlist_title,
                    'thumbnail': thumbnail,
                    'timestamp': timestamp,
                    'formats': formats,
                    'subtitles': subtitles,
                })
            return self.playlist_result(
                entries, playlist_id, playlist_title, playlist_description)

    # window.__INITIAL_DATA__ is either a JSON object or a JSON string
    # literal that itself contains the JSON object.
    initial_data = self._search_regex(
        r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
        'quoted preload state', default=None)
    if initial_data is None:
        # The default must be a string: it is handed to the JSON parser below
        initial_data = self._search_regex(
            r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
            'preload state', default='{}')
    else:
        # Quoted variant: decode the string literal first
        initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
    # Only text can be parsed; a failed non-fatal decode above leaves None
    if isinstance(initial_data, str):
        initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
    else:
        initial_data = None
    if initial_data:
        def parse_media(media):
            # Append one entry per playable item of a media model to `entries`
            if not media:
                return
            for item in (try_get(media, lambda x: x['media']['items'], list) or []):
                item_id = item.get('id')
                item_title = item.get('title')
                if not (item_id and item_title):
                    continue
                formats, subtitles = self._download_media_selector(item_id)
                item_desc = None
                blocks = try_get(media, lambda x: x['summary']['blocks'], list)
                if blocks:
                    summary = []
                    for block in blocks:
                        text = try_get(block, lambda x: x['model']['text'], compat_str)
                        if text:
                            summary.append(text)
                    if summary:
                        item_desc = '\n\n'.join(summary)
                item_time = None
                for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
                    if try_get(meta, lambda x: x['label']) == 'Published':
                        item_time = unified_timestamp(meta.get('timestamp'))
                        break
                entries.append({
                    'id': item_id,
                    'title': item_title,
                    'thumbnail': item.get('holdingImageUrl'),
                    'formats': formats,
                    'subtitles': subtitles,
                    'timestamp': item_time,
                    'description': strip_or_none(item_desc),
                })
        for resp in (initial_data.get('data') or {}).values():
            name = resp.get('name')
            if name == 'media-experience':
                parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
            elif name == 'article':
                for block in (try_get(resp,
                                      (lambda x: x['data']['blocks'],
                                       lambda x: x['data']['content']['model']['blocks'],),
                                      list) or []):
                    if block.get('type') not in ['media', 'video']:
                        continue
                    parse_media(block.get('model'))
        return self.playlist_result(
            entries, playlist_id, playlist_title, playlist_description)

    def extract_all(pattern):
        # Parse every regex match as JSON and drop the ones that fail
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # Non-fatal parse: treat unparsable data as "no videos"
            media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in media_asset_page.get('videos', {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats and not self.get_param('ignore_no_formats'):
            continue

        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1331
1332
class BBCCoUkArticleIE(InfoExtractor):
    """Playlist extractor for ``bbc.co.uk/programmes/articles/<id>`` pages.

    Each clip embedded in the article becomes one URL entry of the
    resulting playlist.
    """
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist of the clips referenced by the article at *url*."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # The description lookup may come back empty (None) when the page
        # has no og:description; strip_or_none passes that through instead
        # of raising AttributeError on None.strip().
        description = strip_or_none(self._og_search_description(webpage))

        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1361
1362
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared logic for paginated bbc.co.uk playlist pages.

    Subclasses provide ``_VIDEO_ID_TEMPLATE`` (a regex template taking the
    programme id pattern), ``_URL_TEMPLATE`` (a URL template taking a video
    id) and ``_extract_title_and_description(webpage)``.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a URL result for every video id found on the playlist pages.

        When *url* carries an explicit ``page`` query parameter only that
        page is processed; otherwise the "next" pagination link is followed
        until there is none.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # The first page is already downloaded, so numbering starts at 2
        for page_num in itertools.count(2):
            for video_id in re.findall(
                    self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # Pass the notes by keyword: the fourth positional parameter is
            # the error note, which must be a message, not the page number.
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                note='Downloading page %d' % page_num,
                errnote='Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Return a playlist result whose entries are produced lazily."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
ded7511a
S
1393
1394
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
    """Common code for the iPlayer "episodes" and "group" playlist extractors.

    Subclasses supply ``_PAGE_SIZE``, ``_DESCRIPTION_KEY`` and the
    ``_call_api`` / ``_get_elements`` / ``_get_episode`` /
    ``_get_episode_image`` / ``_get_episode_field`` /
    ``_get_playlist_data`` / ``_get_playlist_title`` hooks used below.
    """
    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    @staticmethod
    def _get_default(episode, key, default_key='default'):
        """Return ``episode[key][default_key]`` via ``try_get``."""
        def lookup(obj):
            return obj[key][default_key]

        return try_get(episode, lookup)

    def _get_description(self, data):
        """Pick the largest available synopsis stored under ``_DESCRIPTION_KEY``."""
        synopses = data.get(self._DESCRIPTION_KEY) or {}
        return dict_get(synopses, ('large', 'medium', 'small'))

    def _fetch_page(self, programme_id, per_page, series_id, page):
        """Yield URL-type entries for one zero-based *page* of API results."""
        # The API is called with a page number one higher than *page*
        api_data = self._call_api(programme_id, per_page, page + 1, series_id)
        for element in self._get_elements(api_data):
            episode = self._get_episode(element)
            episode_id = episode.get('id')
            if not episode_id:
                continue
            image = self._get_episode_image(episode)
            thumbnail = image.replace('{recipe}', 'raw') if image else None
            category = self._get_default(episode, 'labels', 'category')
            yield {
                '_type': 'url',
                'id': episode_id,
                'title': self._get_episode_field(episode, 'subtitle'),
                'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                'thumbnail': thumbnail,
                'description': self._get_description(episode),
                'categories': [category] if category else None,
                'series': self._get_episode_field(episode, 'title'),
                'ie_key': BBCCoUkIE.ie_key(),
            }

    def _real_extract(self, url):
        """Return the playlist for *url*, honouring ``page`` and ``seriesId``."""
        pid = self._match_id(url)
        query = parse_qs(url)
        series_id = query.get('seriesId', [None])[0]
        page = query.get('page', [None])[0]

        if page:
            # Explicit page requested: fetch just that one, 36 items per page
            fetch_page = functools.partial(self._fetch_page, pid, 36, series_id)
            entries = fetch_page(int(page) - 1)
        else:
            fetch_page = functools.partial(self._fetch_page, pid, self._PAGE_SIZE, series_id)
            entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)

        # A one-item request is enough to read the playlist level metadata
        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
        playlist_title = self._get_playlist_title(playlist_data)
        playlist_description = self._get_description(playlist_data)
        return self.playlist_result(entries, pid, playlist_title, playlist_description)
1443
1444
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/iplayer/episodes/<pid>`` playlists."""
    IE_NAME = 'bbc.co.uk:iplayer:episodes'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
    _PAGE_SIZE = 100
    _DESCRIPTION_KEY = 'synopsis'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
        },
        'playlist_mincount': 8,
    }, {
        # all seasons
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 10,
    }, {
        # explicit season
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 5,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 37,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 1,
    }]

    def _get_episode_image(self, episode):
        """Return the ``default`` variant of the episode's ``image`` field."""
        return self._get_default(episode, 'image')

    def _get_episode_field(self, episode, field):
        """Return the ``default`` variant of an arbitrary episode *field*."""
        return self._get_default(episode, field)

    @staticmethod
    def _get_elements(data):
        """Return the result list of a programme API response."""
        entities = data['entities']
        return entities['results']

    @staticmethod
    def _get_episode(element):
        """Return the episode dict of a result element (empty if absent)."""
        return element.get('episode') or {}

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """POST the programme query and return its ``data.programme`` part."""
        variables = {'id': pid, 'page': page, 'perPage': per_page}
        # The series filter is only sent when one was requested
        if series_id:
            variables['sliceId'] = series_id
        request_body = json.dumps({
            'id': '5692d93d5aac8d796a0305e895e61551',
            'variables': variables,
        })
        response = self._download_json(
            'https://graph.ibl.api.bbc.co.uk/', pid,
            headers={'Content-Type': 'application/json'},
            data=request_body.encode('utf-8'))
        return response['data']['programme']

    @staticmethod
    def _get_playlist_data(data):
        """The programme response itself carries the playlist metadata."""
        return data

    def _get_playlist_title(self, data):
        """Return the ``default`` variant of the playlist's ``title``."""
        return self._get_default(data, 'title')
1532
1533
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/iplayer/group/<pid>`` playlists."""
    IE_NAME = 'bbc.co.uk:iplayer:group'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
    _PAGE_SIZE = 200
    _DESCRIPTION_KEY = 'synopses'
    _TESTS = [{
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 47,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 11,
    }]

    def _get_episode_image(self, episode):
        """Return the ``standard`` variant of the episode's ``images`` field."""
        return self._get_default(episode, 'images', 'standard')

    def _get_episode_field(self, episode, field):
        """Group episodes store their fields as plain values."""
        return episode.get(field)

    @staticmethod
    def _get_elements(data):
        """Return the element list of a group episodes response."""
        return data['elements']

    @staticmethod
    def _get_episode(element):
        """Each element already is the episode dict."""
        return element

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Fetch one page of group episodes; *series_id* is not used here."""
        api_url = 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid
        paging = {
            'page': page,
            'per_page': per_page,
        }
        response = self._download_json(api_url, pid, query=paging)
        return response['group_episodes']

    @staticmethod
    def _get_playlist_data(data):
        """Playlist metadata is stored under the ``group`` key."""
        return data['group']

    def _get_playlist_title(self, data):
        """Return the plain ``title`` value of the group."""
        return data.get('title')
ded7511a
S
1596
1597
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlists under ``bbc.co.uk/programmes/<pid>/(episodes|broadcasts|clips)``."""
    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Templates consumed by the base class when collecting entries
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return the page's Open Graph ``(title, description)`` pair."""
        og_title = self._og_search_title(webpage, fatal=False)
        og_description = self._og_search_description(webpage)
        return og_title, og_description