]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/bbc.py
[ie/orf:on] Improve extraction (#9677)
[yt-dlp.git] / yt_dlp / extractor / bbc.py
CommitLineData
1418a043 1import functools
254e64a2 2import itertools
1418a043 3import json
f0228f56 4import re
ac668111 5import xml.etree.ElementTree
082c6c86 6
f13b1e7d 7from .common import InfoExtractor
3d2623a8 8from ..compat import compat_str, compat_urlparse
9from ..networking.exceptions import HTTPError
8683b4d8 10from ..utils import (
3721515b 11 ExtractorError,
1418a043 12 OnDemandPagedList,
97067db2 13 clean_html,
9fb64c04 14 dict_get,
9afa1770 15 float_or_none,
97067db2 16 get_element_by_class,
8683b4d8 17 int_or_none,
eda0e415 18 join_nonempty,
6d155707 19 js_to_json,
9afa1770
S
20 parse_duration,
21 parse_iso8601,
4dfbf869 22 parse_qs,
1bdae7d3 23 strip_or_none,
eda0e415 24 traverse_obj,
9fb64c04 25 try_get,
dab062fb 26 unescapeHTML,
1bdae7d3 27 unified_timestamp,
f0228f56 28 url_or_none,
97067db2
S
29 urlencode_postdata,
30 urljoin,
8683b4d8 31)
082c6c86 32
d12a1a47 33
f13b1e7d 34class BBCCoUkIE(InfoExtractor):
082c6c86 35 IE_NAME = 'bbc.co.uk'
2e3fd9ec 36 IE_DESC = 'BBC iPlayer'
50e93e03 37 _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
f20a11ed
S
38 _VALID_URL = r'''(?x)
39 https?://
40 (?:www\.)?bbc\.co\.uk/
41 (?:
42 programmes/(?!articles/)|
43 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
72d256c4 44 music/(?:clips|audiovideo/popular)[/#]|
d3d45e0a
S
45 radio/player/|
46 events/[^/]+/play/[^/]+/
f20a11ed 47 )
ded7511a 48 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
f20a11ed 49 ''' % _ID_REGEX
bfd973ec 50 _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']
082c6c86 51
97067db2
S
52 _LOGIN_URL = 'https://account.bbc.com/signin'
53 _NETRC_MACHINE = 'bbc'
54
29f7c58a 55 _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
56 _MEDIA_SETS = [
26ccc68b
S
57 # Provides HQ HLS streams with even better quality than pc mediaset but fails
58 # with geolocation in some cases even when it's not geo restricted at all (e.g.
d781e293 59 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
29f7c58a 60 'iptv-all',
61 'pc',
d12a1a47 62 ]
a8b081a0 63
e6174ee9
S
64 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
65
2e3fd9ec
S
66 _TESTS = [
67 {
f2d0fc68 68 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 69 'info_dict': {
f2d0fc68 70 'id': 'b039d07m',
b1ea6802 71 'ext': 'flv',
acc86c9a 72 'title': 'Kaleidoscope, Leonard Cohen',
c4914185 73 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
74 },
75 'params': {
b1ea6802 76 # rtmp download
2e3fd9ec
S
77 'skip_download': True,
78 }
082c6c86 79 },
2e3fd9ec
S
80 {
81 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
82 'info_dict': {
83 'id': 'b00yng1d',
84 'ext': 'flv',
85 'title': 'The Man in Black: Series 3: The Printed Name',
86 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
87 'duration': 1800,
88 },
89 'params': {
90 # rtmp download
91 'skip_download': True,
c7f0177f
S
92 },
93 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
94 },
95 {
96 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
97 'info_dict': {
98 'id': 'b00yng1d',
99 'ext': 'flv',
17968e44 100 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 101 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 102 'duration': 5100,
2e3fd9ec
S
103 },
104 'params': {
105 # rtmp download
106 'skip_download': True,
107 },
b1ea6802 108 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
109 },
110 {
111 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
112 'info_dict': {
113 'id': 'b03k3pb7',
114 'ext': 'flv',
115 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
116 'description': '2. Invasion',
117 'duration': 3600,
118 },
119 'params': {
120 # rtmp download
121 'skip_download': True,
122 },
b1ea6802 123 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
124 }, {
125 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
126 'info_dict': {
127 'id': 'b04v209v',
128 'ext': 'flv',
129 'title': 'Pete Tong, The Essential New Tune Special',
130 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
131 'duration': 10800,
132 },
133 'params': {
134 # rtmp download
135 'skip_download': True,
a3ef0e1c
YCH
136 },
137 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 138 }, {
5aa535c3 139 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
140 'note': 'Audio',
141 'info_dict': {
5aa535c3 142 'id': 'p022h44j',
b1ea6802 143 'ext': 'flv',
5aa535c3
S
144 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
145 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
146 'duration': 227,
c7e67594
S
147 },
148 'params': {
b1ea6802 149 # rtmp download
c7e67594
S
150 'skip_download': True,
151 }
152 }, {
153 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
154 'note': 'Video',
155 'info_dict': {
156 'id': 'p025c103',
b1ea6802 157 'ext': 'flv',
c7e67594
S
158 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
159 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
160 'duration': 226,
161 },
162 'params': {
b1ea6802 163 # rtmp download
c7e67594
S
164 'skip_download': True,
165 }
e68ae99a
S
166 }, {
167 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
168 'info_dict': {
169 'id': 'p02n76xf',
170 'ext': 'flv',
171 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
172 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
173 'duration': 3540,
174 },
175 'params': {
176 # rtmp download
177 'skip_download': True,
178 },
b1ea6802 179 'skip': 'geolocation',
25fa8d66
YCH
180 }, {
181 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
182 'info_dict': {
183 'id': 'b05zmgw1',
184 'ext': 'flv',
185 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
186 'title': 'Royal Academy Summer Exhibition',
187 'duration': 3540,
188 },
189 'params': {
190 # rtmp download
191 'skip_download': True,
192 },
b1ea6802 193 'skip': 'geolocation',
54914380
S
194 }, {
195 # iptv-all mediaset fails with geolocation however there is no geo restriction
196 # for this programme at all
5aa535c3 197 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 198 'info_dict': {
5aa535c3 199 'id': 'b06rkms3',
54914380 200 'ext': 'flv',
5aa535c3
S
201 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
202 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
203 },
204 'params': {
205 # rtmp download
206 'skip_download': True,
207 },
b1ea6802 208 'skip': 'Now it\'s really geo-restricted',
1ac6e794 209 }, {
067aa17e 210 # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
1ac6e794
S
211 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
212 'info_dict': {
213 'id': 'p028bfkj',
b1ea6802 214 'ext': 'flv',
1ac6e794
S
215 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
216 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
217 },
218 'params': {
b1ea6802 219 # rtmp download
1ac6e794
S
220 'skip_download': True,
221 },
31763975
S
222 }, {
223 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
224 'only_matching': True,
c7e67594
S
225 }, {
226 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
227 'only_matching': True,
0692ef86
S
228 }, {
229 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
230 'only_matching': True,
f20a11ed
S
231 }, {
232 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
233 'only_matching': True,
72d256c4
S
234 }, {
235 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
236 'only_matching': True,
53647dfd
S
237 }, {
238 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
239 'only_matching': True,
6f356cbb
S
240 }, {
241 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
242 'only_matching': True,
243 }, {
244 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
245 'only_matching': True,
72d256c4 246 }]
2e3fd9ec 247
def _perform_login(self, username, password):
    """Sign in to the BBC account service with the given credentials.

    Fetches the sign-in page, submits its form fields together with the
    username and password, and treats a final URL that still contains the
    login URL as a failed login.

    Raises:
        ExtractorError: if the login did not succeed. The message includes
            the text of the page's ``form-message`` element when present.
    """
    signin_url = self._LOGIN_URL
    signin_page = self._download_webpage(
        signin_url, None, 'Downloading signin page')

    # Start from the fields collected by _hidden_inputs, then add the credentials
    form_fields = self._hidden_inputs(signin_page)
    form_fields['username'] = username
    form_fields['password'] = password

    # The form action may be relative; fall back to the login URL itself
    form_action = self._search_regex(
        r'<form[^>]+action=(["\'])(?P<url>.+?)\1', signin_page,
        'post url', default=signin_url, group='url')

    response, urlh = self._download_webpage_handle(
        urljoin(signin_url, form_action), None, 'Logging in',
        data=urlencode_postdata(form_fields),
        headers={'Referer': signin_url})

    if signin_url not in urlh.url:
        return

    # Still on the sign-in page: report the page's own message if there is one
    error = clean_html(get_element_by_class('form-message', response))
    if error:
        raise ExtractorError(
            'Unable to login: %s' % error, expected=True)
    raise ExtractorError('Unable to log in')
273
d12a1a47
S
class MediaSelectionError(Exception):
    """Error reported by a media selection response.

    The error code taken from the response's ``result`` field is exposed
    as the ``id`` attribute.
    """

    def __init__(self, id):
        super().__init__(id)
        self.id = id
277
2e3fd9ec
S
278 def _extract_asx_playlist(self, connection, programme_id):
279 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
280 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
281
2e3fd9ec 282 def _extract_items(self, playlist):
e6174ee9
S
283 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
284
2e3fd9ec 285 def _extract_medias(self, media_selection):
29f7c58a 286 error = media_selection.get('result')
287 if error:
288 raise BBCCoUkIE.MediaSelectionError(error)
289 return media_selection.get('media') or []
2e3fd9ec
S
290
291 def _extract_connections(self, media):
29f7c58a 292 return media.get('connection') or []
2e3fd9ec 293
def _get_subtitles(self, media, programme_id):
    """Build the subtitles dict for a captions *media* entry.

    Connections are tried in order. The first one whose href is a valid
    URL and whose captions download parses as XML is used as the single
    ``en`` TTML track. Returns an empty dict if no connection qualifies.
    """
    for connection in self._extract_connections(media):
        cc_url = url_or_none(connection.get('href'))
        if not cc_url:
            continue
        # Non-fatal download: anything but a parsed XML element means "try the next one"
        captions = self._download_xml(
            cc_url, programme_id, 'Downloading captions', fatal=False)
        if isinstance(captions, xml.etree.ElementTree.Element):
            return {
                'en': [
                    {
                        'url': connection.get('href'),
                        'ext': 'ttml',
                    },
                ],
            }
    return {}
082c6c86 312
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Re-raise a media selection error as an expected ExtractorError.

    The message combines this extractor's ``IE_NAME`` with the error id.
    """
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
317
c056efa2 318 def _download_media_selector(self, programme_id):
d12a1a47 319 last_exception = None
c919b68f 320 formats, subtitles = [], {}
29f7c58a 321 for media_set in self._MEDIA_SETS:
d12a1a47 322 try:
c919b68f 323 fmts, subs = self._download_media_selector_url(
29f7c58a 324 self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
c919b68f 325 formats.extend(fmts)
326 if subs:
327 self._merge_subtitles(subs, target=subtitles)
d12a1a47 328 except BBCCoUkIE.MediaSelectionError as e:
d781e293 329 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
d12a1a47
S
330 last_exception = e
331 continue
332 self._raise_extractor_error(e)
c919b68f 333 if last_exception:
334 if formats or subtitles:
335 self.report_warning(f'{self.IE_NAME} returned error: {last_exception.id}')
336 else:
337 self._raise_extractor_error(last_exception)
338 return formats, subtitles
9afa1770
S
339
340 def _download_media_selector_url(self, url, programme_id=None):
29f7c58a 341 media_selection = self._download_json(
342 url, programme_id, 'Downloading media selection JSON',
9283d4ea 343 expected_status=(403, 404))
9afa1770 344 return self._process_media_selector(media_selection, programme_id)
082c6c86 345
def _process_media_selector(self, media_selection, programme_id):
    """Turn a media selection document into formats and subtitles.

    ``video`` and ``audio`` media entries contribute formats, one or more
    per connection, depending on the connection:

    * supplier ``asx``: one format per reference in the ASX playlist;
    * transfer format ``dash`` / ``hls`` / ``hds``: formats from the
      matching manifest extractor (non-fatal);
    * anything else: a single direct ``http``/``https`` or ``rtmp`` format.
      Connections with any other protocol are skipped.

    ``captions`` media entries are passed to ``extract_subtitles``.
    Connections whose href was already processed are skipped.

    Returns:
        A ``(formats, subtitles)`` tuple; ``subtitles`` is ``None`` when
        the document has no ``captions`` entry.

    Raises:
        BBCCoUkIE.MediaSelectionError: propagated from ``_extract_medias``.
    """
    formats = []
    subtitles = None
    urls = []  # hrefs already processed, used to skip duplicate connections

    for media in self._extract_medias(media_selection):
        kind = media.get('kind')
        if kind == 'captions':
            subtitles = self.extract_subtitles(media, programme_id)
            continue
        if kind not in ('video', 'audio'):
            continue

        bitrate = int_or_none(media.get('bitrate'))
        encoding = media.get('encoding')
        width = int_or_none(media.get('width'))
        height = int_or_none(media.get('height'))
        file_size = int_or_none(media.get('media_file_size'))

        for connection in self._extract_connections(media):
            href = connection.get('href')
            if href in urls:
                continue
            if href:
                urls.append(href)
            conn_kind = connection.get('kind')
            protocol = connection.get('protocol')
            supplier = connection.get('supplier')
            transfer_format = connection.get('transferFormat')
            format_id = supplier or conn_kind or protocol

            # ASX playlist
            if supplier == 'asx':
                for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
                    formats.append({
                        'url': ref,
                        'format_id': 'ref%s_%s' % (i, format_id),
                    })
            elif transfer_format == 'dash':
                formats.extend(self._extract_mpd_formats(
                    href, programme_id, mpd_id=format_id, fatal=False))
            elif transfer_format == 'hls':
                # TODO: let expected_status be passed into _extract_xxx_formats() instead
                try:
                    fmts = self._extract_m3u8_formats(
                        href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                        m3u8_id=format_id, fatal=False)
                except ExtractorError as e:
                    # Only HTTP 403/404 failures are tolerated here
                    if not (isinstance(e.exc_info[1], HTTPError)
                            and e.exc_info[1].status in (403, 404)):
                        raise
                    fmts = []
                formats.extend(fmts)
            elif transfer_format == 'hds':
                formats.extend(self._extract_f4m_formats(
                    href, programme_id, f4m_id=format_id, fatal=False))
            else:
                # Skip unsupported protocols before touching format_id: when
                # supplier, kind and protocol are all missing, format_id is
                # None and appending the bitrate suffix would raise TypeError.
                if protocol not in ('http', 'https', 'rtmp'):
                    continue
                if not supplier and bitrate:
                    format_id += '-%d' % bitrate
                fmt = {
                    'format_id': format_id,
                    'filesize': file_size,
                }
                if kind == 'video':
                    fmt.update({
                        'width': width,
                        'height': height,
                        'tbr': bitrate,
                        'vcodec': encoding,
                    })
                else:
                    fmt.update({
                        'abr': bitrate,
                        'acodec': encoding,
                        'vcodec': 'none',
                    })
                if protocol in ('http', 'https'):
                    # Direct link
                    fmt.update({
                        'url': href,
                    })
                else:
                    # rtmp
                    application = connection.get('application', 'ondemand')
                    auth_string = connection.get('authString')
                    identifier = connection.get('identifier')
                    server = connection.get('server')
                    fmt.update({
                        'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                        'play_path': identifier,
                        'app': '%s?%s' % (application, auth_string),
                        'page_url': 'http://www.bbc.co.uk',
                        'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                        'rtmp_live': False,
                        'ext': 'flv',
                    })
                formats.append(fmt)
    return formats, subtitles
2e3fd9ec 440
ae6986fb
S
def _download_playlist(self, playlist_id):
    """Resolve *playlist_id* through the programmes playlist JSON.

    Formats and subtitles of every ``programme`` / ``radioProgramme`` item
    in every available version are merged. Each format is annotated with
    the version's types, and audio-described versions get a lowered
    language preference. If the playlist JSON request fails with HTTP 404,
    the legacy XML playlist is used instead.

    Returns:
        A ``(programme_id, title, description, duration, formats,
        subtitles)`` tuple; the scalar fields come from the last
        programme item processed.

    Raises:
        ExtractorError: if the playlist contains no programme item, or
            for any download error other than the HTTP 404 above.
    """
    try:
        playlist = self._download_json(
            'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
            playlist_id, 'Downloading playlist JSON')
        formats = []
        subtitles = {}
        # Pre-bind the result fields so that a playlist without versions or
        # programme items cannot trigger UnboundLocalError at return time.
        programme_id = title = description = duration = None
        found_programme = False

        for version in playlist.get('allAvailableVersions', []):
            smp_config = version['smpConfig']
            title = smp_config['title']
            description = smp_config['summary']
            for item in smp_config['items']:
                kind = item['kind']
                if kind not in ('programme', 'radioProgramme'):
                    continue
                found_programme = True
                programme_id = item.get('vpid')
                duration = int_or_none(item.get('duration'))
                version_formats, version_subtitles = self._download_media_selector(programme_id)
                types = version['types']
                for f in version_formats:
                    f['format_note'] = ', '.join(types)
                    if any('AudioDescribed' in x for x in types):
                        f['language_preference'] = -10
                formats += version_formats
                for tag, subformats in (version_subtitles or {}).items():
                    subtitles.setdefault(tag, []).extend(subformats)

        if not found_programme:
            # No cause is attached, so the handler below re-raises this as is
            raise ExtractorError(
                'No playable programme found in playlist %s' % playlist_id,
                expected=True)

        return programme_id, title, description, duration, formats, subtitles
    except ExtractorError as ee:
        if not (isinstance(ee.cause, HTTPError) and ee.cause.status == 404):
            raise

    # fallback to legacy playlist
    return self._process_legacy_playlist(playlist_id)
476
477 def _process_legacy_playlist_url(self, url, display_id):
478 playlist = self._download_legacy_playlist_url(url, display_id)
479 return self._extract_from_legacy_playlist(playlist, display_id)
480
481 def _process_legacy_playlist(self, playlist_id):
482 return self._process_legacy_playlist_url(
483 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
484
485 def _download_legacy_playlist_url(self, url, playlist_id=None):
486 return self._download_xml(
487 url, playlist_id, 'Downloading legacy playlist XML')
ae6986fb 488
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme data from a parsed legacy (EMP) playlist.

    For each ``programme`` / ``radioProgramme`` item, the programme id is
    looked up in the item's ``identifier`` / ``group`` attributes and then
    in those of its ``mediator`` child. With an id, media is fetched via
    the media selector. Without one, the item itself is processed as a
    media selection and *playlist_id* is used as the id.

    Returns:
        A ``(programme_id, title, description, duration, formats,
        subtitles)`` tuple for the last programme item in the playlist.

    Raises:
        ExtractorError: if the playlist has a ``noItems`` element (with a
            message derived from its ``reason``), or if it contains no
            programme item at all.
    """
    ns = self._EMP_PLAYLIST_NS

    no_items = playlist.find('./{%s}noItems' % ns)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First attribute whose value looks like a programme id, else None
        for p in ('identifier', 'group'):
            value = element.get(p)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value
        return None

    def get_programme_id(item):
        # The item's own attributes take precedence. Previously the result
        # of this lookup was computed and then discarded.
        own_id = get_from_attributes(item)
        if own_id:
            return own_id
        mediator = item.find('./{%s}mediator' % ns)
        if mediator is not None:
            return get_from_attributes(mediator)
        return None

    result = None
    for item in self._extract_items(playlist):
        kind = item.get('kind')
        if kind not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % ns).text
        description_el = playlist.find('./{%s}summary' % ns)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

        result = programme_id, title, description, duration, formats, subtitles

    if result is None:
        # Without this guard the return below would hit unbound locals
        raise ExtractorError(
            'No programme found in legacy playlist %s' % playlist_id,
            expected=True)
    return result
532
c056efa2
S
def _real_extract(self, url):
    """Extract a single programme from a bbc.co.uk page.

    The video page is searched for an on-page error message, then for a
    programme id (vpid): first in the ``mediator.bind(...)`` player JSON,
    then via a plain ``"vpid"`` regex. With a vpid, media comes from the
    media selector and the metadata from the page. Without one, everything
    comes from the playlist for the id matched in the URL.

    Raises:
        ExtractorError: if the page displays an error message.
    """
    group_id = self._match_id(url)
    webpage = self._download_webpage(url, group_id, 'Downloading video page')

    page_error = self._search_regex(
        r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
        webpage, 'error', default=None)
    if page_error:
        raise ExtractorError(page_error, expected=True)

    programme_id, duration = None, None

    player_json = self._search_regex(
        r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
        webpage, 'player', default=None)
    if player_json:
        player = self._parse_json(player_json, group_id).get('player', {})
        duration = int_or_none(player.get('duration'))
        programme_id = player.get('vpid')

    # Fall back to any "vpid" mentioned in the page
    programme_id = programme_id or self._search_regex(
        r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)

    if not programme_id:
        # No vpid on the page: resolve everything through the playlist
        programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
    else:
        formats, subtitles = self._download_media_selector(programme_id)
        title = self._og_search_title(webpage, default=None)
        if not title:
            title = self._html_search_regex(
                (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
                 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
        description = self._search_regex(
            (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
             r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
            webpage, 'description', default=None) or self._html_search_meta('description', webpage)

    return {
        'id': programme_id,
        'title': title,
        'description': description,
        'thumbnail': self._og_search_thumbnail(webpage, default=None),
        'duration': duration,
        'formats': formats,
        'subtitles': subtitles,
    }
10273d6e 583
584
6368e2e6 585class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
9afa1770
S
586 IE_NAME = 'bbc'
587 IE_DESC = 'BBC'
ed13a772 588 _VALID_URL = r'''(?x)
589 https?://(?:www\.)?(?:
590 bbc\.(?:com|co\.uk)|
591 bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd\.onion|
592 bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad\.onion
593 )/(?:[^/]+/)+(?P<id>[^/#?]+)'''
10273d6e 594
29f7c58a 595 _MEDIA_SETS = [
29f7c58a 596 'pc',
2d997542 597 'mobile-tablet-main',
d12a1a47 598 ]
10273d6e 599
600 _TESTS = [{
6a747190 601 # article with multiple videos embedded with data-playable containing vpids
10273d6e 602 'url': 'http://www.bbc.com/news/world-europe-32668511',
603 'info_dict': {
604 'id': 'world-europe-32668511',
7975ddf2 605 'title': 'Russia stages massive WW2 parade despite Western boycott',
9afa1770 606 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 607 },
608 'playlist_count': 2,
a3bfddfa 609 }, {
6a747190 610 # article with multiple videos embedded with data-playable (more videos)
10273d6e 611 'url': 'http://www.bbc.com/news/business-28299555',
612 'info_dict': {
613 'id': 'business-28299555',
614 'title': 'Farnborough Airshow: Video highlights',
9afa1770 615 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 616 },
617 'playlist_count': 9,
9afa1770 618 'skip': 'Save time',
88ed52ae
S
619 }, {
620 # article with multiple videos embedded with `new SMP()`
6a747190 621 # broken
88ed52ae
S
622 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
623 'info_dict': {
624 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 625 'title': 'BUGGER',
7975ddf2 626 'description': r're:BUGGER The recent revelations by the whistleblower Edward Snowden were fascinating. .{211}\.{3}$',
88ed52ae
S
627 },
628 'playlist_count': 18,
a3bfddfa 629 }, {
6a747190 630 # single video embedded with data-playable containing vpid
10273d6e 631 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 632 'info_dict': {
633 'id': 'p02mprgb',
55ebae26 634 'ext': 'mp4',
7975ddf2 635 'title': 'Germanwings crash site aerial video',
636 'description': r're:(?s)Aerial video showed the site where the Germanwings flight 4U 9525, .{156} BFM TV\.$',
10273d6e 637 'duration': 47,
9afa1770 638 'timestamp': 1427219242,
da92eeae 639 'upload_date': '20150324',
7975ddf2 640 'thumbnail': 'https://ichef.bbci.co.uk/news/1024/media/images/81879000/jpg/_81879090_81879089.jpg',
10273d6e 641 },
642 'params': {
643 'skip_download': True,
644 }
a3bfddfa 645 }, {
6a747190
S
646 # article with single video embedded with data-playable containing XML playlist
647 # with direct video links as progressiveDownloadUrl (for now these are extracted)
648 # and playlist with f4m and m3u8 as streamingUrl
de939d89 649 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 650 'info_dict': {
9afa1770 651 'id': '150615_telabyad_kentin_cogu',
de939d89 652 'ext': 'mp4',
ad152e2d 653 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 654 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 655 'timestamp': 1434397334,
da92eeae 656 'upload_date': '20150615',
de939d89 657 },
658 'params': {
659 'skip_download': True,
7975ddf2 660 },
661 'skip': 'now SIMORGH_DATA with no video',
c936d8cc 662 }, {
6a747190 663 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 664 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 665 'info_dict': {
7975ddf2 666 'id': '39275083',
667 'display_id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 668 'ext': 'mp4',
9afa1770 669 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
7975ddf2 670 'description': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
9afa1770 671 'timestamp': 1434713142,
da92eeae 672 'upload_date': '20150619',
7975ddf2 673 'thumbnail': 'https://a.files.bbci.co.uk/worldservice/live/assets/images/2015/06/19/150619132146_honduras_hsopitales_militares_640x360_aptn_nocredit.jpg',
de939d89 674 },
675 'params': {
676 'skip_download': True,
7975ddf2 677 },
a346b1ff
S
678 }, {
679 # single video from video playlist embedded with vxp-playlist-data JSON
680 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
681 'info_dict': {
682 'id': 'p02w6qjc',
55ebae26 683 'ext': 'mp4',
a346b1ff
S
684 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
685 'duration': 56,
0bc4ee60 686 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
687 },
688 'params': {
689 'skip_download': True,
7975ddf2 690 },
691 'skip': '404 Not Found',
9afa1770 692 }, {
7975ddf2 693 # single video story with __PWA_PRELOADED_STATE__
9afa1770
S
694 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
695 'info_dict': {
696 'id': 'p02q6gc4',
7975ddf2 697 'ext': 'mp4',
698 'title': 'Tasting the spice of life in Jaffna',
699 'description': r're:(?s)BBC Travel Show’s Henry Golding explores the city of Jaffna .{151} aftertaste\.$',
700 'timestamp': 1646058397,
701 'upload_date': '20220228',
702 'duration': 255,
703 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1920xn/p02vxvkn.jpg',
9afa1770 704 },
9afa1770
S
705 }, {
706 # single video story without digitalData
707 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
708 'info_dict': {
709 'id': 'p018zqqg',
55ebae26 710 'ext': 'mp4',
9afa1770
S
711 'title': 'Hyundai Santa Fe Sport: Rock star',
712 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
713 'timestamp': 1415867444,
714 'upload_date': '20141113',
9afa1770 715 },
7975ddf2 716 'skip': 'redirects to TopGear home page',
9fb64c04
S
717 }, {
718 # single video embedded with Morph
7975ddf2 719 # TODO: replacement test page
9fb64c04
S
720 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
721 'info_dict': {
722 'id': 'p041vhd0',
723 'ext': 'mp4',
724 'title': "Nigeria v Japan - Men's First Round",
725 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
726 'duration': 7980,
727 'uploader': 'BBC Sport',
728 'uploader_id': 'bbc_sport',
729 },
7975ddf2 730 'skip': 'Video no longer in page',
9afa1770 731 }, {
7975ddf2 732 # single video in __INITIAL_DATA__
9afa1770
S
733 'url': 'http://www.bbc.com/sport/0/football/33653409',
734 'info_dict': {
735 'id': 'p02xycnp',
55ebae26 736 'ext': 'mp4',
7975ddf2 737 'title': 'Ronaldo to Man Utd, Arsenal to spend?',
738 'description': r're:(?s)BBC Sport\'s David Ornstein rounds up the latest transfer reports, .{359} here\.$',
739 'timestamp': 1437750175,
740 'upload_date': '20150724',
741 'thumbnail': r're:https?://.+/.+media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png',
9afa1770
S
742 'duration': 140,
743 },
b5d48cb1 744 }, {
7975ddf2 745 # article with multiple videos embedded with Morph.setPayload
b5d48cb1
S
746 'url': 'http://www.bbc.com/sport/0/football/34475836',
747 'info_dict': {
748 'id': '34475836',
450b233c 749 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 750 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
751 },
752 'playlist_count': 3,
7975ddf2 753 }, {
754 # Testing noplaylist
755 'url': 'http://www.bbc.com/sport/0/football/34475836',
756 'info_dict': {
757 'id': 'p034ppnv',
758 'ext': 'mp4',
759 'title': 'All you need to know about Jurgen Klopp',
760 'timestamp': 1444335081,
761 'upload_date': '20151008',
762 'duration': 122.0,
763 'thumbnail': 'https://ichef.bbci.co.uk/onesport/cps/976/cpsprodpb/7542/production/_85981003_klopp.jpg',
764 },
765 'params': {
766 'noplaylist': True,
767 },
450b233c
S
768 }, {
769 # school report article with single video
770 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
771 'info_dict': {
772 'id': '35744779',
773 'title': 'School which breaks down barriers in Jerusalem',
774 },
775 'playlist_count': 1,
7975ddf2 776 'skip': 'redirects to Young Reporter home page https://www.bbc.co.uk/news/topics/cg41ylwv43pt',
9afa1770
S
777 }, {
778 # single video with playlist URL from weather section
779 'url': 'http://www.bbc.com/weather/features/33601775',
780 'only_matching': True,
781 }, {
782 # custom redirection to www.bbc.com
1bdae7d3 783 # also, video with window.__INITIAL_DATA__
9afa1770 784 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
1bdae7d3 785 'info_dict': {
786 'id': 'p02xzws1',
787 'ext': 'mp4',
788 'title': "Pluto may have 'nitrogen glaciers'",
789 'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
790 'thumbnail': r're:https?://.+/.+\.jpg',
791 'timestamp': 1437785037,
792 'upload_date': '20150725',
7975ddf2 793 'duration': 105,
1bdae7d3 794 },
50e93e03 795 }, {
796 # video with window.__INITIAL_DATA__ and value as JSON string
797 'url': 'https://www.bbc.com/news/av/world-europe-59468682',
798 'info_dict': {
7975ddf2 799 'id': 'p0b779gc',
50e93e03 800 'ext': 'mp4',
801 'title': 'Why France is making this woman a national hero',
7975ddf2 802 'description': r're:(?s)France is honouring the US-born 20th Century singer and activist Josephine .{208} Second World War.',
50e93e03 803 'thumbnail': r're:https?://.+/.+\.jpg',
7975ddf2 804 'timestamp': 1638215626,
805 'upload_date': '20211129',
806 'duration': 125,
807 },
808 }, {
809 # video with script id __NEXT_DATA__ and value as JSON string
810 'url': 'https://www.bbc.com/news/uk-68546268',
811 'info_dict': {
812 'id': 'p0hj0lq7',
813 'ext': 'mp4',
814 'title': 'Nasser Hospital doctor describes his treatment by IDF',
815 'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276} hostages\."$',
816 'thumbnail': r're:https?://.+/.+\.jpg',
817 'timestamp': 1710188248,
818 'upload_date': '20240311',
819 'duration': 104,
50e93e03 820 },
a1cf3e38
S
821 }, {
822 # single video article embedded with data-media-vpid
823 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
824 'only_matching': True,
6d155707 825 }, {
50e93e03 826 # bbcthreeConfig
6d155707
S
827 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
828 'info_dict': {
829 'id': 'p06556y7',
830 'ext': 'mp4',
50e93e03 831 'title': 'Things Not To Say to people that live on council estates',
832 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
833 'duration': 360,
834 'thumbnail': r're:https?://.+/.+\.jpg',
6d155707 835 },
b96b4be4
RA
836 }, {
837 # window.__PRELOADED_STATE__
838 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
839 'info_dict': {
840 'id': 'b0b9z4vz',
841 'ext': 'mp4',
842 'title': 'Prom 6: An American in Paris and Turangalila',
843 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
844 'uploader': 'Radio 3',
845 'uploader_id': 'bbc_radio_three',
846 },
7975ddf2 847 'skip': '404 Not Found',
373941c5
S
848 }, {
849 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
850 'info_dict': {
851 'id': 'p06w9tws',
852 'ext': 'mp4',
853 'title': 'md5:2fabf12a726603193a2879a055f72514',
854 'description': 'Learn English words and phrases from this story',
7975ddf2 855 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1200x675/p06pq9gk.jpg',
373941c5
S
856 },
857 'add_ie': [BBCCoUkIE.ie_key()],
3721515b 858 }, {
859 # BBC Reel
860 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
861 'info_dict': {
862 'id': 'p07c6sb9',
863 'ext': 'mp4',
7975ddf2 864 'title': 'The downsides of positive thinking',
865 'description': 'The downsides of positive thinking',
3721515b 866 'duration': 235,
7975ddf2 867 'thumbnail': r're:https?://.+/p07c9dsr\.(?:jpg|webp|png)',
868 'upload_date': '20220223',
869 'timestamp': 1645632746,
3721515b 870 },
eda0e415 871 }, {
872 # BBC Sounds
7975ddf2 873 'url': 'https://www.bbc.co.uk/sounds/play/w3ct5rgx',
eda0e415 874 'info_dict': {
7975ddf2 875 'id': 'p0hrw4nr',
eda0e415 876 'ext': 'mp4',
7975ddf2 877 'title': 'Are our coastlines being washed away?',
878 'description': r're:(?s)Around the world, coastlines are constantly changing .{2000,} Images\)$',
879 'timestamp': 1713556800,
880 'upload_date': '20240419',
881 'duration': 1588,
882 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0hrnxbl.jpg',
883 'uploader': 'World Service',
884 'uploader_id': 'bbc_world_service',
885 'series': 'CrowdScience',
886 'chapters': [],
887 }
ed13a772 888 }, { # onion routes
889 'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
890 'only_matching': True,
891 }, {
892 'url': 'https://www.bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad.onion/sport/av/football/63195681',
893 'only_matching': True,
10273d6e 894 }]
895
9afa1770
S
@classmethod
def suitable(cls, url):
    """Return whether this extractor should handle *url*.

    A URL that any of the more specific BBC extractors listed below
    accepts is rejected here; otherwise the decision is delegated to the
    parent class's ``suitable``.
    """
    more_specific_extractors = (
        BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE,
        BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
    for extractor in more_specific_extractors:
        if extractor.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
901
902 def _extract_from_media_meta(self, media_meta, video_id):
903 # Direct links to media in media metadata (e.g.
904 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
905 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
906 source_files = media_meta.get('sourceFiles')
907 if source_files:
908 return [{
909 'url': f['url'],
910 'format_id': format_id,
911 'ext': f.get('encoding'),
912 'tbr': float_or_none(f.get('bitrate'), 1000),
913 'filesize': int_or_none(f.get('filesize')),
914 } for format_id, f in source_files.items() if f.get('url')], []
915
916 programme_id = media_meta.get('externalId')
917 if programme_id:
918 return self._download_media_selector(programme_id)
919
920 # Process playlist.sxml as legacy playlist
921 href = media_meta.get('href')
922 if href:
923 playlist = self._download_legacy_playlist_url(href)
924 _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
925 return formats, subtitles
926
927 return [], []
928
baf39a1a
S
929 def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
930 programme_id, title, description, duration, formats, subtitles = \
931 self._process_legacy_playlist_url(url, playlist_id)
baf39a1a
S
932 return {
933 'id': programme_id,
934 'title': title,
935 'description': description,
936 'duration': duration,
937 'timestamp': timestamp,
938 'formats': formats,
939 'subtitles': subtitles,
940 }
941
def _real_extract(self, url):
    """Extract the video(s) embedded in a BBC page.

    The page is downloaded once and a series of embed styles is probed in
    order; the first one that yields media wins. The order is:

    1. playlist.sxml references and ``data-playable`` attributes
    2. ``data-pid`` video group (delegated to BBCCoUkIE)
    3. single vpid in ``data-*-vpid`` / ``externalIdentifier`` / ``videoId``
    4. BBC Reel ``initial-data`` script
    5. ``Morph.setPayload`` (lead media or article body videos)
    6. ``__PRELOADED_STATE__`` / ``__PWA_PRELOADED_STATE__``
    7. ``bbcthreeConfig``
    8. ``__INITIAL_DATA__``
    9. ``SIMORGH_DATA``
    10. ``__NEXT_DATA__``
    11. SMP / ``setPlaylist`` embed URLs (delegated to BBCCoUkIE)
    12. ``data-media-meta``, ``mediaAssetPage.init`` or ``vxp-playlist-data``

    Returns either a single-video info dict, a ``url_result`` or a
    ``playlist_result``, depending on which branch matched.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title') or re.sub(
        r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
    if data_playables:
        for _, data_playable_json in data_playables:
            data_playable = self._parse_json(
                unescapeHTML(data_playable_json), playlist_id, fatal=False)
            if not data_playable:
                continue
            settings = data_playable.get('settings', {})
            if settings:
                # data-playable with video vpid in settings.playlistObject.items (e.g.
                # http://www.bbc.com/news/world-us-canada-34473351)
                playlist_object = settings.get('playlistObject', {})
                if playlist_object:
                    items = playlist_object.get('items')
                    if items and isinstance(items, list):
                        title = playlist_object['title']
                        description = playlist_object.get('summary')
                        duration = int_or_none(items[0].get('duration'))
                        programme_id = items[0].get('vpid')
                        formats, subtitles = self._download_media_selector(programme_id)
                        entries.append({
                            'id': programme_id,
                            'title': title,
                            'description': description,
                            'timestamp': timestamp,
                            'duration': duration,
                            'formats': formats,
                            'subtitles': subtitles,
                        })
            else:
                # data-playable without vpid but with playlist.sxml URLs
                # in otherSettings.playlist (e.g.
                # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
                playlist = data_playable.get('otherSettings', {}).get('playlist', {})
                if playlist:
                    entry = None
                    for key in ('streaming', 'progressiveDownload'):
                        playlist_url = playlist.get('%sUrl' % key)
                        if not playlist_url:
                            continue
                        try:
                            info = self._extract_from_playlist_sxml(
                                playlist_url, playlist_id, timestamp)
                            if not entry:
                                entry = info
                            else:
                                entry['title'] = info['title']
                                entry['formats'].extend(info['formats'])
                        except ExtractorError as e:
                            # Some playlist URL may fail with 500, at the same time
                            # the other one may work fine (e.g.
                            # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                            if isinstance(e.cause, HTTPError) and e.cause.status == 500:
                                continue
                            raise
                    if entry:
                        entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
    group_id = self._search_regex(
        r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
        webpage, 'group id', default=None)
    if group_id:
        return self.url_result(
            f'https://www.bbc.co.uk/programmes/{group_id}', BBCCoUkIE)

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # A non-fatal parse failure yields None, so fall back to an empty
        # dict instead of crashing on the .get() below
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = digital_data.get('page', {}).get('pageInfo', {})
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
    initial_data = self._parse_json(self._html_search_regex(
        r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
        webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
    if initial_data:
        init_data = try_get(
            initial_data, lambda x: x['initData']['items'][0], dict) or {}
        smp_data = init_data.get('smpData') or {}
        clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
        version_id = clip_data.get('versionID')
        if version_id:
            title = smp_data['title']
            formats, subtitles = self._download_media_selector(version_id)
            image_url = smp_data.get('holdingImageURL')
            display_date = init_data.get('displayDate')
            topic_title = init_data.get('topicTitle')

            return {
                'id': version_id,
                'title': title,
                'formats': formats,
                'alt_title': init_data.get('shortTitle'),
                'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
                'description': smp_data.get('summary') or init_data.get('shortSummary'),
                'upload_date': display_date.replace('-', '') if display_date else None,
                'subtitles': subtitles,
                'duration': int_or_none(clip_data.get('duration')),
                'categories': [topic_title] if topic_title else None,
            }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # Several setPayload calls may be present but the video(s)
    # should be in one that mentions leadMedia or videoData
    morph_payload = self._search_json(
        r'\bMorph\s*\.\s*setPayload\s*\([^,]+,', webpage, 'morph payload', playlist_id,
        contains_pattern=r'{(?s:(?:(?!</script>).)+(?:"leadMedia"|\\"videoData\\")\s*:.+)}',
        default={})
    if morph_payload:
        for lead_media in traverse_obj(morph_payload, (
                'body', 'components', ..., 'props', 'leadMedia', {dict})):
            programme_id = traverse_obj(lead_media, ('identifiers', ('vpid', 'playablePid'), {str}, any))
            if not programme_id:
                continue
            formats, subtitles = self._download_media_selector(programme_id)
            return {
                'id': programme_id,
                'title': lead_media.get('title') or self._og_search_title(webpage),
                **traverse_obj(lead_media, {
                    'description': ('summary', {str}),
                    # `any` collapses the branched lookup to the first
                    # parseable value; without it the result is a list
                    'duration': ('duration', (
                        'rawDuration', 'formattedDuration', 'spokenDuration'), {parse_duration}, any),
                    'uploader': ('masterBrand', {str}),
                    'uploader_id': ('mid', {str}),
                }),
                'formats': formats,
                'subtitles': subtitles,
            }
        # The article body is only parsed when present: passing None to
        # _parse_json raises TypeError, which fatal=False does not cover
        article_body = traverse_obj(morph_payload, ('body', 'content', 'article', 'body'))
        body = self._parse_json(article_body, playlist_id, fatal=False) if article_body else None
        for video_data in traverse_obj(body, (lambda _, v: v['videoData']['pid'], 'videoData')):
            if video_data.get('vpid'):
                video_id = video_data['vpid']
                formats, subtitles = self._download_media_selector(video_id)
                entry = {
                    'id': video_id,
                    'formats': formats,
                    'subtitles': subtitles,
                }
            else:
                video_id = video_data['pid']
                entry = self.url_result(
                    f'https://www.bbc.co.uk/programmes/{video_id}', BBCCoUkIE,
                    video_id, url_transparent=True)
            entry.update({
                'timestamp': traverse_obj(morph_payload, (
                    'body', 'content', 'article', 'dateTimeInfo', 'dateTime', {parse_iso8601})
                ),
                **traverse_obj(video_data, {
                    'thumbnail': (('iChefImage', 'image'), {url_or_none}, any),
                    'title': (('title', 'caption'), {str}, any),
                    'duration': ('duration', {parse_duration}),
                }),
            })
            if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id):
                return entry
            entries.append(entry)
        if entries:
            playlist_title = traverse_obj(morph_payload, (
                'body', 'content', 'article', 'headline', {str})) or playlist_title
            return self.playlist_result(
                entries, playlist_id, playlist_title, playlist_description)

    # various PRELOADED_STATE JSON
    preload_state = self._search_json(
        r'window\.__(?:PWA_)?PRELOADED_STATE__\s*=', webpage,
        'preload state', playlist_id, transform_source=js_to_json, default={})
    # PRELOADED_STATE with current programme
    current_programme = traverse_obj(preload_state, ('programmes', 'current', {dict}))
    programme_id = traverse_obj(current_programme, ('id', {str}))
    if programme_id and current_programme.get('type') == 'playable_item':
        title = traverse_obj(current_programme, ('titles', ('tertiary', 'secondary'), {str}, any)) or playlist_title
        formats, subtitles = self._download_media_selector(programme_id)
        return {
            'id': programme_id,
            'title': title,
            'formats': formats,
            **traverse_obj(current_programme, {
                'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
                'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}),
                'duration': ('duration', 'value', {int_or_none}),
                'uploader': ('network', 'short_title', {str}),
                'uploader_id': ('network', 'id', {str}),
                'timestamp': ((('availability', 'from'), ('release', 'date')), {parse_iso8601}, any),
                'series': ('titles', 'primary', {str}),
            }),
            'subtitles': subtitles,
            'chapters': traverse_obj(preload_state, (
                'tracklist', 'tracks', lambda _, v: float(v['offset']['start']), {
                    'title': ('titles', {lambda x: join_nonempty(
                        'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
                    'start_time': ('offset', 'start', {float_or_none}),
                    'end_time': ('offset', 'end', {float_or_none}),
                })
            ),
        }

    # PWA_PRELOADED_STATE with article video asset
    asset_id = traverse_obj(preload_state, (
        'entities', 'articles', lambda k, _: k.rsplit('/', 1)[-1] == playlist_id,
        'assetVideo', 0, {str}, any))
    if asset_id:
        video_id = traverse_obj(preload_state, ('entities', 'videos', asset_id, 'vpid', {str}))
        if video_id:
            article = traverse_obj(preload_state, (
                'entities', 'articles', lambda _, v: v['assetVideo'][0] == asset_id, any))

            def image_url(image_id):
                return traverse_obj(preload_state, (
                    'entities', 'images', image_id, 'url',
                    {lambda u: url_or_none(u.replace('$recipe', 'raw'))}))

            formats, subtitles = self._download_media_selector(video_id)
            return {
                'id': video_id,
                **traverse_obj(preload_state, ('entities', 'videos', asset_id, {
                    'title': ('title', {str}),
                    'description': (('synopsisLong', 'synopsisMedium', 'synopsisShort'), {str}, any),
                    'thumbnail': (0, {image_url}),
                    'duration': ('duration', {int_or_none}),
                })),
                'formats': formats,
                'subtitles': subtitles,
                'timestamp': traverse_obj(article, ('displayDate', {parse_iso8601})),
            }
        else:
            return self.url_result(
                f'https://www.bbc.co.uk/programmes/{asset_id}', BBCCoUkIE,
                asset_id, playlist_title, display_id=playlist_id,
                description=playlist_description)

    bbc3_config = self._parse_json(
        self._search_regex(
            r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
            'bbcthree config', default='{}'),
        playlist_id, transform_source=js_to_json, fatal=False) or {}
    payload = bbc3_config.get('payload') or {}
    if payload:
        clip = payload.get('currentClip') or {}
        clip_vpid = clip.get('vpid')
        clip_title = clip.get('title')
        if clip_vpid and clip_title:
            formats, subtitles = self._download_media_selector(clip_vpid)
            return {
                'id': clip_vpid,
                'title': clip_title,
                'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
                'description': clip.get('description'),
                'duration': parse_duration(clip.get('duration')),
                'formats': formats,
                'subtitles': subtitles,
            }
        bbc3_playlist = try_get(
            payload, lambda x: x['content']['bbcMedia']['playlist'],
            dict)
        if bbc3_playlist:
            playlist_title = bbc3_playlist.get('title') or playlist_title
            thumbnail = bbc3_playlist.get('holdingImageURL')
            entries = []
            for bbc3_item in bbc3_playlist['items']:
                programme_id = bbc3_item.get('versionID')
                if not programme_id:
                    continue
                formats, subtitles = self._download_media_selector(programme_id)
                entries.append({
                    'id': programme_id,
                    'title': playlist_title,
                    'thumbnail': thumbnail,
                    'timestamp': timestamp,
                    'formats': formats,
                    'subtitles': subtitles,
                })
            return self.playlist_result(
                entries, playlist_id, playlist_title, playlist_description)

    def parse_model(model):
        """Extract single video from model structure"""
        item_id = traverse_obj(model, ('versions', 0, 'versionId', {str}))
        if not item_id:
            return
        formats, subtitles = self._download_media_selector(item_id)
        return {
            'id': item_id,
            'formats': formats,
            'subtitles': subtitles,
            **traverse_obj(model, {
                'title': ('title', {str}),
                'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
                'description': ('synopses', ('long', 'medium', 'short'), {str}, {lambda x: x or None}, any),
                'duration': ('versions', 0, 'duration', {int}),
                'timestamp': ('versions', 0, 'availableFrom', {functools.partial(int_or_none, scale=1000)}),
            })
        }

    def is_type(*types):
        """Build a traverse_obj filter matching items whose 'type' is in *types*"""
        return lambda _, v: v['type'] in types

    initial_data = self._search_regex(
        r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
        'quoted preload state', default=None)
    if initial_data is None:
        initial_data = self._search_regex(
            r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
            'preload state', default='{}')
    else:
        # The value is a JSON string that itself contains JSON: unwrap it first
        initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
    # The unwrapping above yields None on failure; do not feed that to the
    # parser (it would raise TypeError rather than fail softly)
    initial_data = self._parse_json(initial_data, playlist_id, fatal=False) if initial_data else None
    if initial_data:
        for video_data in traverse_obj(initial_data, (
                'stores', 'article', 'articleBodyContent', is_type('video'))):
            model = traverse_obj(video_data, (
                'model', 'blocks', is_type('aresMedia'),
                'model', 'blocks', is_type('aresMediaMetadata'),
                'model', {dict}, any))
            entry = parse_model(model)
            if entry:
                entries.append(entry)
        if entries:
            return self.playlist_result(
                entries, playlist_id, playlist_title, playlist_description)

        def parse_media(media):
            """Append an entry to `entries` for each playable item in *media*"""
            if not media:
                return
            for item in (try_get(media, lambda x: x['media']['items'], list) or []):
                item_id = item.get('id')
                item_title = item.get('title')
                if not (item_id and item_title):
                    continue
                formats, subtitles = self._download_media_selector(item_id)
                item_desc = None
                blocks = try_get(media, lambda x: x['summary']['blocks'], list)
                if blocks:
                    summary = []
                    for block in blocks:
                        text = try_get(block, lambda x: x['model']['text'], compat_str)
                        if text:
                            summary.append(text)
                    if summary:
                        item_desc = '\n\n'.join(summary)
                item_time = None
                for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
                    if try_get(meta, lambda x: x['label']) == 'Published':
                        item_time = unified_timestamp(meta.get('timestamp'))
                        break
                entries.append({
                    'id': item_id,
                    'title': item_title,
                    'thumbnail': item.get('holdingImageUrl'),
                    'formats': formats,
                    'subtitles': subtitles,
                    'timestamp': item_time,
                    'description': strip_or_none(item_desc),
                    'duration': int_or_none(item.get('duration')),
                })

        for resp in traverse_obj(initial_data, ('data', lambda _, v: v['name'])):
            name = resp['name']
            if name == 'media-experience':
                parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
            elif name == 'article':
                for block in traverse_obj(resp, (
                        'data', (None, ('content', 'model')), 'blocks',
                        is_type('media', 'video'), 'model', {dict})):
                    parse_media(block)
        return self.playlist_result(
            entries, playlist_id, playlist_title, playlist_description)

    # extract from SIMORGH_DATA hydration JSON
    simorgh_data = self._search_json(
        r'window\s*\.\s*SIMORGH_DATA\s*=', webpage,
        'simorgh data', playlist_id, default={})
    if simorgh_data:
        done = False
        for video_data in traverse_obj(simorgh_data, (
                'pageData', 'content', 'model', 'blocks', is_type('video', 'legacyMedia'))):
            model = traverse_obj(video_data, (
                'model', 'blocks', is_type('aresMedia'),
                'model', 'blocks', is_type('aresMediaMetadata'),
                'model', {dict}, any))
            if video_data['type'] == 'video':
                entry = parse_model(model)
            else:  # legacyMedia: no duration, subtitles
                block_id, entry = traverse_obj(model, ('blockId', {str})), None
                media_data = traverse_obj(simorgh_data, (
                    'pageData', 'promo', 'media',
                    {lambda x: x if x['id'] == block_id else None}))
                formats = traverse_obj(media_data, ('playlist', lambda _, v: url_or_none(v['url']), {
                    'url': ('url', {url_or_none}),
                    'ext': ('format', {str}),
                    'tbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}),
                }))
                if formats:
                    entry = {
                        'id': block_id,
                        'display_id': playlist_id,
                        'formats': formats,
                        'description': traverse_obj(simorgh_data, ('pageData', 'promo', 'summary', {str})),
                        **traverse_obj(model, {
                            'title': ('title', {str}),
                            'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
                            'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
                            'timestamp': ('firstPublished', {functools.partial(int_or_none, scale=1000)}),
                        }),
                    }
                    done = True
            if entry:
                entries.append(entry)
            if done:
                break
        if entries:
            return self.playlist_result(
                entries, playlist_id, playlist_title, playlist_description)

    def extract_all(pattern):
        """Parse every *pattern* match in the webpage as JSON, dropping failures"""
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # US accessed article with single embedded video (e.g.
    # https://www.bbc.com/news/uk-68546268)
    next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}),
                             ('props', 'pageProps', 'page'))
    model = traverse_obj(next_data, (
        ..., 'contents', is_type('video'),
        'model', 'blocks', is_type('media'),
        'model', 'blocks', is_type('mediaMetadata'),
        'model', {dict}, any))
    if model and (entry := parse_model(model)):
        if not entry.get('timestamp'):
            entry['timestamp'] = traverse_obj(next_data, (
                ..., 'contents', is_type('timestamp'), 'model',
                'timestamp', {functools.partial(int_or_none, scale=1000)}, any))
        entries.append(entry)
        return self.playlist_result(
            entries, playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # Non-fatal parse may yield None; treat that as "no videos"
            media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in media_asset_page.get('videos', {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats and not self.get_param('ignore_no_formats'):
            continue

        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1526
1527
class BBCCoUkArticleIE(InfoExtractor):
    """Extractor for bbc.co.uk/programmes/articles/ pages.

    Each ``typeof="Clip"`` element found on the article page becomes one
    playlist entry that is resolved by whichever extractor matches its URL.
    """
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist of the clips referenced by the article at *url*."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # The og:description lookup is non-fatal and may yield None, so it
        # must not be stripped unconditionally
        description = strip_or_none(self._og_search_description(webpage))

        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1556
1557
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared logic for paginated bbc.co.uk playlist pages.

    The code below reads ``_VIDEO_ID_TEMPLATE`` and ``_URL_TEMPLATE`` and
    calls ``_extract_title_and_description``; none of them is defined here.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a url_result for every video id found, following pagination.

        When the URL itself carries a ``page`` query parameter only that one
        page is processed.
        """
        query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        single_page = 'page' in query
        # The first page is already downloaded, so numbering of the
        # follow-up requests starts at 2
        page_num = 2
        while True:
            video_ids = re.findall(
                self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage)
            for video_id in video_ids:
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                return
            next_url = compat_urlparse.urljoin(url, next_page)
            webpage = self._download_webpage(
                next_url, playlist_id,
                'Downloading page %d' % page_num, page_num)
            page_num += 1

    def _real_extract(self, url):
        """Return a playlist built lazily from the page at *url*."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        title, description = self._extract_title_and_description(webpage)
        entries = self._entries(webpage, url, playlist_id)
        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1588
1589
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
    """Shared logic for iPlayer playlists that are backed by a JSON API.

    Subclasses are expected to provide ``_PAGE_SIZE``, ``_DESCRIPTION_KEY``
    and the hooks ``_call_api``, ``_get_elements``, ``_get_episode``,
    ``_get_episode_image``, ``_get_episode_field``, ``_get_playlist_data``
    and ``_get_playlist_title``; this class only wires them together.
    """

    # "%%s" survives the first formatting pass as "%s", so subclasses can fill
    # in the path segment that follows "/iplayer/".
    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    @staticmethod
    def _get_default(episode, key, default_key='default'):
        """Return ``episode[key][default_key]``, or None if that lookup fails."""
        return try_get(episode, lambda x: x[key][default_key])

    def _get_description(self, data):
        """Return the first available of the large/medium/small synopsis texts."""
        synopsis = data.get(self._DESCRIPTION_KEY) or {}
        return dict_get(synopsis, ('large', 'medium', 'small'))

    def _fetch_page(self, programme_id, per_page, series_id, page):
        """Yield url-type entries for one zero-based ``page`` of the playlist.

        The API is called with ``page + 1``.  Elements whose episode has no
        ``id`` are skipped.
        """
        elements = self._get_elements(self._call_api(
            programme_id, per_page, page + 1, series_id))
        for element in elements:
            episode = self._get_episode(element)
            episode_id = episode.get('id')
            if not episode_id:
                continue
            thumbnail = None
            image = self._get_episode_image(episode)
            if image:
                # Image URLs carry a "{recipe}" placeholder; "raw" is
                # substituted for it.
                thumbnail = image.replace('{recipe}', 'raw')
            category = self._get_default(episode, 'labels', 'category')
            yield {
                '_type': 'url',
                'id': episode_id,
                'title': self._get_episode_field(episode, 'subtitle'),
                'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                'thumbnail': thumbnail,
                'description': self._get_description(episode),
                'categories': [category] if category else None,
                'series': self._get_episode_field(episode, 'title'),
                'ie_key': BBCCoUkIE.ie_key(),
            }

    def _real_extract(self, url):
        """Build the playlist, honouring optional ``seriesId``/``page`` query args.

        With an explicit ``page`` only that page is returned; otherwise all
        pages are exposed lazily through ``OnDemandPagedList``.
        """
        pid = self._match_id(url)
        qs = parse_qs(url)
        series_id = qs.get('seriesId', [None])[0]
        # The page number comes straight from the URL.  A non-numeric value
        # used to blow up in int(); treat it as "no page requested" instead.
        page = int_or_none(qs.get('page', [None])[0])
        explicit_page = page is not None
        # NOTE(review): 36 is presumably the page size of the website's own
        # paginated listing -- confirm.
        per_page = 36 if explicit_page else self._PAGE_SIZE
        fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id)
        if explicit_page:
            entries = fetch_page(page - 1)
        else:
            entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)
        # Only playlist-level metadata is read from this response, so a
        # per_page of 1 is requested.
        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
        return self.playlist_result(
            entries, pid, self._get_playlist_title(playlist_data),
            self._get_description(playlist_data))
1638
1639
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer "episodes" listings, served by the graph.ibl.api.bbc.co.uk endpoint."""

    IE_NAME = 'bbc.co.uk:iplayer:episodes'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
        },
        'playlist_mincount': 8,
    }, {
        # all seasons
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 10,
    }, {
        # explicit season
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 5,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 37,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 1,
    }]
    _PAGE_SIZE = 100
    _DESCRIPTION_KEY = 'synopsis'

    def _get_episode_image(self, episode):
        """Return the episode's default image URL, if any."""
        return self._get_default(episode, 'image')

    def _get_episode_field(self, episode, field):
        """Return the ``default`` variant of ``field`` for the episode."""
        return self._get_default(episode, field)

    @staticmethod
    def _get_elements(data):
        """Return the list of result elements from a programme response."""
        entities = data['entities']
        return entities['results']

    @staticmethod
    def _get_episode(element):
        """Return the element's episode dict, or an empty dict when absent."""
        episode = element.get('episode')
        return episode if episode else {}

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """POST the listing query and return the ``programme`` part of the reply.

        ``series_id``, when given, is sent as the ``sliceId`` variable.
        """
        query_vars = {
            'id': pid,
            'page': page,
            'perPage': per_page,
            **({'sliceId': series_id} if series_id else {}),
        }
        # NOTE(review): the fixed 'id' below is presumably the identifier of
        # a stored query on the API side -- confirm.
        payload = json.dumps({
            'id': '5692d93d5aac8d796a0305e895e61551',
            'variables': query_vars,
        }).encode('utf-8')
        response = self._download_json(
            'https://graph.ibl.api.bbc.co.uk/', pid,
            headers={'Content-Type': 'application/json'}, data=payload)
        return response['data']['programme']

    @staticmethod
    def _get_playlist_data(data):
        """The programme response itself carries the playlist metadata."""
        return data

    def _get_playlist_title(self, data):
        """Return the default title of the programme."""
        return self._get_default(data, 'title')
1727
1728
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer "group" listings, served by the ibl v1 groups endpoint."""

    IE_NAME = 'bbc.co.uk:iplayer:group'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
    _TESTS = [{
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 47,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 11,
    }]
    _PAGE_SIZE = 200
    _DESCRIPTION_KEY = 'synopses'

    def _get_episode_image(self, episode):
        """Return the episode's ``standard`` image URL, if any."""
        return self._get_default(episode, 'images', 'standard')

    def _get_episode_field(self, episode, field):
        """Fields are stored directly on the episode dict for groups."""
        return episode.get(field)

    @staticmethod
    def _get_elements(data):
        """Return the list of elements from a group response."""
        return data['elements']

    @staticmethod
    def _get_episode(element):
        """Each element already is the episode dict."""
        return element

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Fetch one page of the group and return its ``group_episodes`` part.

        ``series_id`` is accepted for signature parity but not used here.
        """
        api_url = 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid
        paging = {
            'page': page,
            'per_page': per_page,
        }
        response = self._download_json(api_url, pid, query=paging)
        return response['group_episodes']

    @staticmethod
    def _get_playlist_data(data):
        """Return the ``group`` object, which holds the playlist metadata."""
        return data['group']

    def _get_playlist_title(self, data):
        """Return the group's title, if present."""
        return data.get('title')
ded7511a
S
1791
1792
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Programme episodes/broadcasts/clips listings under bbc.co.uk/programmes."""

    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Video ids are taken from data-pid attributes and turned into programme URLs.
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` read from the page's Open Graph tags.

        The title lookup is explicitly non-fatal.
        """
        return (
            self._og_search_title(webpage, fatal=False),
            self._og_search_description(webpage),
        )