]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/bbc.py
Add option `--ignore-no-formats-error`
[yt-dlp.git] / yt_dlp / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
1418a043 4import functools
254e64a2 5import itertools
1418a043 6import json
f0228f56 7import re
082c6c86 8
f13b1e7d 9from .common import InfoExtractor
3721515b 10from ..compat import (
11 compat_etree_Element,
12 compat_HTTPError,
1418a043 13 compat_parse_qs,
14 compat_urllib_parse_urlparse,
3721515b 15 compat_urlparse,
16)
8683b4d8 17from ..utils import (
3721515b 18 ExtractorError,
1418a043 19 OnDemandPagedList,
97067db2 20 clean_html,
9fb64c04 21 dict_get,
9afa1770 22 float_or_none,
97067db2 23 get_element_by_class,
8683b4d8 24 int_or_none,
6d155707 25 js_to_json,
9afa1770
S
26 parse_duration,
27 parse_iso8601,
9fb64c04 28 try_get,
dab062fb 29 unescapeHTML,
f0228f56 30 url_or_none,
97067db2
S
31 urlencode_postdata,
32 urljoin,
8683b4d8 33)
082c6c86 34
d12a1a47 35
f13b1e7d 36class BBCCoUkIE(InfoExtractor):
082c6c86 37 IE_NAME = 'bbc.co.uk'
2e3fd9ec 38 IE_DESC = 'BBC iPlayer'
6f356cbb 39 _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})'
f20a11ed
S
40 _VALID_URL = r'''(?x)
41 https?://
42 (?:www\.)?bbc\.co\.uk/
43 (?:
44 programmes/(?!articles/)|
45 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
72d256c4 46 music/(?:clips|audiovideo/popular)[/#]|
d3d45e0a 47 radio/player/|
b72305f0 48 sounds/play/|
d3d45e0a 49 events/[^/]+/play/[^/]+/
f20a11ed 50 )
ded7511a 51 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
f20a11ed 52 ''' % _ID_REGEX
082c6c86 53
97067db2
S
54 _LOGIN_URL = 'https://account.bbc.com/signin'
55 _NETRC_MACHINE = 'bbc'
56
29f7c58a 57 _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
58 _MEDIA_SETS = [
26ccc68b
S
59 # Provides HQ HLS streams with even better quality than the pc mediaset, but fails
60 # with geolocation in some cases even when it is not geo restricted at all (e.g.
d781e293 61 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
29f7c58a 62 'iptv-all',
63 'pc',
d12a1a47 64 ]
a8b081a0 65
e6174ee9
S
66 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
67
2e3fd9ec
S
68 _TESTS = [
69 {
f2d0fc68 70 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 71 'info_dict': {
f2d0fc68 72 'id': 'b039d07m',
b1ea6802 73 'ext': 'flv',
acc86c9a 74 'title': 'Kaleidoscope, Leonard Cohen',
c4914185 75 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
76 },
77 'params': {
b1ea6802 78 # rtmp download
2e3fd9ec
S
79 'skip_download': True,
80 }
082c6c86 81 },
2e3fd9ec
S
82 {
83 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
84 'info_dict': {
85 'id': 'b00yng1d',
86 'ext': 'flv',
87 'title': 'The Man in Black: Series 3: The Printed Name',
88 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
89 'duration': 1800,
90 },
91 'params': {
92 # rtmp download
93 'skip_download': True,
c7f0177f
S
94 },
95 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
96 },
97 {
98 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
99 'info_dict': {
100 'id': 'b00yng1d',
101 'ext': 'flv',
17968e44 102 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 103 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 104 'duration': 5100,
2e3fd9ec
S
105 },
106 'params': {
107 # rtmp download
108 'skip_download': True,
109 },
b1ea6802 110 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
111 },
112 {
113 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
114 'info_dict': {
115 'id': 'b03k3pb7',
116 'ext': 'flv',
117 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
118 'description': '2. Invasion',
119 'duration': 3600,
120 },
121 'params': {
122 # rtmp download
123 'skip_download': True,
124 },
b1ea6802 125 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
126 }, {
127 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
128 'info_dict': {
129 'id': 'b04v209v',
130 'ext': 'flv',
131 'title': 'Pete Tong, The Essential New Tune Special',
132 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
133 'duration': 10800,
134 },
135 'params': {
136 # rtmp download
137 'skip_download': True,
a3ef0e1c
YCH
138 },
139 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 140 }, {
5aa535c3 141 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
142 'note': 'Audio',
143 'info_dict': {
5aa535c3 144 'id': 'p022h44j',
b1ea6802 145 'ext': 'flv',
5aa535c3
S
146 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
147 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
148 'duration': 227,
c7e67594
S
149 },
150 'params': {
b1ea6802 151 # rtmp download
c7e67594
S
152 'skip_download': True,
153 }
154 }, {
155 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
156 'note': 'Video',
157 'info_dict': {
158 'id': 'p025c103',
b1ea6802 159 'ext': 'flv',
c7e67594
S
160 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
161 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
162 'duration': 226,
163 },
164 'params': {
b1ea6802 165 # rtmp download
c7e67594
S
166 'skip_download': True,
167 }
e68ae99a
S
168 }, {
169 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
170 'info_dict': {
171 'id': 'p02n76xf',
172 'ext': 'flv',
173 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
174 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
175 'duration': 3540,
176 },
177 'params': {
178 # rtmp download
179 'skip_download': True,
180 },
b1ea6802 181 'skip': 'geolocation',
25fa8d66
YCH
182 }, {
183 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
184 'info_dict': {
185 'id': 'b05zmgw1',
186 'ext': 'flv',
187 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
188 'title': 'Royal Academy Summer Exhibition',
189 'duration': 3540,
190 },
191 'params': {
192 # rtmp download
193 'skip_download': True,
194 },
b1ea6802 195 'skip': 'geolocation',
54914380
S
196 }, {
197 # iptv-all mediaset fails with geolocation however there is no geo restriction
198 # for this programme at all
5aa535c3 199 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 200 'info_dict': {
5aa535c3 201 'id': 'b06rkms3',
54914380 202 'ext': 'flv',
5aa535c3
S
203 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
204 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
205 },
206 'params': {
207 # rtmp download
208 'skip_download': True,
209 },
b1ea6802 210 'skip': 'Now it\'s really geo-restricted',
1ac6e794 211 }, {
067aa17e 212 # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
1ac6e794
S
213 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
214 'info_dict': {
215 'id': 'p028bfkj',
b1ea6802 216 'ext': 'flv',
1ac6e794
S
217 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
218 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
219 },
220 'params': {
b1ea6802 221 # rtmp download
1ac6e794
S
222 'skip_download': True,
223 },
b72305f0
J
224 }, {
225 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
226 'note': 'Audio',
227 'info_dict': {
228 'id': 'm0007jz9',
229 'ext': 'mp4',
230 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
231 'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
232 'duration': 9840,
233 },
234 'params': {
235 # rtmp download
236 'skip_download': True,
237 }
31763975
S
238 }, {
239 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
240 'only_matching': True,
c7e67594
S
241 }, {
242 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
243 'only_matching': True,
0692ef86
S
244 }, {
245 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
246 'only_matching': True,
f20a11ed
S
247 }, {
248 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
249 'only_matching': True,
72d256c4
S
250 }, {
251 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
252 'only_matching': True,
53647dfd
S
253 }, {
254 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
255 'only_matching': True,
6f356cbb
S
256 }, {
257 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
258 'only_matching': True,
259 }, {
260 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
261 'only_matching': True,
72d256c4 262 }]
2e3fd9ec 263
97067db2
S
264 def _login(self):
265 username, password = self._get_login_info()
266 if username is None:
267 return
268
269 login_page = self._download_webpage(
270 self._LOGIN_URL, None, 'Downloading signin page')
271
272 login_form = self._hidden_inputs(login_page)
273
274 login_form.update({
275 'username': username,
276 'password': password,
277 })
278
279 post_url = urljoin(self._LOGIN_URL, self._search_regex(
280 r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
281 'post url', default=self._LOGIN_URL, group='url'))
282
283 response, urlh = self._download_webpage_handle(
284 post_url, None, 'Logging in', data=urlencode_postdata(login_form),
285 headers={'Referer': self._LOGIN_URL})
286
287 if self._LOGIN_URL in urlh.geturl():
288 error = clean_html(get_element_by_class('form-message', response))
289 if error:
290 raise ExtractorError(
291 'Unable to login: %s' % error, expected=True)
292 raise ExtractorError('Unable to log in')
293
294 def _real_initialize(self):
295 self._login()
296
d12a1a47
S
class MediaSelectionError(Exception):
    """Exception carrying the error code of a failed media selection.

    The code passed by the raiser is kept in the ``id`` attribute and is
    also forwarded to ``Exception`` so that ``str(e)``, ``repr(e)`` and
    ``e.args`` are informative instead of empty.
    """

    def __init__(self, id):
        # Explicit base-class call (rather than a name-based super()) so this
        # keeps working when the class is nested inside another class body.
        Exception.__init__(self, id)
        self.id = id
300
2e3fd9ec
S
301 def _extract_asx_playlist(self, connection, programme_id):
302 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
303 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
304
2e3fd9ec 305 def _extract_items(self, playlist):
e6174ee9
S
306 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
307
2e3fd9ec 308 def _extract_medias(self, media_selection):
29f7c58a 309 error = media_selection.get('result')
310 if error:
311 raise BBCCoUkIE.MediaSelectionError(error)
312 return media_selection.get('media') or []
2e3fd9ec
S
313
314 def _extract_connections(self, media):
29f7c58a 315 return media.get('connection') or []
2e3fd9ec 316
f13b1e7d 317 def _get_subtitles(self, media, programme_id):
2e3fd9ec
S
318 subtitles = {}
319 for connection in self._extract_connections(media):
f0228f56
S
320 cc_url = url_or_none(connection.get('href'))
321 if not cc_url:
322 continue
323 captions = self._download_xml(
324 cc_url, programme_id, 'Downloading captions', fatal=False)
ee0ba927 325 if not isinstance(captions, compat_etree_Element):
f0228f56 326 continue
29f7c58a 327 subtitles['en'] = [
f13b1e7d
JMF
328 {
329 'url': connection.get('href'),
330 'ext': 'ttml',
331 },
f13b1e7d 332 ]
29f7c58a 333 break
2e3fd9ec 334 return subtitles
082c6c86 335
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Raise an expected ExtractorError naming this extractor (``IE_NAME``)
    and the ``id`` of the given media selection error."""
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
340
c056efa2 341 def _download_media_selector(self, programme_id):
d12a1a47 342 last_exception = None
29f7c58a 343 for media_set in self._MEDIA_SETS:
d12a1a47
S
344 try:
345 return self._download_media_selector_url(
29f7c58a 346 self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
d12a1a47 347 except BBCCoUkIE.MediaSelectionError as e:
d781e293 348 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
d12a1a47
S
349 last_exception = e
350 continue
351 self._raise_extractor_error(e)
352 self._raise_extractor_error(last_exception)
9afa1770
S
353
354 def _download_media_selector_url(self, url, programme_id=None):
29f7c58a 355 media_selection = self._download_json(
356 url, programme_id, 'Downloading media selection JSON',
9283d4ea 357 expected_status=(403, 404))
9afa1770 358 return self._process_media_selector(media_selection, programme_id)
082c6c86 359
9afa1770 360 def _process_media_selector(self, media_selection, programme_id):
082c6c86 361 formats = []
2e3fd9ec 362 subtitles = None
b0af1215 363 urls = []
2e3fd9ec 364
c056efa2
S
365 for media in self._extract_medias(media_selection):
366 kind = media.get('kind')
a7e5f274
RA
367 if kind in ('video', 'audio'):
368 bitrate = int_or_none(media.get('bitrate'))
369 encoding = media.get('encoding')
a7e5f274
RA
370 width = int_or_none(media.get('width'))
371 height = int_or_none(media.get('height'))
372 file_size = int_or_none(media.get('media_file_size'))
373 for connection in self._extract_connections(media):
b0af1215
RA
374 href = connection.get('href')
375 if href in urls:
376 continue
377 if href:
378 urls.append(href)
a7e5f274
RA
379 conn_kind = connection.get('kind')
380 protocol = connection.get('protocol')
381 supplier = connection.get('supplier')
a7e5f274
RA
382 transfer_format = connection.get('transferFormat')
383 format_id = supplier or conn_kind or protocol
a7e5f274
RA
384 # ASX playlist
385 if supplier == 'asx':
386 for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
387 formats.append({
388 'url': ref,
389 'format_id': 'ref%s_%s' % (i, format_id),
390 })
391 elif transfer_format == 'dash':
392 formats.extend(self._extract_mpd_formats(
393 href, programme_id, mpd_id=format_id, fatal=False))
394 elif transfer_format == 'hls':
395 formats.extend(self._extract_m3u8_formats(
396 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
397 m3u8_id=format_id, fatal=False))
398 elif transfer_format == 'hds':
399 formats.extend(self._extract_f4m_formats(
400 href, programme_id, f4m_id=format_id, fatal=False))
401 else:
29f7c58a 402 if not supplier and bitrate:
aaa42cf0 403 format_id += '-%d' % bitrate
a7e5f274
RA
404 fmt = {
405 'format_id': format_id,
406 'filesize': file_size,
407 }
408 if kind == 'video':
409 fmt.update({
410 'width': width,
411 'height': height,
6240925b 412 'tbr': bitrate,
a7e5f274
RA
413 'vcodec': encoding,
414 })
415 else:
416 fmt.update({
417 'abr': bitrate,
418 'acodec': encoding,
419 'vcodec': 'none',
420 })
1af959ef 421 if protocol in ('http', 'https'):
a7e5f274
RA
422 # Direct link
423 fmt.update({
424 'url': href,
425 })
426 elif protocol == 'rtmp':
427 application = connection.get('application', 'ondemand')
428 auth_string = connection.get('authString')
429 identifier = connection.get('identifier')
430 server = connection.get('server')
431 fmt.update({
432 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
433 'play_path': identifier,
434 'app': '%s?%s' % (application, auth_string),
435 'page_url': 'http://www.bbc.co.uk',
436 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
437 'rtmp_live': False,
438 'ext': 'flv',
439 })
964744af
S
440 else:
441 continue
a7e5f274 442 formats.append(fmt)
c056efa2 443 elif kind == 'captions':
f13b1e7d 444 subtitles = self.extract_subtitles(media, programme_id)
c056efa2 445 return formats, subtitles
2e3fd9ec 446
ae6986fb
S
447 def _download_playlist(self, playlist_id):
448 try:
449 playlist = self._download_json(
450 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
451 playlist_id, 'Downloading playlist JSON')
452
453 version = playlist.get('defaultAvailableVersion')
454 if version:
455 smp_config = version['smpConfig']
456 title = smp_config['title']
457 description = smp_config['summary']
458 for item in smp_config['items']:
459 kind = item['kind']
40fcba5e 460 if kind not in ('programme', 'radioProgramme'):
ae6986fb
S
461 continue
462 programme_id = item.get('vpid')
d97f5cd7 463 duration = int_or_none(item.get('duration'))
ae6986fb
S
464 formats, subtitles = self._download_media_selector(programme_id)
465 return programme_id, title, description, duration, formats, subtitles
466 except ExtractorError as ee:
f813928e 467 if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
ae6986fb
S
468 raise
469
470 # fallback to legacy playlist
9afa1770
S
471 return self._process_legacy_playlist(playlist_id)
472
473 def _process_legacy_playlist_url(self, url, display_id):
474 playlist = self._download_legacy_playlist_url(url, display_id)
475 return self._extract_from_legacy_playlist(playlist, display_id)
476
477 def _process_legacy_playlist(self, playlist_id):
478 return self._process_legacy_playlist_url(
479 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
480
481 def _download_legacy_playlist_url(self, url, playlist_id=None):
482 return self._download_xml(
483 url, playlist_id, 'Downloading legacy playlist XML')
ae6986fb 484
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract the first programme from a legacy (EMP namespace) playlist.

    Returns ``(programme_id, title, description, duration, formats,
    subtitles)`` for the first item whose ``kind`` is ``programme`` or
    ``radioProgramme``.  If the item (or its ``mediator`` child) carries a
    programme id, the formats come from the media selector; otherwise the
    item itself is processed as a media selection and *playlist_id* is used
    as the id.  Returns ``None`` implicitly when no such item exists.

    Raises ExtractorError (expected) when the playlist has a ``noItems``
    element; the message depends on its ``reason`` attribute.
    """
    ns = self._EMP_PLAYLIST_NS

    no_items = playlist.find('./{%s}noItems' % ns)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First of the 'identifier' / 'group' attributes that looks like an id.
        for attr in ('identifier', 'group'):
            value = element.get(attr)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value
        return None

    def get_programme_id(item):
        # Check the item's own attributes first.  The original code made this
        # call but discarded its result, so only the mediator was ever used.
        programme_id = get_from_attributes(item)
        if programme_id:
            return programme_id
        mediator = item.find('./{%s}mediator' % ns)
        if mediator is not None:
            return get_from_attributes(mediator)
        return None

    for item in self._extract_items(playlist):
        kind = item.get('kind')
        if kind not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % ns).text
        description_el = playlist.find('./{%s}summary' % ns)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No id available: treat the item itself as the media selection.
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

        return programme_id, title, description, duration, formats, subtitles
528
c056efa2
S
529 def _real_extract(self, url):
530 group_id = self._match_id(url)
531
532 webpage = self._download_webpage(url, group_id, 'Downloading video page')
533
b2ed954f 534 error = self._search_regex(
29f7c58a 535 r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
b2ed954f
S
536 webpage, 'error', default=None)
537 if error:
538 raise ExtractorError(error, expected=True)
539
8683b4d8 540 programme_id = None
679bacf0 541 duration = None
8683b4d8
S
542
543 tviplayer = self._search_regex(
544 r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
545 webpage, 'player', default=None)
546
547 if tviplayer:
548 player = self._parse_json(tviplayer, group_id).get('player', {})
549 duration = int_or_none(player.get('duration'))
550 programme_id = player.get('vpid')
551
552 if not programme_id:
553 programme_id = self._search_regex(
22d7368d 554 r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
8683b4d8 555
c056efa2 556 if programme_id:
c056efa2 557 formats, subtitles = self._download_media_selector(programme_id)
88fb59d9 558 title = self._og_search_title(webpage, default=None) or self._html_search_regex(
50e989e2
S
559 (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
560 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
8683b4d8 561 description = self._search_regex(
a8534274
S
562 (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
563 r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
679bacf0
YCH
564 webpage, 'description', default=None)
565 if not description:
566 description = self._html_search_meta('description', webpage)
c056efa2 567 else:
ae6986fb 568 programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
2e3fd9ec 569
082c6c86
S
570 self._sort_formats(formats)
571
572 return {
2e3fd9ec 573 'id': programme_id,
082c6c86
S
574 'title': title,
575 'description': description,
650cfd0c 576 'thumbnail': self._og_search_thumbnail(webpage, default=None),
082c6c86
S
577 'duration': duration,
578 'formats': formats,
2e3fd9ec 579 'subtitles': subtitles,
5f6a1245 580 }
10273d6e 581
582
9afa1770
S
583class BBCIE(BBCCoUkIE):
584 IE_NAME = 'bbc'
585 IE_DESC = 'BBC'
586 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 587
29f7c58a 588 _MEDIA_SETS = [
589 'mobile-tablet-main',
590 'pc',
d12a1a47 591 ]
10273d6e 592
593 _TESTS = [{
6a747190 594 # article with multiple videos embedded with data-playable containing vpids
10273d6e 595 'url': 'http://www.bbc.com/news/world-europe-32668511',
596 'info_dict': {
597 'id': 'world-europe-32668511',
acc86c9a 598 'title': 'Russia stages massive WW2 parade',
9afa1770 599 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 600 },
601 'playlist_count': 2,
a3bfddfa 602 }, {
6a747190 603 # article with multiple videos embedded with data-playable (more videos)
10273d6e 604 'url': 'http://www.bbc.com/news/business-28299555',
605 'info_dict': {
606 'id': 'business-28299555',
607 'title': 'Farnborough Airshow: Video highlights',
9afa1770 608 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 609 },
610 'playlist_count': 9,
9afa1770 611 'skip': 'Save time',
88ed52ae
S
612 }, {
613 # article with multiple videos embedded with `new SMP()`
6a747190 614 # broken
88ed52ae
S
615 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
616 'info_dict': {
617 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 618 'title': 'BUGGER',
88ed52ae
S
619 },
620 'playlist_count': 18,
a3bfddfa 621 }, {
6a747190 622 # single video embedded with data-playable containing vpid
10273d6e 623 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 624 'info_dict': {
625 'id': 'p02mprgb',
55ebae26 626 'ext': 'mp4',
10273d6e 627 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 628 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 629 'duration': 47,
9afa1770 630 'timestamp': 1427219242,
da92eeae 631 'upload_date': '20150324',
10273d6e 632 },
633 'params': {
9afa1770 634 # rtmp download
10273d6e 635 'skip_download': True,
636 }
a3bfddfa 637 }, {
6a747190
S
638 # article with single video embedded with data-playable containing XML playlist
639 # with direct video links as progressiveDownloadUrl (for now these are extracted)
640 # and playlist with f4m and m3u8 as streamingUrl
de939d89 641 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 642 'info_dict': {
9afa1770 643 'id': '150615_telabyad_kentin_cogu',
de939d89 644 'ext': 'mp4',
ad152e2d 645 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 646 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 647 'timestamp': 1434397334,
da92eeae 648 'upload_date': '20150615',
de939d89 649 },
650 'params': {
651 'skip_download': True,
652 }
c936d8cc 653 }, {
6a747190 654 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 655 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 656 'info_dict': {
9afa1770 657 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 658 'ext': 'mp4',
9afa1770 659 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 660 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 661 'timestamp': 1434713142,
da92eeae 662 'upload_date': '20150619',
de939d89 663 },
664 'params': {
665 'skip_download': True,
666 }
a346b1ff
S
667 }, {
668 # single video from video playlist embedded with vxp-playlist-data JSON
669 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
670 'info_dict': {
671 'id': 'p02w6qjc',
55ebae26 672 'ext': 'mp4',
a346b1ff
S
673 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
674 'duration': 56,
0bc4ee60 675 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
676 },
677 'params': {
678 'skip_download': True,
679 }
9afa1770
S
680 }, {
681 # single video story with digitalData
682 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
683 'info_dict': {
684 'id': 'p02q6gc4',
685 'ext': 'flv',
686 'title': 'Sri Lanka’s spicy secret',
687 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
688 'timestamp': 1437674293,
689 'upload_date': '20150723',
690 },
691 'params': {
692 # rtmp download
693 'skip_download': True,
694 }
695 }, {
696 # single video story without digitalData
697 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
698 'info_dict': {
699 'id': 'p018zqqg',
55ebae26 700 'ext': 'mp4',
9afa1770
S
701 'title': 'Hyundai Santa Fe Sport: Rock star',
702 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
703 'timestamp': 1415867444,
704 'upload_date': '20141113',
9afa1770
S
705 },
706 'params': {
707 # rtmp download
708 'skip_download': True,
709 }
9fb64c04
S
710 }, {
711 # single video embedded with Morph
712 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
713 'info_dict': {
714 'id': 'p041vhd0',
715 'ext': 'mp4',
716 'title': "Nigeria v Japan - Men's First Round",
717 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
718 'duration': 7980,
719 'uploader': 'BBC Sport',
720 'uploader_id': 'bbc_sport',
721 },
722 'params': {
723 # m3u8 download
724 'skip_download': True,
9fb64c04
S
725 },
726 'skip': 'Georestricted to UK',
9afa1770 727 }, {
6a747190 728 # single video with playlist.sxml URL in playlist param
9afa1770
S
729 'url': 'http://www.bbc.com/sport/0/football/33653409',
730 'info_dict': {
731 'id': 'p02xycnp',
55ebae26 732 'ext': 'mp4',
9afa1770 733 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 734 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
735 'duration': 140,
736 },
737 'params': {
738 # rtmp download
739 'skip_download': True,
740 }
b5d48cb1 741 }, {
6a747190 742 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
743 'url': 'http://www.bbc.com/sport/0/football/34475836',
744 'info_dict': {
745 'id': '34475836',
450b233c 746 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 747 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
748 },
749 'playlist_count': 3,
450b233c
S
750 }, {
751 # school report article with single video
752 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
753 'info_dict': {
754 'id': '35744779',
755 'title': 'School which breaks down barriers in Jerusalem',
756 },
757 'playlist_count': 1,
9afa1770
S
758 }, {
759 # single video with playlist URL from weather section
760 'url': 'http://www.bbc.com/weather/features/33601775',
761 'only_matching': True,
762 }, {
763 # custom redirection to www.bbc.com
764 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
765 'only_matching': True,
a1cf3e38
S
766 }, {
767 # single video article embedded with data-media-vpid
768 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
769 'only_matching': True,
6d155707
S
770 }, {
771 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
772 'info_dict': {
773 'id': 'p06556y7',
774 'ext': 'mp4',
775 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
776 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd',
777 },
778 'params': {
779 'skip_download': True,
780 }
b96b4be4
RA
781 }, {
782 # window.__PRELOADED_STATE__
783 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
784 'info_dict': {
785 'id': 'b0b9z4vz',
786 'ext': 'mp4',
787 'title': 'Prom 6: An American in Paris and Turangalila',
788 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
789 'uploader': 'Radio 3',
790 'uploader_id': 'bbc_radio_three',
791 },
373941c5
S
792 }, {
793 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
794 'info_dict': {
795 'id': 'p06w9tws',
796 'ext': 'mp4',
797 'title': 'md5:2fabf12a726603193a2879a055f72514',
798 'description': 'Learn English words and phrases from this story',
799 },
800 'add_ie': [BBCCoUkIE.ie_key()],
3721515b 801 }, {
802 # BBC Reel
803 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
804 'info_dict': {
805 'id': 'p07c6sb9',
806 'ext': 'mp4',
807 'title': 'How positive thinking is harming your happiness',
808 'alt_title': 'The downsides of positive thinking',
809 'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
810 'duration': 235,
811 'thumbnail': r're:https?://.+/p07c9dsr.jpg',
812 'upload_date': '20190604',
813 'categories': ['Psychology'],
814 },
10273d6e 815 }]
816
9afa1770
S
@classmethod
def suitable(cls, url):
    """Return False when any of the more specific BBC extractors listed
    below accepts *url*; otherwise defer to the parent class's check."""
    more_specific = (
        BBCCoUkIE,
        BBCCoUkArticleIE,
        BBCCoUkIPlayerEpisodesIE,
        BBCCoUkIPlayerGroupIE,
        BBCCoUkPlaylistIE,
    )
    for extractor in more_specific:
        if extractor.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
822
823 def _extract_from_media_meta(self, media_meta, video_id):
824 # Direct links to media in media metadata (e.g.
825 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
826 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
827 source_files = media_meta.get('sourceFiles')
828 if source_files:
829 return [{
830 'url': f['url'],
831 'format_id': format_id,
832 'ext': f.get('encoding'),
833 'tbr': float_or_none(f.get('bitrate'), 1000),
834 'filesize': int_or_none(f.get('filesize')),
835 } for format_id, f in source_files.items() if f.get('url')], []
836
837 programme_id = media_meta.get('externalId')
838 if programme_id:
839 return self._download_media_selector(programme_id)
840
841 # Process playlist.sxml as legacy playlist
842 href = media_meta.get('href')
843 if href:
844 playlist = self._download_legacy_playlist_url(href)
845 _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
846 return formats, subtitles
847
848 return [], []
849
baf39a1a
S
def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
    """Build an info dict from a legacy ``playlist.sxml`` URL.

    The legacy playlist at *url* is processed, its formats are sorted in
    place, and the result is combined with the caller-supplied
    *timestamp* into a single dict.
    """
    legacy = self._process_legacy_playlist_url(url, playlist_id)
    programme_id, title, description, duration, formats, subtitles = legacy
    self._sort_formats(formats)
    info = {
        'id': programme_id,
        'title': title,
        'description': description,
        'duration': duration,
        'timestamp': timestamp,
        'formats': formats,
        'subtitles': subtitles,
    }
    return info
863
def _real_extract(self, url):
    """Extract one video or a playlist from a BBC web page.

    The page is downloaded once and then probed with a sequence of
    strategies; the first one that yields media returns immediately:

    1. ``playlist.sxml`` embeds and ``data-playable`` attributes
    2. a ``data-pid`` video group (delegated to ``BBCCoUkIE``)
    3. a single vpid found in the markup
    4. BBC Reel ``initial-data``
    5. a ``Morph.setPayload`` embed
    6. ``window.__PRELOADED_STATE__``
    7. the ``bbcthreeConfig`` payload
    8. ``window.__INITIAL_DATA__``
    9. ``new SMP(...)`` / ``setPlaylist(...)`` embed URLs
    10. ``data-media-meta``, ``mediaAssetPage.init`` or the
        ``vxp-playlist-data`` script

    JSON blobs parsed with ``fatal=False`` are normalised to ``{}`` before
    use so that a malformed blob does not raise ``AttributeError``.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title')
    if not playlist_title:
        playlist_title = self._og_search_title(
            webpage, default=None) or self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
        if playlist_title:
            # Drop the trailing " - BBC ..." suffix from the page title
            playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
    if data_playables:
        for _, data_playable_json in data_playables:
            data_playable = self._parse_json(
                unescapeHTML(data_playable_json), playlist_id, fatal=False)
            if not data_playable:
                continue
            settings = data_playable.get('settings', {})
            if settings:
                # data-playable with video vpid in settings.playlistObject.items (e.g.
                # http://www.bbc.com/news/world-us-canada-34473351)
                playlist_object = settings.get('playlistObject', {})
                if playlist_object:
                    items = playlist_object.get('items')
                    if items and isinstance(items, list):
                        title = playlist_object['title']
                        description = playlist_object.get('summary')
                        duration = int_or_none(items[0].get('duration'))
                        programme_id = items[0].get('vpid')
                        formats, subtitles = self._download_media_selector(programme_id)
                        self._sort_formats(formats)
                        entries.append({
                            'id': programme_id,
                            'title': title,
                            'description': description,
                            'timestamp': timestamp,
                            'duration': duration,
                            'formats': formats,
                            'subtitles': subtitles,
                        })
            else:
                # data-playable without vpid but with a playlist.sxml URLs
                # in otherSettings.playlist (e.g.
                # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
                playlist = data_playable.get('otherSettings', {}).get('playlist', {})
                if playlist:
                    entry = None
                    for key in ('streaming', 'progressiveDownload'):
                        playlist_url = playlist.get('%sUrl' % key)
                        if not playlist_url:
                            continue
                        try:
                            info = self._extract_from_playlist_sxml(
                                playlist_url, playlist_id, timestamp)
                            if not entry:
                                entry = info
                            else:
                                # Merge the second playlist into the first entry
                                entry['title'] = info['title']
                                entry['formats'].extend(info['formats'])
                        except ExtractorError as e:
                            # Some playlist URL may fail with 500, at the same time
                            # the other one may work fine (e.g.
                            # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
                                continue
                            raise
                    if entry:
                        self._sort_formats(entry['formats'])
                        entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
    group_id = self._search_regex(
        r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
        webpage, 'group id', default=None)
    if group_id:
        return self.url_result(
            'https://www.bbc.co.uk/programmes/%s' % group_id,
            ie=BBCCoUkIE.ie_key())

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        self._sort_formats(formats)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # or unparsable; with fatal=False the parser then yields a falsy
        # value, so fall back to an empty dict before dereferencing.
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = (digital_data.get('page') or {}).get('pageInfo') or {}
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
    initial_data = self._parse_json(self._html_search_regex(
        r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
        webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
    if initial_data:
        init_data = try_get(
            initial_data, lambda x: x['initData']['items'][0], dict) or {}
        smp_data = init_data.get('smpData') or {}
        clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
        version_id = clip_data.get('versionID')
        if version_id:
            title = smp_data['title']
            formats, subtitles = self._download_media_selector(version_id)
            self._sort_formats(formats)
            image_url = smp_data.get('holdingImageURL')
            display_date = init_data.get('displayDate')
            topic_title = init_data.get('topicTitle')

            return {
                'id': version_id,
                'title': title,
                'formats': formats,
                'alt_title': init_data.get('shortTitle'),
                'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
                'description': smp_data.get('summary') or init_data.get('shortSummary'),
                'upload_date': display_date.replace('-', '') if display_date else None,
                'subtitles': subtitles,
                'duration': int_or_none(clip_data.get('duration')),
                'categories': [topic_title] if topic_title else None,
            }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # There are several setPayload calls may be present but the video
    # seems to be always related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    preload_state = self._parse_json(self._search_regex(
        r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
        'preload state', default='{}'), playlist_id, fatal=False)
    if preload_state:
        # ``or {}`` (rather than a .get default) also covers explicit nulls
        current_programme = (preload_state.get('programmes') or {}).get('current') or {}
        programme_id = current_programme.get('id')
        if current_programme and programme_id and current_programme.get('type') == 'playable_item':
            title = (current_programme.get('titles') or {}).get('tertiary') or playlist_title
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            synopses = current_programme.get('synopses') or {}
            network = current_programme.get('network') or {}
            duration = int_or_none(
                (current_programme.get('duration') or {}).get('value'))
            thumbnail = None
            image_url = current_programme.get('image_url')
            if image_url:
                thumbnail = image_url.replace('{recipe}', 'raw')
            return {
                'id': programme_id,
                'title': title,
                'description': dict_get(synopses, ('long', 'medium', 'short')),
                'thumbnail': thumbnail,
                'duration': duration,
                'uploader': network.get('short_title'),
                'uploader_id': network.get('id'),
                'formats': formats,
                'subtitles': subtitles,
            }

    bbc3_config = self._parse_json(
        self._search_regex(
            r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
            'bbcthree config', default='{}'),
        playlist_id, transform_source=js_to_json, fatal=False) or {}
    payload = bbc3_config.get('payload') or {}
    if payload:
        clip = payload.get('currentClip') or {}
        clip_vpid = clip.get('vpid')
        clip_title = clip.get('title')
        if clip_vpid and clip_title:
            formats, subtitles = self._download_media_selector(clip_vpid)
            self._sort_formats(formats)
            return {
                'id': clip_vpid,
                'title': clip_title,
                'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
                'description': clip.get('description'),
                'duration': parse_duration(clip.get('duration')),
                'formats': formats,
                'subtitles': subtitles,
            }
        bbc3_playlist = try_get(
            payload, lambda x: x['content']['bbcMedia']['playlist'],
            dict)
        if bbc3_playlist:
            playlist_title = bbc3_playlist.get('title') or playlist_title
            thumbnail = bbc3_playlist.get('holdingImageURL')
            entries = []
            # A playlist without an ``items`` key yields an empty result
            # instead of raising KeyError.
            for bbc3_item in bbc3_playlist.get('items') or []:
                programme_id = bbc3_item.get('versionID')
                if not programme_id:
                    continue
                formats, subtitles = self._download_media_selector(programme_id)
                self._sort_formats(formats)
                entries.append({
                    'id': programme_id,
                    'title': playlist_title,
                    'thumbnail': thumbnail,
                    'timestamp': timestamp,
                    'formats': formats,
                    'subtitles': subtitles,
                })
            return self.playlist_result(
                entries, playlist_id, playlist_title, playlist_description)

    initial_data = self._parse_json(self._search_regex(
        r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage,
        'preload state', default='{}'), playlist_id, fatal=False)
    if initial_data:
        def parse_media(media):
            # Appends one entry per titled media item to the enclosing
            # ``entries`` list; silently ignores empty input.
            if not media:
                return
            for item in (try_get(media, lambda x: x['media']['items'], list) or []):
                item_id = item.get('id')
                item_title = item.get('title')
                if not (item_id and item_title):
                    continue
                formats, subtitles = self._download_media_selector(item_id)
                self._sort_formats(formats)
                entries.append({
                    'id': item_id,
                    'title': item_title,
                    'thumbnail': item.get('holdingImageUrl'),
                    'formats': formats,
                    'subtitles': subtitles,
                })
        for resp in (initial_data.get('data') or {}).values():
            name = resp.get('name')
            if name == 'media-experience':
                parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
            elif name == 'article':
                for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []):
                    if block.get('type') != 'media':
                        continue
                    parse_media(block.get('model'))
        # Returned even when no entry was collected: the strategies below
        # are only reached when no __INITIAL_DATA__ blob is present.
        return self.playlist_result(
            entries, playlist_id, playlist_title, playlist_description)

    def extract_all(pattern):
        # Parse every regex match as JSON and keep the ones that parse
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # fatal=False: an unparsable blob is treated as "no videos"
            # rather than being dereferenced.
            media_asset_page = self._parse_json(
                media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in (media_asset_page.get('videos') or {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        # This is the last resort, so both the search and the parse are fatal.
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        # NOTE(review): confirm that 'ignore_no_formats' is the params key
        # actually set by the --ignore-no-formats-error option.
        if not formats and not self._downloader.params.get('ignore_no_formats'):
            continue
        self._sort_formats(formats)

        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1282
1283
class BBCCoUkArticleIE(InfoExtractor):
    """Playlist extractor for ``bbc.co.uk/programmes/articles/<id>`` pages.

    Every ``<div typeof="Clip" resource="...">`` found on the page becomes
    one URL entry of the resulting playlist.
    """
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist of the clip URLs referenced by the article."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # The description lookup is not guaranteed to find anything; only
        # strip when a value came back so a page without og:description
        # does not abort extraction with AttributeError.
        description = self._og_search_description(webpage)
        if description:
            description = description.strip()

        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1312
1313
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared logic for paginated bbc.co.uk playlist pages.

    Subclasses are expected to provide ``_VIDEO_ID_TEMPLATE``,
    ``_URL_TEMPLATE`` and ``_extract_title_and_description``; all three are
    used below but not defined in this class.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a URL result for each video id found on the page(s).

        When *url* carries a ``page`` query parameter only the already
        downloaded *webpage* is scanned. Otherwise the "next" pagination
        link is followed until no such link is found.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # The first page is already in ``webpage``; numbering starts at 2
        # because page_num labels the *next* page to download.
        for page_num in itertools.count(2):
            for video_id in re.findall(
                    self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # Pass note/errnote by keyword: the bare page number used to
            # land in the error-note slot, producing a meaningless message.
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                note='Downloading page %d' % page_num,
                errnote='Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Return a playlist whose entries are produced lazily by ``_entries``."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
ded7511a
S
1344
1345
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
    """Common base for the iPlayer ``episodes`` and ``group`` playlists.

    Subclasses fill ``_VALID_URL_TMPL`` with their path segment and supply
    ``_PAGE_SIZE``, ``_DESCRIPTION_KEY``, ``_call_api``, ``_get_elements``,
    ``_get_episode``, ``_get_episode_image``, ``_get_episode_field``,
    ``_get_playlist_data`` and ``_get_playlist_title``, which are used
    below but not defined here.
    """
    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    @staticmethod
    def _get_default(episode, key, default_key='default'):
        """Return ``episode[key][default_key]`` via ``try_get``."""
        def lookup(obj):
            return obj[key][default_key]
        return try_get(episode, lookup)

    def _get_description(self, data):
        """Pick the longest available synopsis stored under ``_DESCRIPTION_KEY``."""
        synopses = data.get(self._DESCRIPTION_KEY) or {}
        return dict_get(synopses, ('large', 'medium', 'small'))

    def _fetch_page(self, programme_id, per_page, series_id, page):
        """Yield URL entries for the zero-based *page* of the listing."""
        # The API is called with a one-based page number.
        api_response = self._call_api(programme_id, per_page, page + 1, series_id)
        for element in self._get_elements(api_response):
            episode = self._get_episode(element)
            episode_id = episode.get('id')
            if not episode_id:
                continue
            image = self._get_episode_image(episode)
            thumbnail = image.replace('{recipe}', 'raw') if image else None
            category = self._get_default(episode, 'labels', 'category')
            yield {
                '_type': 'url',
                'id': episode_id,
                'title': self._get_episode_field(episode, 'subtitle'),
                'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                'thumbnail': thumbnail,
                'description': self._get_description(episode),
                'categories': [category] if category else None,
                'series': self._get_episode_field(episode, 'title'),
                'ie_key': BBCCoUkIE.ie_key(),
            }

    def _real_extract(self, url):
        """Return the playlist, honouring ``seriesId`` and ``page`` query args."""
        pid = self._match_id(url)
        query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        series_id = query.get('seriesId', [None])[0]
        page = query.get('page', [None])[0]
        # NOTE(review): 36 is presumably the website's own page size for
        # explicit ?page=N requests - confirm.
        per_page = 36 if page else self._PAGE_SIZE
        fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id)
        if page:
            # Explicit page requested: only that single page is produced.
            entries = fetch_page(int(page) - 1)
        else:
            entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)
        # A separate one-item request supplies the playlist metadata.
        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
        playlist_title = self._get_playlist_title(playlist_data)
        playlist_description = self._get_description(playlist_data)
        return self.playlist_result(
            entries, pid, playlist_title, playlist_description)
1394
1395
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/iplayer/episodes/<pid>`` playlists.

    Data is fetched from ``https://graph.ibl.api.bbc.co.uk/``.
    """
    IE_NAME = 'bbc.co.uk:iplayer:episodes'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
        },
        'playlist_mincount': 8,
    }, {
        # all seasons
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 10,
    }, {
        # explicit season
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 5,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 37,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 1,
    }]
    _PAGE_SIZE = 100
    _DESCRIPTION_KEY = 'synopsis'

    def _get_episode_image(self, episode):
        """Return the ``default`` variant of the episode's ``image``."""
        return self._get_default(episode, 'image')

    def _get_episode_field(self, episode, field):
        """Return the ``default`` variant of *field* for the episode."""
        return self._get_default(episode, field)

    @staticmethod
    def _get_elements(data):
        """Return the result list nested under ``entities``."""
        entities = data['entities']
        return entities['results']

    @staticmethod
    def _get_episode(element):
        """Return the element's ``episode`` dict, or an empty dict."""
        episode = element.get('episode')
        return episode if episode else {}

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """POST the query variables as JSON and return ``data.programme``."""
        variables = {'id': pid, 'page': page, 'perPage': per_page}
        if series_id:
            variables['sliceId'] = series_id
        # NOTE(review): the fixed 'id' presumably names a stored query on
        # the API side - confirm.
        payload = json.dumps({
            'id': '5692d93d5aac8d796a0305e895e61551',
            'variables': variables,
        }).encode('utf-8')
        response = self._download_json(
            'https://graph.ibl.api.bbc.co.uk/', pid,
            headers={'Content-Type': 'application/json'}, data=payload)
        return response['data']['programme']

    @staticmethod
    def _get_playlist_data(data):
        """The API response is already the playlist data; return it as is."""
        return data

    def _get_playlist_title(self, data):
        """Return the ``default`` variant of the playlist ``title``."""
        return self._get_default(data, 'title')
1483
1484
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/iplayer/group/<pid>`` playlists.

    Data is fetched from the ``ibl/v1/groups/<pid>/episodes`` endpoint.
    """
    IE_NAME = 'bbc.co.uk:iplayer:group'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
    _TESTS = [{
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 47,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 11,
    }]
    _PAGE_SIZE = 200
    _DESCRIPTION_KEY = 'synopses'

    def _get_episode_image(self, episode):
        """Return the ``standard`` variant of the episode's ``images``."""
        return self._get_default(episode, 'images', 'standard')

    def _get_episode_field(self, episode, field):
        """Fields are stored flat on the episode; read *field* directly."""
        return episode.get(field)

    @staticmethod
    def _get_elements(data):
        """Return the list stored under ``elements``."""
        return data['elements']

    @staticmethod
    def _get_episode(element):
        """Each element already is the episode; return it unchanged."""
        return element

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Fetch one page of the group and return ``group_episodes``.

        *series_id* is accepted for signature parity with the sibling
        extractor but is not used by this request.
        """
        paging = {
            'page': page,
            'per_page': per_page,
        }
        response = self._download_json(
            'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid,
            pid, query=paging)
        return response['group_episodes']

    @staticmethod
    def _get_playlist_data(data):
        """Return the ``group`` member holding the playlist metadata."""
        return data['group']

    def _get_playlist_title(self, data):
        """Return the plain ``title`` of the group."""
        return data.get('title')
ded7511a
S
1547
1548
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlists under ``/programmes/<pid>/(episodes|broadcasts|clips)``."""
    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Filled with a video id to build each entry URL
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    # Filled with the id regex to locate video ids in the page markup
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return the ``(title, description)`` pair read from Open Graph tags."""
        og_title = self._og_search_title(webpage, fatal=False)
        og_description = self._og_search_description(webpage)
        return og_title, og_description