]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/bbc.py
[cleanup] Revert unnecessary changes in 51d9739f8031fb37d8e25b0e9f1abea561e3d2e3
[yt-dlp.git] / yt_dlp / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
1418a043 4import functools
254e64a2 5import itertools
1418a043 6import json
f0228f56 7import re
082c6c86 8
f13b1e7d 9from .common import InfoExtractor
3721515b 10from ..compat import (
11 compat_etree_Element,
12 compat_HTTPError,
1418a043 13 compat_parse_qs,
1bdae7d3 14 compat_str,
1418a043 15 compat_urllib_parse_urlparse,
3721515b 16 compat_urlparse,
17)
8683b4d8 18from ..utils import (
3721515b 19 ExtractorError,
1418a043 20 OnDemandPagedList,
97067db2 21 clean_html,
9fb64c04 22 dict_get,
9afa1770 23 float_or_none,
97067db2 24 get_element_by_class,
8683b4d8 25 int_or_none,
6d155707 26 js_to_json,
9afa1770
S
27 parse_duration,
28 parse_iso8601,
1bdae7d3 29 strip_or_none,
9fb64c04 30 try_get,
dab062fb 31 unescapeHTML,
1bdae7d3 32 unified_timestamp,
f0228f56 33 url_or_none,
97067db2
S
34 urlencode_postdata,
35 urljoin,
8683b4d8 36)
082c6c86 37
d12a1a47 38
f13b1e7d 39class BBCCoUkIE(InfoExtractor):
082c6c86 40 IE_NAME = 'bbc.co.uk'
2e3fd9ec 41 IE_DESC = 'BBC iPlayer'
6f356cbb 42 _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})'
f20a11ed
S
43 _VALID_URL = r'''(?x)
44 https?://
45 (?:www\.)?bbc\.co\.uk/
46 (?:
47 programmes/(?!articles/)|
48 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
72d256c4 49 music/(?:clips|audiovideo/popular)[/#]|
d3d45e0a 50 radio/player/|
b72305f0 51 sounds/play/|
d3d45e0a 52 events/[^/]+/play/[^/]+/
f20a11ed 53 )
ded7511a 54 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
f20a11ed 55 ''' % _ID_REGEX
082c6c86 56
97067db2
S
57 _LOGIN_URL = 'https://account.bbc.com/signin'
58 _NETRC_MACHINE = 'bbc'
59
29f7c58a 60 _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
61 _MEDIA_SETS = [
26ccc68b
S
62 # Provides HQ HLS streams with even better quality than pc mediaset but fails
63 # with geolocation in some cases even when it's not geo restricted at all (e.g.
d781e293 64 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
29f7c58a 65 'iptv-all',
66 'pc',
d12a1a47 67 ]
a8b081a0 68
e6174ee9
S
69 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
70
2e3fd9ec
S
71 _TESTS = [
72 {
f2d0fc68 73 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 74 'info_dict': {
f2d0fc68 75 'id': 'b039d07m',
b1ea6802 76 'ext': 'flv',
acc86c9a 77 'title': 'Kaleidoscope, Leonard Cohen',
c4914185 78 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
79 },
80 'params': {
b1ea6802 81 # rtmp download
2e3fd9ec
S
82 'skip_download': True,
83 }
082c6c86 84 },
2e3fd9ec
S
85 {
86 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
87 'info_dict': {
88 'id': 'b00yng1d',
89 'ext': 'flv',
90 'title': 'The Man in Black: Series 3: The Printed Name',
91 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
92 'duration': 1800,
93 },
94 'params': {
95 # rtmp download
96 'skip_download': True,
c7f0177f
S
97 },
98 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
99 },
100 {
101 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
102 'info_dict': {
103 'id': 'b00yng1d',
104 'ext': 'flv',
17968e44 105 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 106 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 107 'duration': 5100,
2e3fd9ec
S
108 },
109 'params': {
110 # rtmp download
111 'skip_download': True,
112 },
b1ea6802 113 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
114 },
115 {
116 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
117 'info_dict': {
118 'id': 'b03k3pb7',
119 'ext': 'flv',
120 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
121 'description': '2. Invasion',
122 'duration': 3600,
123 },
124 'params': {
125 # rtmp download
126 'skip_download': True,
127 },
b1ea6802 128 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
129 }, {
130 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
131 'info_dict': {
132 'id': 'b04v209v',
133 'ext': 'flv',
134 'title': 'Pete Tong, The Essential New Tune Special',
135 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
136 'duration': 10800,
137 },
138 'params': {
139 # rtmp download
140 'skip_download': True,
a3ef0e1c
YCH
141 },
142 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 143 }, {
5aa535c3 144 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
145 'note': 'Audio',
146 'info_dict': {
5aa535c3 147 'id': 'p022h44j',
b1ea6802 148 'ext': 'flv',
5aa535c3
S
149 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
150 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
151 'duration': 227,
c7e67594
S
152 },
153 'params': {
b1ea6802 154 # rtmp download
c7e67594
S
155 'skip_download': True,
156 }
157 }, {
158 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
159 'note': 'Video',
160 'info_dict': {
161 'id': 'p025c103',
b1ea6802 162 'ext': 'flv',
c7e67594
S
163 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
164 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
165 'duration': 226,
166 },
167 'params': {
b1ea6802 168 # rtmp download
c7e67594
S
169 'skip_download': True,
170 }
e68ae99a
S
171 }, {
172 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
173 'info_dict': {
174 'id': 'p02n76xf',
175 'ext': 'flv',
176 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
177 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
178 'duration': 3540,
179 },
180 'params': {
181 # rtmp download
182 'skip_download': True,
183 },
b1ea6802 184 'skip': 'geolocation',
25fa8d66
YCH
185 }, {
186 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
187 'info_dict': {
188 'id': 'b05zmgw1',
189 'ext': 'flv',
190 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
191 'title': 'Royal Academy Summer Exhibition',
192 'duration': 3540,
193 },
194 'params': {
195 # rtmp download
196 'skip_download': True,
197 },
b1ea6802 198 'skip': 'geolocation',
54914380
S
199 }, {
200 # iptv-all mediaset fails with geolocation however there is no geo restriction
201 # for this programme at all
5aa535c3 202 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 203 'info_dict': {
5aa535c3 204 'id': 'b06rkms3',
54914380 205 'ext': 'flv',
5aa535c3
S
206 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
207 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
208 },
209 'params': {
210 # rtmp download
211 'skip_download': True,
212 },
b1ea6802 213 'skip': 'Now it\'s really geo-restricted',
1ac6e794 214 }, {
067aa17e 215 # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
1ac6e794
S
216 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
217 'info_dict': {
218 'id': 'p028bfkj',
b1ea6802 219 'ext': 'flv',
1ac6e794
S
220 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
221 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
222 },
223 'params': {
b1ea6802 224 # rtmp download
1ac6e794
S
225 'skip_download': True,
226 },
b72305f0
J
227 }, {
228 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
229 'note': 'Audio',
230 'info_dict': {
231 'id': 'm0007jz9',
232 'ext': 'mp4',
233 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
234 'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
235 'duration': 9840,
236 },
237 'params': {
238 # rtmp download
239 'skip_download': True,
240 }
31763975
S
241 }, {
242 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
243 'only_matching': True,
c7e67594
S
244 }, {
245 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
246 'only_matching': True,
0692ef86
S
247 }, {
248 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
249 'only_matching': True,
f20a11ed
S
250 }, {
251 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
252 'only_matching': True,
72d256c4
S
253 }, {
254 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
255 'only_matching': True,
53647dfd
S
256 }, {
257 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
258 'only_matching': True,
6f356cbb
S
259 }, {
260 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
261 'only_matching': True,
262 }, {
263 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
264 'only_matching': True,
72d256c4 265 }]
2e3fd9ec 266
97067db2
S
267 def _login(self):
268 username, password = self._get_login_info()
269 if username is None:
270 return
271
272 login_page = self._download_webpage(
273 self._LOGIN_URL, None, 'Downloading signin page')
274
275 login_form = self._hidden_inputs(login_page)
276
277 login_form.update({
278 'username': username,
279 'password': password,
280 })
281
282 post_url = urljoin(self._LOGIN_URL, self._search_regex(
283 r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
284 'post url', default=self._LOGIN_URL, group='url'))
285
286 response, urlh = self._download_webpage_handle(
287 post_url, None, 'Logging in', data=urlencode_postdata(login_form),
288 headers={'Referer': self._LOGIN_URL})
289
290 if self._LOGIN_URL in urlh.geturl():
291 error = clean_html(get_element_by_class('form-message', response))
292 if error:
293 raise ExtractorError(
294 'Unable to login: %s' % error, expected=True)
295 raise ExtractorError('Unable to log in')
296
297 def _real_initialize(self):
298 self._login()
299
d12a1a47
S
class MediaSelectionError(Exception):
    """Error reported by the media selector.

    ``id`` holds the error value taken from the media selection response
    (compared elsewhere against codes such as 'geolocation').
    """

    def __init__(self, id):
        # Initialise the base class as well so that str(e), e.args and
        # tracebacks carry the error code instead of being empty.
        Exception.__init__(self, id)
        self.id = id
303
2e3fd9ec
S
304 def _extract_asx_playlist(self, connection, programme_id):
305 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
306 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
307
2e3fd9ec 308 def _extract_items(self, playlist):
e6174ee9
S
309 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
310
2e3fd9ec 311 def _extract_medias(self, media_selection):
29f7c58a 312 error = media_selection.get('result')
313 if error:
314 raise BBCCoUkIE.MediaSelectionError(error)
315 return media_selection.get('media') or []
2e3fd9ec
S
316
317 def _extract_connections(self, media):
29f7c58a 318 return media.get('connection') or []
2e3fd9ec 319
f13b1e7d 320 def _get_subtitles(self, media, programme_id):
2e3fd9ec
S
321 subtitles = {}
322 for connection in self._extract_connections(media):
f0228f56
S
323 cc_url = url_or_none(connection.get('href'))
324 if not cc_url:
325 continue
326 captions = self._download_xml(
327 cc_url, programme_id, 'Downloading captions', fatal=False)
ee0ba927 328 if not isinstance(captions, compat_etree_Element):
f0228f56 329 continue
29f7c58a 330 subtitles['en'] = [
f13b1e7d
JMF
331 {
332 'url': connection.get('href'),
333 'ext': 'ttml',
334 },
f13b1e7d 335 ]
29f7c58a 336 break
2e3fd9ec 337 return subtitles
082c6c86 338
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Re-raise a media selection error as an expected ExtractorError whose
    message names this extractor and the error id."""
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
343
c056efa2 344 def _download_media_selector(self, programme_id):
d12a1a47 345 last_exception = None
29f7c58a 346 for media_set in self._MEDIA_SETS:
d12a1a47
S
347 try:
348 return self._download_media_selector_url(
29f7c58a 349 self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
d12a1a47 350 except BBCCoUkIE.MediaSelectionError as e:
d781e293 351 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
d12a1a47
S
352 last_exception = e
353 continue
354 self._raise_extractor_error(e)
355 self._raise_extractor_error(last_exception)
9afa1770
S
356
357 def _download_media_selector_url(self, url, programme_id=None):
29f7c58a 358 media_selection = self._download_json(
359 url, programme_id, 'Downloading media selection JSON',
9283d4ea 360 expected_status=(403, 404))
9afa1770 361 return self._process_media_selector(media_selection, programme_id)
082c6c86 362
9afa1770 363 def _process_media_selector(self, media_selection, programme_id):
082c6c86 364 formats = []
2e3fd9ec 365 subtitles = None
b0af1215 366 urls = []
2e3fd9ec 367
c056efa2
S
368 for media in self._extract_medias(media_selection):
369 kind = media.get('kind')
a7e5f274
RA
370 if kind in ('video', 'audio'):
371 bitrate = int_or_none(media.get('bitrate'))
372 encoding = media.get('encoding')
a7e5f274
RA
373 width = int_or_none(media.get('width'))
374 height = int_or_none(media.get('height'))
375 file_size = int_or_none(media.get('media_file_size'))
376 for connection in self._extract_connections(media):
b0af1215
RA
377 href = connection.get('href')
378 if href in urls:
379 continue
380 if href:
381 urls.append(href)
a7e5f274
RA
382 conn_kind = connection.get('kind')
383 protocol = connection.get('protocol')
384 supplier = connection.get('supplier')
a7e5f274
RA
385 transfer_format = connection.get('transferFormat')
386 format_id = supplier or conn_kind or protocol
a7e5f274
RA
387 # ASX playlist
388 if supplier == 'asx':
389 for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
390 formats.append({
391 'url': ref,
392 'format_id': 'ref%s_%s' % (i, format_id),
393 })
394 elif transfer_format == 'dash':
395 formats.extend(self._extract_mpd_formats(
396 href, programme_id, mpd_id=format_id, fatal=False))
397 elif transfer_format == 'hls':
398 formats.extend(self._extract_m3u8_formats(
399 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
400 m3u8_id=format_id, fatal=False))
401 elif transfer_format == 'hds':
402 formats.extend(self._extract_f4m_formats(
403 href, programme_id, f4m_id=format_id, fatal=False))
404 else:
29f7c58a 405 if not supplier and bitrate:
aaa42cf0 406 format_id += '-%d' % bitrate
a7e5f274
RA
407 fmt = {
408 'format_id': format_id,
409 'filesize': file_size,
410 }
411 if kind == 'video':
412 fmt.update({
413 'width': width,
414 'height': height,
6240925b 415 'tbr': bitrate,
a7e5f274
RA
416 'vcodec': encoding,
417 })
418 else:
419 fmt.update({
420 'abr': bitrate,
421 'acodec': encoding,
422 'vcodec': 'none',
423 })
1af959ef 424 if protocol in ('http', 'https'):
a7e5f274
RA
425 # Direct link
426 fmt.update({
427 'url': href,
428 })
429 elif protocol == 'rtmp':
430 application = connection.get('application', 'ondemand')
431 auth_string = connection.get('authString')
432 identifier = connection.get('identifier')
433 server = connection.get('server')
434 fmt.update({
435 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
436 'play_path': identifier,
437 'app': '%s?%s' % (application, auth_string),
438 'page_url': 'http://www.bbc.co.uk',
439 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
440 'rtmp_live': False,
441 'ext': 'flv',
442 })
964744af
S
443 else:
444 continue
a7e5f274 445 formats.append(fmt)
c056efa2 446 elif kind == 'captions':
f13b1e7d 447 subtitles = self.extract_subtitles(media, programme_id)
c056efa2 448 return formats, subtitles
2e3fd9ec 449
ae6986fb
S
450 def _download_playlist(self, playlist_id):
451 try:
452 playlist = self._download_json(
453 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
454 playlist_id, 'Downloading playlist JSON')
455
456 version = playlist.get('defaultAvailableVersion')
457 if version:
458 smp_config = version['smpConfig']
459 title = smp_config['title']
460 description = smp_config['summary']
461 for item in smp_config['items']:
462 kind = item['kind']
40fcba5e 463 if kind not in ('programme', 'radioProgramme'):
ae6986fb
S
464 continue
465 programme_id = item.get('vpid')
d97f5cd7 466 duration = int_or_none(item.get('duration'))
ae6986fb
S
467 formats, subtitles = self._download_media_selector(programme_id)
468 return programme_id, title, description, duration, formats, subtitles
469 except ExtractorError as ee:
f813928e 470 if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
ae6986fb
S
471 raise
472
473 # fallback to legacy playlist
9afa1770
S
474 return self._process_legacy_playlist(playlist_id)
475
476 def _process_legacy_playlist_url(self, url, display_id):
477 playlist = self._download_legacy_playlist_url(url, display_id)
478 return self._extract_from_legacy_playlist(playlist, display_id)
479
480 def _process_legacy_playlist(self, playlist_id):
481 return self._process_legacy_playlist_url(
482 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
483
484 def _download_legacy_playlist_url(self, url, playlist_id=None):
485 return self._download_xml(
486 url, playlist_id, 'Downloading legacy playlist XML')
ae6986fb 487
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme information from a parsed legacy (EMP) playlist.

    Returns ``(programme_id, title, description, duration, formats,
    subtitles)`` for the 'programme'/'radioProgramme' items of *playlist*
    (the last such item wins).

    Raises an expected ExtractorError when the playlist carries a
    ``noItems`` element, and an ExtractorError when it contains no
    programme item at all.
    """
    ns = self._EMP_PLAYLIST_NS

    no_items = playlist.find('./{%s}noItems' % ns)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First attribute value that looks like a programme id, if any.
        for p in ('identifier', 'group'):
            value = element.get(p)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value

    def get_programme_id(item):
        # Prefer the item's own attributes (this lookup's result used to
        # be computed and then thrown away), then those of its mediator.
        programme_id = get_from_attributes(item)
        if programme_id:
            return programme_id
        mediator = item.find('./{%s}mediator' % ns)
        if mediator is not None:
            return get_from_attributes(mediator)

    result = None
    for item in self._extract_items(playlist):
        kind = item.get('kind')
        if kind not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % ns).text
        description_el = playlist.find('./{%s}summary' % ns)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No usable id: treat the item itself as the media selection.
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

        result = (programme_id, title, description, duration, formats, subtitles)

    if result is None:
        # Previously this surfaced as an UnboundLocalError at the return.
        raise ExtractorError(
            'Playlist %s does not contain any programme items' % playlist_id)
    return result
531
c056efa2
S
532 def _real_extract(self, url):
533 group_id = self._match_id(url)
534
535 webpage = self._download_webpage(url, group_id, 'Downloading video page')
536
b2ed954f 537 error = self._search_regex(
29f7c58a 538 r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
b2ed954f
S
539 webpage, 'error', default=None)
540 if error:
541 raise ExtractorError(error, expected=True)
542
8683b4d8 543 programme_id = None
679bacf0 544 duration = None
8683b4d8
S
545
546 tviplayer = self._search_regex(
547 r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
548 webpage, 'player', default=None)
549
550 if tviplayer:
551 player = self._parse_json(tviplayer, group_id).get('player', {})
552 duration = int_or_none(player.get('duration'))
553 programme_id = player.get('vpid')
554
555 if not programme_id:
556 programme_id = self._search_regex(
22d7368d 557 r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
8683b4d8 558
c056efa2 559 if programme_id:
c056efa2 560 formats, subtitles = self._download_media_selector(programme_id)
88fb59d9 561 title = self._og_search_title(webpage, default=None) or self._html_search_regex(
50e989e2
S
562 (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
563 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
8683b4d8 564 description = self._search_regex(
a8534274
S
565 (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
566 r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
679bacf0
YCH
567 webpage, 'description', default=None)
568 if not description:
569 description = self._html_search_meta('description', webpage)
c056efa2 570 else:
ae6986fb 571 programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
2e3fd9ec 572
082c6c86
S
573 self._sort_formats(formats)
574
575 return {
2e3fd9ec 576 'id': programme_id,
082c6c86
S
577 'title': title,
578 'description': description,
650cfd0c 579 'thumbnail': self._og_search_thumbnail(webpage, default=None),
082c6c86
S
580 'duration': duration,
581 'formats': formats,
2e3fd9ec 582 'subtitles': subtitles,
5f6a1245 583 }
10273d6e 584
585
9afa1770
S
586class BBCIE(BBCCoUkIE):
587 IE_NAME = 'bbc'
588 IE_DESC = 'BBC'
589 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 590
29f7c58a 591 _MEDIA_SETS = [
592 'mobile-tablet-main',
593 'pc',
d12a1a47 594 ]
10273d6e 595
596 _TESTS = [{
6a747190 597 # article with multiple videos embedded with data-playable containing vpids
10273d6e 598 'url': 'http://www.bbc.com/news/world-europe-32668511',
599 'info_dict': {
600 'id': 'world-europe-32668511',
acc86c9a 601 'title': 'Russia stages massive WW2 parade',
9afa1770 602 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 603 },
604 'playlist_count': 2,
a3bfddfa 605 }, {
6a747190 606 # article with multiple videos embedded with data-playable (more videos)
10273d6e 607 'url': 'http://www.bbc.com/news/business-28299555',
608 'info_dict': {
609 'id': 'business-28299555',
610 'title': 'Farnborough Airshow: Video highlights',
9afa1770 611 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 612 },
613 'playlist_count': 9,
9afa1770 614 'skip': 'Save time',
88ed52ae
S
615 }, {
616 # article with multiple videos embedded with `new SMP()`
6a747190 617 # broken
88ed52ae
S
618 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
619 'info_dict': {
620 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 621 'title': 'BUGGER',
88ed52ae
S
622 },
623 'playlist_count': 18,
a3bfddfa 624 }, {
6a747190 625 # single video embedded with data-playable containing vpid
10273d6e 626 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 627 'info_dict': {
628 'id': 'p02mprgb',
55ebae26 629 'ext': 'mp4',
10273d6e 630 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 631 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 632 'duration': 47,
9afa1770 633 'timestamp': 1427219242,
da92eeae 634 'upload_date': '20150324',
10273d6e 635 },
636 'params': {
9afa1770 637 # rtmp download
10273d6e 638 'skip_download': True,
639 }
a3bfddfa 640 }, {
6a747190
S
641 # article with single video embedded with data-playable containing XML playlist
642 # with direct video links as progressiveDownloadUrl (for now these are extracted)
643 # and playlist with f4m and m3u8 as streamingUrl
de939d89 644 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 645 'info_dict': {
9afa1770 646 'id': '150615_telabyad_kentin_cogu',
de939d89 647 'ext': 'mp4',
ad152e2d 648 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 649 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 650 'timestamp': 1434397334,
da92eeae 651 'upload_date': '20150615',
de939d89 652 },
653 'params': {
654 'skip_download': True,
655 }
c936d8cc 656 }, {
6a747190 657 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 658 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 659 'info_dict': {
9afa1770 660 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 661 'ext': 'mp4',
9afa1770 662 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 663 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 664 'timestamp': 1434713142,
da92eeae 665 'upload_date': '20150619',
de939d89 666 },
667 'params': {
668 'skip_download': True,
669 }
a346b1ff
S
670 }, {
671 # single video from video playlist embedded with vxp-playlist-data JSON
672 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
673 'info_dict': {
674 'id': 'p02w6qjc',
55ebae26 675 'ext': 'mp4',
a346b1ff
S
676 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
677 'duration': 56,
0bc4ee60 678 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
679 },
680 'params': {
681 'skip_download': True,
682 }
9afa1770
S
683 }, {
684 # single video story with digitalData
685 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
686 'info_dict': {
687 'id': 'p02q6gc4',
688 'ext': 'flv',
689 'title': 'Sri Lanka’s spicy secret',
690 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
691 'timestamp': 1437674293,
692 'upload_date': '20150723',
693 },
694 'params': {
695 # rtmp download
696 'skip_download': True,
697 }
698 }, {
699 # single video story without digitalData
700 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
701 'info_dict': {
702 'id': 'p018zqqg',
55ebae26 703 'ext': 'mp4',
9afa1770
S
704 'title': 'Hyundai Santa Fe Sport: Rock star',
705 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
706 'timestamp': 1415867444,
707 'upload_date': '20141113',
9afa1770
S
708 },
709 'params': {
710 # rtmp download
711 'skip_download': True,
712 }
9fb64c04
S
713 }, {
714 # single video embedded with Morph
715 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
716 'info_dict': {
717 'id': 'p041vhd0',
718 'ext': 'mp4',
719 'title': "Nigeria v Japan - Men's First Round",
720 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
721 'duration': 7980,
722 'uploader': 'BBC Sport',
723 'uploader_id': 'bbc_sport',
724 },
725 'params': {
726 # m3u8 download
727 'skip_download': True,
9fb64c04
S
728 },
729 'skip': 'Georestricted to UK',
9afa1770 730 }, {
6a747190 731 # single video with playlist.sxml URL in playlist param
9afa1770
S
732 'url': 'http://www.bbc.com/sport/0/football/33653409',
733 'info_dict': {
734 'id': 'p02xycnp',
55ebae26 735 'ext': 'mp4',
9afa1770 736 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 737 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
738 'duration': 140,
739 },
740 'params': {
741 # rtmp download
742 'skip_download': True,
743 }
b5d48cb1 744 }, {
6a747190 745 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
746 'url': 'http://www.bbc.com/sport/0/football/34475836',
747 'info_dict': {
748 'id': '34475836',
450b233c 749 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 750 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
751 },
752 'playlist_count': 3,
450b233c
S
753 }, {
754 # school report article with single video
755 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
756 'info_dict': {
757 'id': '35744779',
758 'title': 'School which breaks down barriers in Jerusalem',
759 },
760 'playlist_count': 1,
9afa1770
S
761 }, {
762 # single video with playlist URL from weather section
763 'url': 'http://www.bbc.com/weather/features/33601775',
764 'only_matching': True,
765 }, {
766 # custom redirection to www.bbc.com
1bdae7d3 767 # also, video with window.__INITIAL_DATA__
9afa1770 768 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
1bdae7d3 769 'info_dict': {
770 'id': 'p02xzws1',
771 'ext': 'mp4',
772 'title': "Pluto may have 'nitrogen glaciers'",
773 'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
774 'thumbnail': r're:https?://.+/.+\.jpg',
775 'timestamp': 1437785037,
776 'upload_date': '20150725',
777 },
a1cf3e38
S
778 }, {
779 # single video article embedded with data-media-vpid
780 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
781 'only_matching': True,
6d155707
S
782 }, {
783 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
784 'info_dict': {
785 'id': 'p06556y7',
786 'ext': 'mp4',
787 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
788 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd',
789 },
790 'params': {
791 'skip_download': True,
792 }
b96b4be4
RA
793 }, {
794 # window.__PRELOADED_STATE__
795 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
796 'info_dict': {
797 'id': 'b0b9z4vz',
798 'ext': 'mp4',
799 'title': 'Prom 6: An American in Paris and Turangalila',
800 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
801 'uploader': 'Radio 3',
802 'uploader_id': 'bbc_radio_three',
803 },
373941c5
S
804 }, {
805 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
806 'info_dict': {
807 'id': 'p06w9tws',
808 'ext': 'mp4',
809 'title': 'md5:2fabf12a726603193a2879a055f72514',
810 'description': 'Learn English words and phrases from this story',
811 },
812 'add_ie': [BBCCoUkIE.ie_key()],
3721515b 813 }, {
814 # BBC Reel
815 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
816 'info_dict': {
817 'id': 'p07c6sb9',
818 'ext': 'mp4',
819 'title': 'How positive thinking is harming your happiness',
820 'alt_title': 'The downsides of positive thinking',
821 'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
822 'duration': 235,
823 'thumbnail': r're:https?://.+/p07c9dsr.jpg',
824 'upload_date': '20190604',
825 'categories': ['Psychology'],
826 },
10273d6e 827 }]
828
9afa1770
S
@classmethod
def suitable(cls, url):
    """Decline URLs that one of the more specific BBC extractors accepts;
    otherwise defer to the inherited suitability check."""
    EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
    for ie in EXCLUDE_IE:
        if ie.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
834
def _extract_from_media_meta(self, media_meta, video_id):
    """Build a ``(formats, subtitles)`` pair from one media metadata dict.

    Three sources are tried in order: direct ``sourceFiles`` links, a
    media selector lookup keyed by ``externalId``, and finally a legacy
    playlist referenced by ``href``. Two empty lists are returned when
    none of these keys is present.
    """
    # Direct links to media in media metadata (e.g.
    # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
    # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
    source_files = media_meta.get('sourceFiles')
    if source_files:
        direct_formats = []
        for format_id, f in source_files.items():
            # Entries without a URL cannot be downloaded, so drop them
            if not f.get('url'):
                continue
            direct_formats.append({
                'url': f['url'],
                'format_id': format_id,
                'ext': f.get('encoding'),
                'tbr': float_or_none(f.get('bitrate'), 1000),
                'filesize': int_or_none(f.get('filesize')),
            })
        # Direct links carry no subtitle information
        return direct_formats, []

    programme_id = media_meta.get('externalId')
    if programme_id:
        return self._download_media_selector(programme_id)

    # Process playlist.sxml as legacy playlist
    href = media_meta.get('href')
    if not href:
        return [], []

    playlist = self._download_legacy_playlist_url(href)
    # Only the last two members of the 6-tuple are needed here
    _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
    return formats, subtitles
861
baf39a1a
S
def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
    """Turn a playlist.sxml URL into an info dict.

    The legacy playlist at *url* is processed, its formats are sorted,
    and the result is returned as a dict that also carries the
    caller-supplied *timestamp*.
    """
    (programme_id, title, description,
     duration, formats, subtitles) = self._process_legacy_playlist_url(url, playlist_id)
    self._sort_formats(formats)

    info = {
        'id': programme_id,
        'title': title,
        'description': description,
        'duration': duration,
        'timestamp': timestamp,
        'formats': formats,
        'subtitles': subtitles,
    }
    return info
875
def _real_extract(self, url):
    """Extract a video or a playlist of videos from a generic BBC page.

    The page is downloaded once and then probed with a sequence of
    embedding schemes; the first one that yields something wins:

    1. ``playlist.sxml`` embeds and ``data-playable`` attributes
    2. a ``data-pid`` group id (delegated to BBCCoUkIE)
    3. a single vpid referenced in the markup
    4. BBC Reel ``initial-data`` JSON
    5. ``Morph.setPayload`` payloads
    6. ``window.__PRELOADED_STATE__``
    7. ``bbcthreeConfig``
    8. ``window.__INITIAL_DATA__``
    9. ``new SMP(...)`` / ``setPlaylist(...)`` embed URLs
    10. ``data-media-meta``, ``mediaAssetPage.init`` and, as the last
        resort, the mandatory ``vxp-playlist-data`` script.

    Returns either a single video info dict or a playlist result.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title')
    if not playlist_title:
        playlist_title = self._og_search_title(
            webpage, default=None) or self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
        if playlist_title:
            # Drop the trailing " - BBC ..." site suffix
            playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
    if data_playables:
        for _, data_playable_json in data_playables:
            data_playable = self._parse_json(
                unescapeHTML(data_playable_json), playlist_id, fatal=False)
            if not data_playable:
                continue
            settings = data_playable.get('settings', {})
            if settings:
                # data-playable with video vpid in settings.playlistObject.items (e.g.
                # http://www.bbc.com/news/world-us-canada-34473351)
                playlist_object = settings.get('playlistObject', {})
                if playlist_object:
                    items = playlist_object.get('items')
                    if items and isinstance(items, list):
                        title = playlist_object['title']
                        description = playlist_object.get('summary')
                        duration = int_or_none(items[0].get('duration'))
                        programme_id = items[0].get('vpid')
                        formats, subtitles = self._download_media_selector(programme_id)
                        self._sort_formats(formats)
                        entries.append({
                            'id': programme_id,
                            'title': title,
                            'description': description,
                            'timestamp': timestamp,
                            'duration': duration,
                            'formats': formats,
                            'subtitles': subtitles,
                        })
                else:
                    # data-playable without vpid but with a playlist.sxml URLs
                    # in otherSettings.playlist (e.g.
                    # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
                    playlist = data_playable.get('otherSettings', {}).get('playlist', {})
                    if playlist:
                        entry = None
                        for key in ('streaming', 'progressiveDownload'):
                            playlist_url = playlist.get('%sUrl' % key)
                            if not playlist_url:
                                continue
                            try:
                                info = self._extract_from_playlist_sxml(
                                    playlist_url, playlist_id, timestamp)
                                if not entry:
                                    entry = info
                                else:
                                    # Merge the second playlist into the first entry
                                    entry['title'] = info['title']
                                    entry['formats'].extend(info['formats'])
                            except ExtractorError as e:
                                # Some playlist URL may fail with 500, at the same time
                                # the other one may work fine (e.g.
                                # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
                                    continue
                                raise
                        if entry:
                            self._sort_formats(entry['formats'])
                            entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
    group_id = self._search_regex(
        r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
        webpage, 'group id', default=None)
    if group_id:
        return self.url_result(
            'https://www.bbc.co.uk/programmes/%s' % group_id,
            ie=BBCCoUkIE.ie_key())

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        self._sort_formats(formats)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # A non-fatal parse failure yields None, hence the `or {}` guard
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = digital_data.get('page', {}).get('pageInfo', {})
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
    initial_data = self._parse_json(self._html_search_regex(
        r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
        webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
    if initial_data:
        init_data = try_get(
            initial_data, lambda x: x['initData']['items'][0], dict) or {}
        smp_data = init_data.get('smpData') or {}
        clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
        version_id = clip_data.get('versionID')
        if version_id:
            title = smp_data['title']
            formats, subtitles = self._download_media_selector(version_id)
            self._sort_formats(formats)
            image_url = smp_data.get('holdingImageURL')
            display_date = init_data.get('displayDate')
            topic_title = init_data.get('topicTitle')

            return {
                'id': version_id,
                'title': title,
                'formats': formats,
                'alt_title': init_data.get('shortTitle'),
                'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
                'description': smp_data.get('summary') or init_data.get('shortSummary'),
                'upload_date': display_date.replace('-', '') if display_date else None,
                'subtitles': subtitles,
                'duration': int_or_none(clip_data.get('duration')),
                'categories': [topic_title] if topic_title else None,
            }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # Several setPayload calls may be present but the video
    # seems to be always related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    preload_state = self._parse_json(self._search_regex(
        r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
        'preload state', default='{}'), playlist_id, fatal=False)
    if preload_state:
        current_programme = preload_state.get('programmes', {}).get('current') or {}
        programme_id = current_programme.get('id')
        if current_programme and programme_id and current_programme.get('type') == 'playable_item':
            # `or {}` (rather than a .get default) also covers explicit nulls
            title = (current_programme.get('titles') or {}).get('tertiary') or playlist_title
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            synopses = current_programme.get('synopses') or {}
            network = current_programme.get('network') or {}
            duration = int_or_none(
                (current_programme.get('duration') or {}).get('value'))
            thumbnail = None
            image_url = current_programme.get('image_url')
            if image_url:
                thumbnail = image_url.replace('{recipe}', 'raw')
            return {
                'id': programme_id,
                'title': title,
                'description': dict_get(synopses, ('long', 'medium', 'short')),
                'thumbnail': thumbnail,
                'duration': duration,
                'uploader': network.get('short_title'),
                'uploader_id': network.get('id'),
                'formats': formats,
                'subtitles': subtitles,
            }

    bbc3_config = self._parse_json(
        self._search_regex(
            r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
            'bbcthree config', default='{}'),
        playlist_id, transform_source=js_to_json, fatal=False) or {}
    payload = bbc3_config.get('payload') or {}
    if payload:
        clip = payload.get('currentClip') or {}
        clip_vpid = clip.get('vpid')
        clip_title = clip.get('title')
        if clip_vpid and clip_title:
            formats, subtitles = self._download_media_selector(clip_vpid)
            self._sort_formats(formats)
            return {
                'id': clip_vpid,
                'title': clip_title,
                'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
                'description': clip.get('description'),
                'duration': parse_duration(clip.get('duration')),
                'formats': formats,
                'subtitles': subtitles,
            }
        bbc3_playlist = try_get(
            payload, lambda x: x['content']['bbcMedia']['playlist'],
            dict)
        if bbc3_playlist:
            playlist_title = bbc3_playlist.get('title') or playlist_title
            thumbnail = bbc3_playlist.get('holdingImageURL')
            entries = []
            for bbc3_item in bbc3_playlist['items']:
                programme_id = bbc3_item.get('versionID')
                if not programme_id:
                    continue
                formats, subtitles = self._download_media_selector(programme_id)
                self._sort_formats(formats)
                entries.append({
                    'id': programme_id,
                    'title': playlist_title,
                    'thumbnail': thumbnail,
                    'timestamp': timestamp,
                    'formats': formats,
                    'subtitles': subtitles,
                })
            return self.playlist_result(
                entries, playlist_id, playlist_title, playlist_description)

    initial_data = self._parse_json(self._search_regex(
        r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage,
        'preload state', default='{}'), playlist_id, fatal=False)
    if initial_data:
        def parse_media(media):
            # Appends one entry to `entries` per playable item in `media`
            if not media:
                return
            for item in (try_get(media, lambda x: x['media']['items'], list) or []):
                item_id = item.get('id')
                item_title = item.get('title')
                if not (item_id and item_title):
                    continue
                formats, subtitles = self._download_media_selector(item_id)
                self._sort_formats(formats)
                item_desc = None
                blocks = try_get(media, lambda x: x['summary']['blocks'], list)
                if blocks:
                    summary = []
                    for block in blocks:
                        text = try_get(block, lambda x: x['model']['text'], compat_str)
                        if text:
                            summary.append(text)
                    if summary:
                        item_desc = '\n\n'.join(summary)
                item_time = None
                for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
                    if try_get(meta, lambda x: x['label']) == 'Published':
                        item_time = unified_timestamp(meta.get('timestamp'))
                        break
                entries.append({
                    'id': item_id,
                    'title': item_title,
                    'thumbnail': item.get('holdingImageUrl'),
                    'formats': formats,
                    'subtitles': subtitles,
                    'timestamp': item_time,
                    'description': strip_or_none(item_desc),
                })
        for resp in (initial_data.get('data') or {}).values():
            name = resp.get('name')
            if name == 'media-experience':
                parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
            elif name == 'article':
                for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []):
                    if block.get('type') != 'media':
                        continue
                    parse_media(block.get('model'))
        return self.playlist_result(
            entries, playlist_id, playlist_title, playlist_description)

    def extract_all(pattern):
        # Parse every regex match as JSON, silently dropping unparsable ones
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # A non-fatal parse failure yields None; fall through to the
            # next strategy instead of crashing on `.get`
            media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in media_asset_page.get('videos', {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats and not self.get_param('ignore_no_formats'):
            continue
        self._sort_formats(formats)

        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1311
1312
class BBCCoUkArticleIE(InfoExtractor):
    """Playlist extractor for bbc.co.uk/programmes/articles/ pages.

    Every ``<div typeof="Clip" resource="...">`` found on the article page
    becomes one URL entry of the resulting playlist.
    """

    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist of the clips referenced by the article at *url*."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # The description is optional metadata: when the page carries no
        # og:description the lookup yields None, and calling .strip() on it
        # directly would abort the whole extraction.
        description = strip_or_none(self._og_search_description(webpage))

        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1341
1342
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared logic for paginated bbc.co.uk listing pages.

    Subclasses are expected to provide ``_VIDEO_ID_TEMPLATE`` (a regex
    template taking the video id pattern), ``_URL_TEMPLATE`` (a URL template
    taking a video id) and ``_extract_title_and_description(webpage)``.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a URL result for every video id found on each listing page.

        Starts with the already downloaded *webpage* and keeps following the
        "next" pagination link. When *url* carries an explicit ``page`` query
        parameter only that single page is processed.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # The first page is already at hand, so numbering of the pages that
        # still need downloading starts at 2
        for page_num in itertools.count(2):
            for video_id in re.findall(
                    self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # The fourth positional argument is the error note; pass a real
            # message rather than the bare page number
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                'Downloading page %d' % page_num,
                'Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Return the playlist made of all entries reachable from *url*."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
ded7511a
S
1373
1374
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
    """Common machinery for the API-backed iPlayer playlist extractors.

    Concrete subclasses supply ``_PAGE_SIZE``, ``_DESCRIPTION_KEY`` and the
    hooks ``_call_api``, ``_get_elements``, ``_get_episode``,
    ``_get_episode_image``, ``_get_episode_field``, ``_get_playlist_data``
    and ``_get_playlist_title``.
    """

    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    @staticmethod
    def _get_default(episode, key, default_key='default'):
        """Look up ``episode[key][default_key]`` through ``try_get``."""
        return try_get(episode, lambda x: x[key][default_key])

    def _get_description(self, data):
        """Pick a synopsis out of ``data[self._DESCRIPTION_KEY]``."""
        synopses = data.get(self._DESCRIPTION_KEY) or {}
        return dict_get(synopses, ('large', 'medium', 'small'))

    def _fetch_page(self, programme_id, per_page, series_id, page):
        """Yield url-type entries for the zero-based results page *page*."""
        # The API is called with a one-based page number
        api_data = self._call_api(programme_id, per_page, page + 1, series_id)
        for element in self._get_elements(api_data):
            episode = self._get_episode(element)
            episode_id = episode.get('id')
            if not episode_id:
                continue
            image = self._get_episode_image(episode)
            thumbnail = image.replace('{recipe}', 'raw') if image else None
            category = self._get_default(episode, 'labels', 'category')
            yield {
                '_type': 'url',
                'id': episode_id,
                'title': self._get_episode_field(episode, 'subtitle'),
                'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                'thumbnail': thumbnail,
                'description': self._get_description(episode),
                'categories': [category] if category else None,
                'series': self._get_episode_field(episode, 'title'),
                'ie_key': BBCCoUkIE.ie_key(),
            }

    def _real_extract(self, url):
        """Build the playlist for *url*, honouring ``seriesId`` and ``page``."""
        pid = self._match_id(url)
        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        series_id = qs.get('seriesId', [None])[0]
        page = qs.get('page', [None])[0]
        # An explicit ?page= request uses a page size of 36 (presumably the
        # website's own page size - TODO confirm)
        per_page = 36 if page else self._PAGE_SIZE
        fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id)
        if page:
            # Only the requested page; URL pages are one-based
            entries = fetch_page(int(page) - 1)
        else:
            entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)
        # A one-item request is used just to obtain the playlist metadata
        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
        playlist_title = self._get_playlist_title(playlist_data)
        return self.playlist_result(
            entries, pid, playlist_title,
            self._get_description(playlist_data))
1423
1424
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/episodes/`` listing, served by the graph.ibl.api endpoint."""

    IE_NAME = 'bbc.co.uk:iplayer:episodes'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
        },
        'playlist_mincount': 8,
    }, {
        # all seasons
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 10,
    }, {
        # explicit season
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 5,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 37,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 1,
    }]
    _PAGE_SIZE = 100
    _DESCRIPTION_KEY = 'synopsis'

    def _get_episode_image(self, episode):
        """Return the ``default`` variant of the episode's ``image``."""
        return self._get_default(episode, 'image')

    def _get_episode_field(self, episode, field):
        """Return the ``default`` variant of ``episode[field]``."""
        return self._get_default(episode, field)

    @staticmethod
    def _get_elements(data):
        """Return the result list of a programme API response."""
        entities = data['entities']
        return entities['results']

    @staticmethod
    def _get_episode(element):
        """Return the element's ``episode`` dict, or an empty dict."""
        return element.get('episode') or {}

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """POST a query for programme *pid* and return its ``programme`` data.

        ``sliceId`` is only sent when *series_id* is truthy.
        """
        variables = {'id': pid, 'page': page, 'perPage': per_page}
        if series_id:
            variables['sliceId'] = series_id
        request_headers = {
            'Content-Type': 'application/json'
        }
        request_body = json.dumps({
            'id': '5692d93d5aac8d796a0305e895e61551',
            'variables': variables,
        }).encode('utf-8')
        response = self._download_json(
            'https://graph.ibl.api.bbc.co.uk/', pid,
            headers=request_headers, data=request_body)
        return response['data']['programme']

    @staticmethod
    def _get_playlist_data(data):
        """The programme data itself serves as the playlist data."""
        return data

    def _get_playlist_title(self, data):
        """Return the ``default`` variant of the playlist ``title``."""
        return self._get_default(data, 'title')
1512
1513
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/group/`` listing, served by the ibl v1 groups endpoint."""

    IE_NAME = 'bbc.co.uk:iplayer:group'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
    _TESTS = [{
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 47,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 11,
    }]
    _PAGE_SIZE = 200
    _DESCRIPTION_KEY = 'synopses'

    def _get_episode_image(self, episode):
        """Return the ``standard`` variant of the episode's ``images``."""
        return self._get_default(episode, 'images', 'standard')

    def _get_episode_field(self, episode, field):
        """Group episodes expose their fields as plain values."""
        return episode.get(field)

    @staticmethod
    def _get_elements(data):
        """Return the element list of a group API response."""
        return data['elements']

    @staticmethod
    def _get_episode(element):
        """Each element is used as the episode as-is."""
        return element

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Fetch one page of group *pid* and return its ``group_episodes``.

        *series_id* is accepted for signature parity but is not sent.
        """
        api_url = 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid
        paging = {
            'page': page,
            'per_page': per_page,
        }
        response = self._download_json(api_url, pid, query=paging)
        return response['group_episodes']

    @staticmethod
    def _get_playlist_data(data):
        """Playlist metadata lives under the ``group`` key."""
        return data['group']

    def _get_playlist_title(self, data):
        """Return the plain ``title`` value of the group data."""
        return data.get('title')
ded7511a
S
1576
1577
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """bbc.co.uk/programmes ``episodes``, ``broadcasts`` and ``clips`` listings."""

    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return the page's Open Graph ``(title, description)`` pair.

        The title is looked up with ``fatal=False``.
        """
        og_title = self._og_search_title(webpage, fatal=False)
        og_description = self._og_search_description(webpage)
        return og_title, og_description