]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/bbc.py
pull changes from remote master (#190)
[yt-dlp.git] / youtube_dl / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
254e64a2 4import itertools
f0228f56 5import re
082c6c86 6
f13b1e7d 7from .common import InfoExtractor
8683b4d8 8from ..utils import (
97067db2 9 clean_html,
9fb64c04 10 dict_get,
8683b4d8 11 ExtractorError,
9afa1770 12 float_or_none,
97067db2 13 get_element_by_class,
8683b4d8 14 int_or_none,
6d155707 15 js_to_json,
9afa1770
S
16 parse_duration,
17 parse_iso8601,
9fb64c04 18 try_get,
dab062fb 19 unescapeHTML,
f0228f56 20 url_or_none,
97067db2
S
21 urlencode_postdata,
22 urljoin,
8683b4d8 23)
36e6f62c 24from ..compat import (
ee0ba927 25 compat_etree_Element,
36e6f62c 26 compat_HTTPError,
254e64a2 27 compat_urlparse,
36e6f62c 28)
082c6c86 29
d12a1a47 30
f13b1e7d 31class BBCCoUkIE(InfoExtractor):
082c6c86 32 IE_NAME = 'bbc.co.uk'
2e3fd9ec 33 IE_DESC = 'BBC iPlayer'
6f356cbb 34 _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})'
f20a11ed
S
35 _VALID_URL = r'''(?x)
36 https?://
37 (?:www\.)?bbc\.co\.uk/
38 (?:
39 programmes/(?!articles/)|
40 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
72d256c4 41 music/(?:clips|audiovideo/popular)[/#]|
d3d45e0a 42 radio/player/|
b72305f0 43 sounds/play/|
d3d45e0a 44 events/[^/]+/play/[^/]+/
f20a11ed 45 )
ded7511a 46 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
f20a11ed 47 ''' % _ID_REGEX
082c6c86 48
97067db2
S
49 _LOGIN_URL = 'https://account.bbc.com/signin'
50 _NETRC_MACHINE = 'bbc'
51
d12a1a47 52 _MEDIASELECTOR_URLS = [
26ccc68b
S
53 # Provides HQ HLS streams with even better quality than pc mediaset but fails
54 # with geolocation in some cases when it's not even geo restricted at all (e.g.
d781e293 55 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
d1c694ea 56 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
57 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
58 ]
a8b081a0 59
e6174ee9
S
60 _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
61 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
62
63 _NAMESPACES = (
64 _MEDIASELECTION_NS,
65 _EMP_PLAYLIST_NS,
66 )
67
2e3fd9ec
S
68 _TESTS = [
69 {
f2d0fc68 70 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 71 'info_dict': {
f2d0fc68 72 'id': 'b039d07m',
b1ea6802 73 'ext': 'flv',
acc86c9a 74 'title': 'Kaleidoscope, Leonard Cohen',
c4914185 75 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
76 },
77 'params': {
b1ea6802 78 # rtmp download
2e3fd9ec
S
79 'skip_download': True,
80 }
082c6c86 81 },
2e3fd9ec
S
82 {
83 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
84 'info_dict': {
85 'id': 'b00yng1d',
86 'ext': 'flv',
87 'title': 'The Man in Black: Series 3: The Printed Name',
88 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
89 'duration': 1800,
90 },
91 'params': {
92 # rtmp download
93 'skip_download': True,
c7f0177f
S
94 },
95 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
96 },
97 {
98 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
99 'info_dict': {
100 'id': 'b00yng1d',
101 'ext': 'flv',
17968e44 102 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 103 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 104 'duration': 5100,
2e3fd9ec
S
105 },
106 'params': {
107 # rtmp download
108 'skip_download': True,
109 },
b1ea6802 110 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
111 },
112 {
113 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
114 'info_dict': {
115 'id': 'b03k3pb7',
116 'ext': 'flv',
117 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
118 'description': '2. Invasion',
119 'duration': 3600,
120 },
121 'params': {
122 # rtmp download
123 'skip_download': True,
124 },
b1ea6802 125 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
126 }, {
127 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
128 'info_dict': {
129 'id': 'b04v209v',
130 'ext': 'flv',
131 'title': 'Pete Tong, The Essential New Tune Special',
132 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
133 'duration': 10800,
134 },
135 'params': {
136 # rtmp download
137 'skip_download': True,
a3ef0e1c
YCH
138 },
139 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 140 }, {
5aa535c3 141 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
142 'note': 'Audio',
143 'info_dict': {
5aa535c3 144 'id': 'p022h44j',
b1ea6802 145 'ext': 'flv',
5aa535c3
S
146 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
147 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
148 'duration': 227,
c7e67594
S
149 },
150 'params': {
b1ea6802 151 # rtmp download
c7e67594
S
152 'skip_download': True,
153 }
154 }, {
155 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
156 'note': 'Video',
157 'info_dict': {
158 'id': 'p025c103',
b1ea6802 159 'ext': 'flv',
c7e67594
S
160 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
161 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
162 'duration': 226,
163 },
164 'params': {
b1ea6802 165 # rtmp download
c7e67594
S
166 'skip_download': True,
167 }
e68ae99a
S
168 }, {
169 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
170 'info_dict': {
171 'id': 'p02n76xf',
172 'ext': 'flv',
173 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
174 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
175 'duration': 3540,
176 },
177 'params': {
178 # rtmp download
179 'skip_download': True,
180 },
b1ea6802 181 'skip': 'geolocation',
25fa8d66
YCH
182 }, {
183 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
184 'info_dict': {
185 'id': 'b05zmgw1',
186 'ext': 'flv',
187 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
188 'title': 'Royal Academy Summer Exhibition',
189 'duration': 3540,
190 },
191 'params': {
192 # rtmp download
193 'skip_download': True,
194 },
b1ea6802 195 'skip': 'geolocation',
54914380
S
196 }, {
197 # iptv-all mediaset fails with geolocation however there is no geo restriction
198 # for this programme at all
5aa535c3 199 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 200 'info_dict': {
5aa535c3 201 'id': 'b06rkms3',
54914380 202 'ext': 'flv',
5aa535c3
S
203 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
204 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
205 },
206 'params': {
207 # rtmp download
208 'skip_download': True,
209 },
b1ea6802 210 'skip': 'Now it\'s really geo-restricted',
1ac6e794 211 }, {
067aa17e 212 # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
1ac6e794
S
213 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
214 'info_dict': {
215 'id': 'p028bfkj',
b1ea6802 216 'ext': 'flv',
1ac6e794
S
217 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
218 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
219 },
220 'params': {
b1ea6802 221 # rtmp download
1ac6e794
S
222 'skip_download': True,
223 },
b72305f0
J
224 }, {
225 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
226 'note': 'Audio',
227 'info_dict': {
228 'id': 'm0007jz9',
229 'ext': 'mp4',
230 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
231 'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
232 'duration': 9840,
233 },
234 'params': {
235 # rtmp download
236 'skip_download': True,
237 }
31763975
S
238 }, {
239 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
240 'only_matching': True,
c7e67594
S
241 }, {
242 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
243 'only_matching': True,
0692ef86
S
244 }, {
245 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
246 'only_matching': True,
f20a11ed
S
247 }, {
248 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
249 'only_matching': True,
72d256c4
S
250 }, {
251 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
252 'only_matching': True,
53647dfd
S
253 }, {
254 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
255 'only_matching': True,
6f356cbb
S
256 }, {
257 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
258 'only_matching': True,
259 }, {
260 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
261 'only_matching': True,
72d256c4 262 }]
2e3fd9ec 263
97eb9bd2
RA
264 _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
265
97067db2
S
def _login(self):
    """Sign in to the BBC account service if login info is available.

    Returns without doing anything when ``_get_login_info()`` yields no
    username.  Otherwise the sign-in page is fetched, its hidden inputs are
    combined with the credentials and posted to the form's ``action`` URL
    (falling back to ``_LOGIN_URL`` when no action is found).

    Raises:
        ExtractorError: when the URL reached after posting still contains
            ``_LOGIN_URL``; the page's ``form-message`` text is included in
            the message when present.
    """
    username, password = self._get_login_info()
    if username is None:
        return

    signin_page = self._download_webpage(
        self._LOGIN_URL, None, 'Downloading signin page')

    # Start from the form's hidden inputs and add the credentials on top.
    form_data = self._hidden_inputs(signin_page)
    form_data.update({'username': username, 'password': password})

    form_action = self._search_regex(
        r'<form[^>]+action=(["\'])(?P<url>.+?)\1', signin_page,
        'post url', default=self._LOGIN_URL, group='url')
    post_url = urljoin(self._LOGIN_URL, form_action)

    response, urlh = self._download_webpage_handle(
        post_url, None, 'Logging in', data=urlencode_postdata(form_data),
        headers={'Referer': self._LOGIN_URL})

    # Ending up anywhere other than the sign-in URL is treated as success.
    if self._LOGIN_URL not in urlh.geturl():
        return

    error = clean_html(get_element_by_class('form-message', response))
    if error:
        raise ExtractorError('Unable to login: %s' % error, expected=True)
    raise ExtractorError('Unable to log in')
295
296 def _real_initialize(self):
297 self._login()
298
d12a1a47
S
class MediaSelectionError(Exception):
    """Raised when a media selection document contains an ``error`` element.

    The ``id`` attribute stores the value passed to the constructor (the
    ``id`` attribute of the error element at the raise site).
    """

    def __init__(self, id):
        self.id = id
302
2e3fd9ec
S
303 def _extract_asx_playlist(self, connection, programme_id):
304 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
305 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
306
2e3fd9ec 307 def _extract_items(self, playlist):
e6174ee9
S
308 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
309
310 def _findall_ns(self, element, xpath):
311 elements = []
312 for ns in self._NAMESPACES:
313 elements.extend(element.findall(xpath % ns))
314 return elements
2e3fd9ec
S
315
316 def _extract_medias(self, media_selection):
e6174ee9
S
317 error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
318 if error is None:
319 media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
c056efa2 320 if error is not None:
d12a1a47 321 raise BBCCoUkIE.MediaSelectionError(error.get('id'))
e6174ee9 322 return self._findall_ns(media_selection, './{%s}media')
2e3fd9ec
S
323
324 def _extract_connections(self, media):
e6174ee9 325 return self._findall_ns(media, './{%s}connection')
2e3fd9ec 326
f13b1e7d 327 def _get_subtitles(self, media, programme_id):
2e3fd9ec
S
328 subtitles = {}
329 for connection in self._extract_connections(media):
f0228f56
S
330 cc_url = url_or_none(connection.get('href'))
331 if not cc_url:
332 continue
333 captions = self._download_xml(
334 cc_url, programme_id, 'Downloading captions', fatal=False)
ee0ba927 335 if not isinstance(captions, compat_etree_Element):
f0228f56 336 continue
2e3fd9ec 337 lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
f13b1e7d
JMF
338 subtitles[lang] = [
339 {
340 'url': connection.get('href'),
341 'ext': 'ttml',
342 },
f13b1e7d 343 ]
2e3fd9ec 344 return subtitles
082c6c86 345
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Raise an expected ExtractorError naming this extractor and the error id.

    *media_selection_error* must expose an ``id`` attribute.
    """
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
350
c056efa2 351 def _download_media_selector(self, programme_id):
d12a1a47
S
352 last_exception = None
353 for mediaselector_url in self._MEDIASELECTOR_URLS:
354 try:
355 return self._download_media_selector_url(
356 mediaselector_url % programme_id, programme_id)
357 except BBCCoUkIE.MediaSelectionError as e:
d781e293 358 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
d12a1a47
S
359 last_exception = e
360 continue
361 self._raise_extractor_error(e)
362 self._raise_extractor_error(last_exception)
9afa1770
S
363
364 def _download_media_selector_url(self, url, programme_id=None):
9283d4ea
S
365 media_selection = self._download_xml(
366 url, programme_id, 'Downloading media selection XML',
367 expected_status=(403, 404))
9afa1770 368 return self._process_media_selector(media_selection, programme_id)
082c6c86 369
9afa1770 370 def _process_media_selector(self, media_selection, programme_id):
082c6c86 371 formats = []
2e3fd9ec 372 subtitles = None
b0af1215 373 urls = []
2e3fd9ec 374
c056efa2
S
375 for media in self._extract_medias(media_selection):
376 kind = media.get('kind')
a7e5f274
RA
377 if kind in ('video', 'audio'):
378 bitrate = int_or_none(media.get('bitrate'))
379 encoding = media.get('encoding')
380 service = media.get('service')
381 width = int_or_none(media.get('width'))
382 height = int_or_none(media.get('height'))
383 file_size = int_or_none(media.get('media_file_size'))
384 for connection in self._extract_connections(media):
b0af1215
RA
385 href = connection.get('href')
386 if href in urls:
387 continue
388 if href:
389 urls.append(href)
a7e5f274
RA
390 conn_kind = connection.get('kind')
391 protocol = connection.get('protocol')
392 supplier = connection.get('supplier')
a7e5f274
RA
393 transfer_format = connection.get('transferFormat')
394 format_id = supplier or conn_kind or protocol
395 if service:
396 format_id = '%s_%s' % (service, format_id)
397 # ASX playlist
398 if supplier == 'asx':
399 for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
400 formats.append({
401 'url': ref,
402 'format_id': 'ref%s_%s' % (i, format_id),
403 })
404 elif transfer_format == 'dash':
405 formats.extend(self._extract_mpd_formats(
406 href, programme_id, mpd_id=format_id, fatal=False))
407 elif transfer_format == 'hls':
408 formats.extend(self._extract_m3u8_formats(
409 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
410 m3u8_id=format_id, fatal=False))
97eb9bd2
RA
411 if re.search(self._USP_RE, href):
412 usp_formats = self._extract_m3u8_formats(
6b2d8c91 413 re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href),
97eb9bd2
RA
414 programme_id, ext='mp4', entry_protocol='m3u8_native',
415 m3u8_id=format_id, fatal=False)
416 for f in usp_formats:
417 if f.get('height') and f['height'] > 720:
418 continue
419 formats.append(f)
a7e5f274
RA
420 elif transfer_format == 'hds':
421 formats.extend(self._extract_f4m_formats(
422 href, programme_id, f4m_id=format_id, fatal=False))
423 else:
f9622868 424 if not service and not supplier and bitrate:
aaa42cf0 425 format_id += '-%d' % bitrate
a7e5f274
RA
426 fmt = {
427 'format_id': format_id,
428 'filesize': file_size,
429 }
430 if kind == 'video':
431 fmt.update({
432 'width': width,
433 'height': height,
6240925b 434 'tbr': bitrate,
a7e5f274
RA
435 'vcodec': encoding,
436 })
437 else:
438 fmt.update({
439 'abr': bitrate,
440 'acodec': encoding,
441 'vcodec': 'none',
442 })
1af959ef 443 if protocol in ('http', 'https'):
a7e5f274
RA
444 # Direct link
445 fmt.update({
446 'url': href,
447 })
448 elif protocol == 'rtmp':
449 application = connection.get('application', 'ondemand')
450 auth_string = connection.get('authString')
451 identifier = connection.get('identifier')
452 server = connection.get('server')
453 fmt.update({
454 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
455 'play_path': identifier,
456 'app': '%s?%s' % (application, auth_string),
457 'page_url': 'http://www.bbc.co.uk',
458 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
459 'rtmp_live': False,
460 'ext': 'flv',
461 })
964744af
S
462 else:
463 continue
a7e5f274 464 formats.append(fmt)
c056efa2 465 elif kind == 'captions':
f13b1e7d 466 subtitles = self.extract_subtitles(media, programme_id)
c056efa2 467 return formats, subtitles
2e3fd9ec 468
ae6986fb
S
469 def _download_playlist(self, playlist_id):
470 try:
471 playlist = self._download_json(
472 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
473 playlist_id, 'Downloading playlist JSON')
474
475 version = playlist.get('defaultAvailableVersion')
476 if version:
477 smp_config = version['smpConfig']
478 title = smp_config['title']
479 description = smp_config['summary']
480 for item in smp_config['items']:
481 kind = item['kind']
40fcba5e 482 if kind not in ('programme', 'radioProgramme'):
ae6986fb
S
483 continue
484 programme_id = item.get('vpid')
d97f5cd7 485 duration = int_or_none(item.get('duration'))
ae6986fb
S
486 formats, subtitles = self._download_media_selector(programme_id)
487 return programme_id, title, description, duration, formats, subtitles
488 except ExtractorError as ee:
f813928e 489 if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
ae6986fb
S
490 raise
491
492 # fallback to legacy playlist
9afa1770
S
493 return self._process_legacy_playlist(playlist_id)
494
495 def _process_legacy_playlist_url(self, url, display_id):
496 playlist = self._download_legacy_playlist_url(url, display_id)
497 return self._extract_from_legacy_playlist(playlist, display_id)
498
499 def _process_legacy_playlist(self, playlist_id):
500 return self._process_legacy_playlist_url(
501 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
502
503 def _download_legacy_playlist_url(self, url, playlist_id=None):
504 return self._download_xml(
505 url, playlist_id, 'Downloading legacy playlist XML')
ae6986fb 506
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme data from a legacy (EMP namespace) playlist document.

    Returns a 6-tuple ``(programme_id, title, description, duration,
    formats, subtitles)`` for the playlist's ``programme`` /
    ``radioProgramme`` item.  If several items qualify, each one is
    processed and the values of the last one are returned.

    Raises:
        ExtractorError: (expected) when the playlist has a ``noItems``
            element, with a message derived from its ``reason``; and when
            the playlist contains no programme item at all.
    """
    no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First of the 'identifier' / 'group' attributes that looks like a pid.
        for attr in ('identifier', 'group'):
            value = element.get(attr)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value
        return None

    def get_programme_id(item):
        # NOTE(review): the original also called get_from_attributes(item)
        # here but discarded the result, so only the <mediator> child has
        # ever been consulted.  That behaviour is kept as is; confirm whether
        # item-level attributes were meant to take part before using them.
        mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
        if mediator is not None:
            return get_from_attributes(mediator)
        return None

    result = None
    for item in self._extract_items(playlist):
        kind = item.get('kind')
        if kind not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
        description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No usable id: treat the item itself as the media selection.
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

        result = (programme_id, title, description, duration, formats, subtitles)

    if result is None:
        # Previously this fell through to the return statement with unbound
        # locals and crashed with UnboundLocalError.
        raise ExtractorError(
            'No programme items found in playlist %s' % playlist_id)

    return result
550
c056efa2
S
def _real_extract(self, url):
    """Extract a single programme from a bbc.co.uk page.

    The programme id (vpid) is taken from the ``mediator.bind(...)`` player
    JSON when present, otherwise from a ``"vpid"`` entry in the page.  When
    neither yields an id, everything is resolved through
    ``_download_playlist`` using the id matched from the URL.

    Raises:
        ExtractorError: (expected) with the page's own message when an
            ``smp__message delta`` element is found.
    """
    group_id = self._match_id(url)

    webpage = self._download_webpage(url, group_id, 'Downloading video page')

    page_error = self._search_regex(
        r'<div\b[^>]+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<',
        webpage, 'error', default=None)
    if page_error:
        raise ExtractorError(page_error, expected=True)

    programme_id = None
    duration = None

    tviplayer = self._search_regex(
        r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
        webpage, 'player', default=None)
    if tviplayer:
        player = self._parse_json(tviplayer, group_id).get('player', {})
        duration = int_or_none(player.get('duration'))
        programme_id = player.get('vpid')

    if not programme_id:
        programme_id = self._search_regex(
            r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)

    if not programme_id:
        (programme_id, title, description, duration,
         formats, subtitles) = self._download_playlist(group_id)
    else:
        formats, subtitles = self._download_media_selector(programme_id)
        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
            (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
             r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
        # Prefer the on-page synopsis; otherwise use the description meta tag.
        description = self._search_regex(
            (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
             r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
            webpage, 'description', default=None) or self._html_search_meta(
                'description', webpage)

    self._sort_formats(formats)

    thumbnail = self._og_search_thumbnail(webpage, default=None)
    return {
        'id': programme_id,
        'title': title,
        'description': description,
        'thumbnail': thumbnail,
        'duration': duration,
        'formats': formats,
        'subtitles': subtitles,
    }
10273d6e 603
604
9afa1770
S
605class BBCIE(BBCCoUkIE):
606 IE_NAME = 'bbc'
607 IE_DESC = 'BBC'
608 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 609
d12a1a47 610 _MEDIASELECTOR_URLS = [
55ebae26
S
611 # Provides HQ HLS streams but fails with geolocation in some cases when it's
612 # not even geo restricted at all
613 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
614 # Provides more formats, namely direct mp4 links, but fails on some videos with
615 # notukerror for non UK (?) users (e.g.
616 # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
617 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
618 # Provides fewer formats, but works everywhere for everybody (hopefully)
619 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
620 ]
10273d6e 621
622 _TESTS = [{
6a747190 623 # article with multiple videos embedded with data-playable containing vpids
10273d6e 624 'url': 'http://www.bbc.com/news/world-europe-32668511',
625 'info_dict': {
626 'id': 'world-europe-32668511',
acc86c9a 627 'title': 'Russia stages massive WW2 parade',
9afa1770 628 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 629 },
630 'playlist_count': 2,
a3bfddfa 631 }, {
6a747190 632 # article with multiple videos embedded with data-playable (more videos)
10273d6e 633 'url': 'http://www.bbc.com/news/business-28299555',
634 'info_dict': {
635 'id': 'business-28299555',
636 'title': 'Farnborough Airshow: Video highlights',
9afa1770 637 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 638 },
639 'playlist_count': 9,
9afa1770 640 'skip': 'Save time',
88ed52ae
S
641 }, {
642 # article with multiple videos embedded with `new SMP()`
6a747190 643 # broken
88ed52ae
S
644 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
645 'info_dict': {
646 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 647 'title': 'BUGGER',
88ed52ae
S
648 },
649 'playlist_count': 18,
a3bfddfa 650 }, {
6a747190 651 # single video embedded with data-playable containing vpid
10273d6e 652 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 653 'info_dict': {
654 'id': 'p02mprgb',
55ebae26 655 'ext': 'mp4',
10273d6e 656 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 657 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 658 'duration': 47,
9afa1770 659 'timestamp': 1427219242,
da92eeae 660 'upload_date': '20150324',
10273d6e 661 },
662 'params': {
9afa1770 663 # rtmp download
10273d6e 664 'skip_download': True,
665 }
a3bfddfa 666 }, {
6a747190
S
667 # article with single video embedded with data-playable containing XML playlist
668 # with direct video links as progressiveDownloadUrl (for now these are extracted)
669 # and playlist with f4m and m3u8 as streamingUrl
de939d89 670 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 671 'info_dict': {
9afa1770 672 'id': '150615_telabyad_kentin_cogu',
de939d89 673 'ext': 'mp4',
ad152e2d 674 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 675 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 676 'timestamp': 1434397334,
da92eeae 677 'upload_date': '20150615',
de939d89 678 },
679 'params': {
680 'skip_download': True,
681 }
c936d8cc 682 }, {
6a747190 683 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 684 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 685 'info_dict': {
9afa1770 686 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 687 'ext': 'mp4',
9afa1770 688 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 689 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 690 'timestamp': 1434713142,
da92eeae 691 'upload_date': '20150619',
de939d89 692 },
693 'params': {
694 'skip_download': True,
695 }
a346b1ff
S
696 }, {
697 # single video from video playlist embedded with vxp-playlist-data JSON
698 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
699 'info_dict': {
700 'id': 'p02w6qjc',
55ebae26 701 'ext': 'mp4',
a346b1ff
S
702 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
703 'duration': 56,
0bc4ee60 704 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
705 },
706 'params': {
707 'skip_download': True,
708 }
9afa1770
S
709 }, {
710 # single video story with digitalData
711 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
712 'info_dict': {
713 'id': 'p02q6gc4',
714 'ext': 'flv',
715 'title': 'Sri Lanka’s spicy secret',
716 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
717 'timestamp': 1437674293,
718 'upload_date': '20150723',
719 },
720 'params': {
721 # rtmp download
722 'skip_download': True,
723 }
724 }, {
725 # single video story without digitalData
726 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
727 'info_dict': {
728 'id': 'p018zqqg',
55ebae26 729 'ext': 'mp4',
9afa1770
S
730 'title': 'Hyundai Santa Fe Sport: Rock star',
731 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
732 'timestamp': 1415867444,
733 'upload_date': '20141113',
9afa1770
S
734 },
735 'params': {
736 # rtmp download
737 'skip_download': True,
738 }
9fb64c04
S
739 }, {
740 # single video embedded with Morph
741 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
742 'info_dict': {
743 'id': 'p041vhd0',
744 'ext': 'mp4',
745 'title': "Nigeria v Japan - Men's First Round",
746 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
747 'duration': 7980,
748 'uploader': 'BBC Sport',
749 'uploader_id': 'bbc_sport',
750 },
751 'params': {
752 # m3u8 download
753 'skip_download': True,
9fb64c04
S
754 },
755 'skip': 'Georestricted to UK',
9afa1770 756 }, {
6a747190 757 # single video with playlist.sxml URL in playlist param
9afa1770
S
758 'url': 'http://www.bbc.com/sport/0/football/33653409',
759 'info_dict': {
760 'id': 'p02xycnp',
55ebae26 761 'ext': 'mp4',
9afa1770 762 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 763 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
764 'duration': 140,
765 },
766 'params': {
767 # rtmp download
768 'skip_download': True,
769 }
b5d48cb1 770 }, {
6a747190 771 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
772 'url': 'http://www.bbc.com/sport/0/football/34475836',
773 'info_dict': {
774 'id': '34475836',
450b233c 775 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 776 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
777 },
778 'playlist_count': 3,
450b233c
S
779 }, {
780 # school report article with single video
781 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
782 'info_dict': {
783 'id': '35744779',
784 'title': 'School which breaks down barriers in Jerusalem',
785 },
786 'playlist_count': 1,
9afa1770
S
787 }, {
788 # single video with playlist URL from weather section
789 'url': 'http://www.bbc.com/weather/features/33601775',
790 'only_matching': True,
791 }, {
792 # custom redirection to www.bbc.com
793 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
794 'only_matching': True,
a1cf3e38
S
795 }, {
796 # single video article embedded with data-media-vpid
797 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
798 'only_matching': True,
6d155707
S
799 }, {
800 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
801 'info_dict': {
802 'id': 'p06556y7',
803 'ext': 'mp4',
804 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
805 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd',
806 },
807 'params': {
808 'skip_download': True,
809 }
b96b4be4
RA
810 }, {
811 # window.__PRELOADED_STATE__
812 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
813 'info_dict': {
814 'id': 'b0b9z4vz',
815 'ext': 'mp4',
816 'title': 'Prom 6: An American in Paris and Turangalila',
817 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
818 'uploader': 'Radio 3',
819 'uploader_id': 'bbc_radio_three',
820 },
373941c5
S
821 }, {
822 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
823 'info_dict': {
824 'id': 'p06w9tws',
825 'ext': 'mp4',
826 'title': 'md5:2fabf12a726603193a2879a055f72514',
827 'description': 'Learn English words and phrases from this story',
828 },
829 'add_ie': [BBCCoUkIE.ie_key()],
10273d6e 830 }]
831
9afa1770
S
@classmethod
def suitable(cls, url):
    """Return False for URLs claimed by a more specific BBC extractor.

    Every other URL is decided by the parent class's ``suitable``.
    """
    more_specific = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
    for ie in more_specific:
        if ie.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
837
838 def _extract_from_media_meta(self, media_meta, video_id):
839 # Direct links to media in media metadata (e.g.
840 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
841 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
842 source_files = media_meta.get('sourceFiles')
843 if source_files:
844 return [{
845 'url': f['url'],
846 'format_id': format_id,
847 'ext': f.get('encoding'),
848 'tbr': float_or_none(f.get('bitrate'), 1000),
849 'filesize': int_or_none(f.get('filesize')),
850 } for format_id, f in source_files.items() if f.get('url')], []
851
852 programme_id = media_meta.get('externalId')
853 if programme_id:
854 return self._download_media_selector(programme_id)
855
856 # Process playlist.sxml as legacy playlist
857 href = media_meta.get('href')
858 if href:
859 playlist = self._download_legacy_playlist_url(href)
860 _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
861 return formats, subtitles
862
863 return [], []
864
baf39a1a
S
def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
    """Build an info dict for one playlist.sxml URL.

    The legacy playlist at ``url`` is processed, its formats are sorted,
    and the caller-supplied ``timestamp`` is attached to the result.
    """
    legacy_info = self._process_legacy_playlist_url(url, playlist_id)
    (programme_id, title, description,
     duration, formats, subtitles) = legacy_info
    self._sort_formats(formats)
    info = {
        'id': programme_id,
        'title': title,
        'description': description,
        'duration': duration,
        'timestamp': timestamp,
        'formats': formats,
        'subtitles': subtitles,
    }
    return info
878
def _real_extract(self, url):
    """Extract one video or a playlist of videos from a generic BBC page.

    The page is probed with a sequence of strategies; the first one that
    yields media wins:

    1. playlist.sxml references and ``data-playable`` attributes
    2. a ``data-pid`` video group (delegated to the bbc.co.uk extractor)
    3. a single vpid embedded in the markup
    4. a Morph payload
    5. ``window.__PRELOADED_STATE__``
    6. ``bbcthreeConfig``
    7. SMP / ``setPlaylist`` embed URLs
    8. ``data-media-meta``, ``mediaAssetPage.init`` or vxp playlist data

    Returns an info dict for a single video, a playlist result, or a URL
    result pointing at the bbc.co.uk extractor.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title')
    if not playlist_title:
        playlist_title = self._og_search_title(
            webpage, default=None) or self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
        if playlist_title:
            # Drop the trailing " - BBC ..." site suffix
            playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
    for _, data_playable_json in data_playables:
        data_playable = self._parse_json(
            unescapeHTML(data_playable_json), playlist_id, fatal=False)
        if not data_playable:
            continue
        settings = data_playable.get('settings', {})
        if settings:
            # data-playable with video vpid in settings.playlistObject.items (e.g.
            # http://www.bbc.com/news/world-us-canada-34473351)
            playlist_object = settings.get('playlistObject', {})
            if playlist_object:
                items = playlist_object.get('items')
                if items and isinstance(items, list):
                    title = playlist_object['title']
                    description = playlist_object.get('summary')
                    duration = int_or_none(items[0].get('duration'))
                    programme_id = items[0].get('vpid')
                    formats, subtitles = self._download_media_selector(programme_id)
                    self._sort_formats(formats)
                    entries.append({
                        'id': programme_id,
                        'title': title,
                        'description': description,
                        'timestamp': timestamp,
                        'duration': duration,
                        'formats': formats,
                        'subtitles': subtitles,
                    })
        else:
            # data-playable without vpid but with a playlist.sxml URLs
            # in otherSettings.playlist (e.g.
            # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
            playlist = data_playable.get('otherSettings', {}).get('playlist', {})
            if playlist:
                entry = None
                for key in ('streaming', 'progressiveDownload'):
                    playlist_url = playlist.get('%sUrl' % key)
                    if not playlist_url:
                        continue
                    try:
                        info = self._extract_from_playlist_sxml(
                            playlist_url, playlist_id, timestamp)
                    except ExtractorError as e:
                        # Some playlist URL may fail with 500, at the same time
                        # the other one may work fine (e.g.
                        # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                        # Only ExtractorError is caught because only it is
                        # relied on to carry ``cause``; any other exception
                        # propagates unchanged.
                        if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
                            continue
                        raise
                    if not entry:
                        entry = info
                    else:
                        # Merge the second variant into the first entry
                        entry['title'] = info['title']
                        entry['formats'].extend(info['formats'])
                if entry:
                    self._sort_formats(entry['formats'])
                    entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
    group_id = self._search_regex(
        r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
        webpage, 'group id', default=None)
    # Delegate only when a group id was actually found; testing playlist_id
    # here would always be true and make every strategy below unreachable.
    if group_id:
        return self.url_result(
            'https://www.bbc.co.uk/programmes/%s' % group_id,
            ie=BBCCoUkIE.ie_key())

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        self._sort_formats(formats)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # or unparsable, in which case the non-fatal parse yields nothing
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = digital_data.get('page', {}).get('pageInfo', {})
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # There are several setPayload calls may be present but the video
    # seems to be always related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    preload_state = self._parse_json(self._search_regex(
        r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
        'preload state', default='{}'), playlist_id, fatal=False)
    if preload_state:
        current_programme = preload_state.get('programmes', {}).get('current') or {}
        programme_id = current_programme.get('id')
        if current_programme and programme_id and current_programme.get('type') == 'playable_item':
            # ``or {}`` also covers keys that are present with a null value
            titles = current_programme.get('titles') or {}
            title = titles.get('tertiary') or playlist_title
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            synopses = current_programme.get('synopses') or {}
            network = current_programme.get('network') or {}
            duration = int_or_none(
                (current_programme.get('duration') or {}).get('value'))
            thumbnail = None
            image_url = current_programme.get('image_url')
            if image_url:
                thumbnail = image_url.replace('{recipe}', '1920x1920')
            return {
                'id': programme_id,
                'title': title,
                'description': dict_get(synopses, ('long', 'medium', 'short')),
                'thumbnail': thumbnail,
                'duration': duration,
                'uploader': network.get('short_title'),
                'uploader_id': network.get('id'),
                'formats': formats,
                'subtitles': subtitles,
            }

    bbc3_config = self._parse_json(
        self._search_regex(
            r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
            'bbcthree config', default='{}'),
        playlist_id, transform_source=js_to_json, fatal=False)
    if bbc3_config:
        bbc3_playlist = try_get(
            bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'],
            dict)
        if bbc3_playlist:
            playlist_title = bbc3_playlist.get('title') or playlist_title
            thumbnail = bbc3_playlist.get('holdingImageURL')
            entries = []
            for bbc3_item in bbc3_playlist['items']:
                programme_id = bbc3_item.get('versionID')
                if not programme_id:
                    continue
                formats, subtitles = self._download_media_selector(programme_id)
                self._sort_formats(formats)
                entries.append({
                    'id': programme_id,
                    'title': playlist_title,
                    'thumbnail': thumbnail,
                    'timestamp': timestamp,
                    'formats': formats,
                    'subtitles': subtitles,
                })
            return self.playlist_result(
                entries, playlist_id, playlist_title, playlist_description)

    def extract_all(pattern):
        # Parse every regex match as JSON, dropping the ones that fail to parse
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # The parse is non-fatal, so fall back to an empty dict on failure
            media_asset_page = self._parse_json(
                media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in media_asset_page.get('videos', {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats:
            continue
        self._sort_formats(formats)

        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1217
1218
class BBCCoUkArticleIE(InfoExtractor):
    """Extractor for bbc.co.uk programme articles, returned as a playlist of their clips."""

    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist made of every ``typeof="Clip"`` resource on the article page."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # The description is optional: only strip it when one was found,
        # instead of calling .strip() on a missing value.
        description = self._og_search_description(webpage)
        if description:
            description = description.strip()

        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1247
1248
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared paging logic for bbc.co.uk playlist extractors.

    The code below reads ``_VIDEO_ID_TEMPLATE`` and ``_URL_TEMPLATE`` and
    calls ``_extract_title_and_description(webpage)``; none of them is
    defined in this class.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a URL result for every video id found, following pagination.

        When the URL's query string carries an explicit ``page`` parameter
        only that page is processed.
        """
        query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        only_requested_page = 'page' in query
        # The page that is already downloaded counts as page 1
        next_page_num = 2
        while True:
            video_ids = re.findall(
                self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage)
            for video_id in video_ids:
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if only_requested_page:
                return
            next_page_url = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page_url:
                return
            # NOTE(review): the page number is also passed as the fourth
            # positional argument - confirm which parameter it lands on.
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page_url), playlist_id,
                'Downloading page %d' % next_page_num, next_page_num)
            next_page_num += 1

    def _real_extract(self, url):
        """Download the first page and return a playlist fed by ``_entries``."""
        playlist_id = self._match_id(url)
        first_page = self._download_webpage(url, playlist_id)
        playlist_title, playlist_description = self._extract_title_and_description(first_page)
        playlist_entries = self._entries(first_page, url, playlist_id)
        return self.playlist_result(
            playlist_entries, playlist_id, playlist_title, playlist_description)
ded7511a
S
1279
1280
class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for bbc.co.uk iPlayer ``episodes`` and ``group`` pages."""

    IE_NAME = 'bbc.co.uk:iplayer:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
    # Episode URL built from each video id found on the page
    _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
    # Attribute that carries a video id in the page markup
    _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 6,
        'skip': 'This programme is not currently available on BBC iPlayer',
    }, {
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` from the first ``<h1>`` and the ``subtitle`` paragraph.

        Both lookups are non-fatal.
        """
        return (
            self._search_regex(
                r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False),
            self._search_regex(
                r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
                webpage, 'description', fatal=False, group='value'),
        )
1312
1313
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for bbc.co.uk programme ``episodes``, ``broadcasts`` and ``clips`` pages."""

    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Programme URL built from each video id found on the page
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    # Attribute that carries a video id in the page markup
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` taken from the page's Open Graph metadata."""
        return (
            self._og_search_title(webpage, fatal=False),
            self._og_search_description(webpage),
        )