]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/bbc.py
[bbc] Fix BBC Extractor to work with 'School Report'
[yt-dlp.git] / youtube_dl / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
9afa1770 4import re
082c6c86 5
f13b1e7d 6from .common import InfoExtractor
8683b4d8
S
7from ..utils import (
8 ExtractorError,
9afa1770 9 float_or_none,
8683b4d8 10 int_or_none,
9afa1770
S
11 parse_duration,
12 parse_iso8601,
dab062fb 13 unescapeHTML,
8683b4d8 14)
36e6f62c
JMF
15from ..compat import (
16 compat_etree_fromstring,
17 compat_HTTPError,
18)
082c6c86 19
d12a1a47 20
f13b1e7d 21class BBCCoUkIE(InfoExtractor):
082c6c86 22 IE_NAME = 'bbc.co.uk'
2e3fd9ec 23 IE_DESC = 'BBC iPlayer'
22d7368d 24 _ID_REGEX = r'[pb][\da-z]{7}'
f20a11ed
S
25 _VALID_URL = r'''(?x)
26 https?://
27 (?:www\.)?bbc\.co\.uk/
28 (?:
29 programmes/(?!articles/)|
30 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
31 music/clips[/#]|
32 radio/player/
33 )
34 (?P<id>%s)
35 ''' % _ID_REGEX
082c6c86 36
d12a1a47 37 _MEDIASELECTOR_URLS = [
26ccc68b
S
38 # Provides HQ HLS streams with even better quality than pc mediaset but fails
39 # with geolocation in some cases even when it's not geo restricted at all (e.g.
d781e293 40 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
d1c694ea 41 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
42 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
43 ]
a8b081a0 44
e6174ee9
S
45 _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
46 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
47
48 _NAMESPACES = (
49 _MEDIASELECTION_NS,
50 _EMP_PLAYLIST_NS,
51 )
52
2e3fd9ec
S
53 _TESTS = [
54 {
f2d0fc68 55 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 56 'info_dict': {
f2d0fc68 57 'id': 'b039d07m',
2e3fd9ec 58 'ext': 'flv',
679bacf0 59 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4',
c4914185 60 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
61 },
62 'params': {
63 # rtmp download
64 'skip_download': True,
65 }
082c6c86 66 },
2e3fd9ec
S
67 {
68 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
69 'info_dict': {
70 'id': 'b00yng1d',
71 'ext': 'flv',
72 'title': 'The Man in Black: Series 3: The Printed Name',
73 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
74 'duration': 1800,
75 },
76 'params': {
77 # rtmp download
78 'skip_download': True,
c7f0177f
S
79 },
80 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
81 },
82 {
83 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
84 'info_dict': {
85 'id': 'b00yng1d',
86 'ext': 'flv',
17968e44 87 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 88 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 89 'duration': 5100,
2e3fd9ec
S
90 },
91 'params': {
92 # rtmp download
93 'skip_download': True,
94 },
95 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
96 },
97 {
98 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
99 'info_dict': {
100 'id': 'b03k3pb7',
101 'ext': 'flv',
102 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
103 'description': '2. Invasion',
104 'duration': 3600,
105 },
106 'params': {
107 # rtmp download
108 'skip_download': True,
109 },
110 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
111 }, {
112 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
113 'info_dict': {
114 'id': 'b04v209v',
115 'ext': 'flv',
116 'title': 'Pete Tong, The Essential New Tune Special',
117 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
118 'duration': 10800,
119 },
120 'params': {
121 # rtmp download
122 'skip_download': True,
a3ef0e1c
YCH
123 },
124 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 125 }, {
5aa535c3 126 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
127 'note': 'Audio',
128 'info_dict': {
5aa535c3 129 'id': 'p022h44j',
c7e67594 130 'ext': 'flv',
5aa535c3
S
131 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
132 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
133 'duration': 227,
c7e67594
S
134 },
135 'params': {
136 # rtmp download
137 'skip_download': True,
138 }
139 }, {
140 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
141 'note': 'Video',
142 'info_dict': {
143 'id': 'p025c103',
144 'ext': 'flv',
145 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
146 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
147 'duration': 226,
148 },
149 'params': {
150 # rtmp download
151 'skip_download': True,
152 }
e68ae99a
S
153 }, {
154 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
155 'info_dict': {
156 'id': 'p02n76xf',
157 'ext': 'flv',
158 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
159 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
160 'duration': 3540,
161 },
162 'params': {
163 # rtmp download
164 'skip_download': True,
165 },
166 'skip': 'geolocation',
25fa8d66
YCH
167 }, {
168 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
169 'info_dict': {
170 'id': 'b05zmgw1',
171 'ext': 'flv',
172 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
173 'title': 'Royal Academy Summer Exhibition',
174 'duration': 3540,
175 },
176 'params': {
177 # rtmp download
178 'skip_download': True,
179 },
180 'skip': 'geolocation',
54914380
S
181 }, {
182 # iptv-all mediaset fails with geolocation however there is no geo restriction
183 # for this programme at all
5aa535c3 184 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 185 'info_dict': {
5aa535c3 186 'id': 'b06rkms3',
54914380 187 'ext': 'flv',
5aa535c3
S
188 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
189 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
190 },
191 'params': {
192 # rtmp download
193 'skip_download': True,
194 },
1ac6e794
S
195 }, {
196 # compact player (https://github.com/rg3/youtube-dl/issues/8147)
197 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
198 'info_dict': {
199 'id': 'p028bfkj',
200 'ext': 'flv',
201 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
202 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
203 },
204 'params': {
205 # rtmp download
206 'skip_download': True,
207 },
31763975
S
208 }, {
209 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
210 'only_matching': True,
c7e67594
S
211 }, {
212 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
213 'only_matching': True,
0692ef86
S
214 }, {
215 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
216 'only_matching': True,
f20a11ed
S
217 }, {
218 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
219 'only_matching': True,
ae6986fb 220 }
2e3fd9ec
S
221 ]
222
d12a1a47
S
class MediaSelectionError(Exception):
    """Error reported by a media selector response.

    ``id`` holds the identifier taken from the ``id`` attribute of the
    ``<error>`` element (compared elsewhere against values such as
    'notukerror', 'geolocation' and 'selectionunavailable').
    """

    def __init__(self, id):
        # Initialise the Exception base explicitly so that ``str(e)`` and
        # ``e.args`` carry the identifier instead of being empty. The base
        # class is named directly because a nested class cannot refer to
        # itself by bare name from inside its own methods.
        Exception.__init__(self, id)
        self.id = id
226
2e3fd9ec
S
227 def _extract_asx_playlist(self, connection, programme_id):
228 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
229 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
230
231 def _extract_connection(self, connection, programme_id):
232 formats = []
e6174ee9 233 kind = connection.get('kind')
2e3fd9ec
S
234 protocol = connection.get('protocol')
235 supplier = connection.get('supplier')
236 if protocol == 'http':
237 href = connection.get('href')
7a896817 238 transfer_format = connection.get('transferFormat')
2e3fd9ec
S
239 # ASX playlist
240 if supplier == 'asx':
241 for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
242 formats.append({
243 'url': ref,
244 'format_id': 'ref%s_%s' % (i, supplier),
245 })
7a896817
S
246 # Skip DASH until supported
247 elif transfer_format == 'dash':
248 pass
d1c694ea 249 elif transfer_format == 'hls':
7e5edcfd 250 formats.extend(self._extract_m3u8_formats(
d1c694ea 251 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
7e5edcfd 252 m3u8_id=supplier, fatal=False))
2e3fd9ec
S
253 # Direct link
254 else:
255 formats.append({
256 'url': href,
e6174ee9 257 'format_id': supplier or kind or protocol,
2e3fd9ec
S
258 })
259 elif protocol == 'rtmp':
260 application = connection.get('application', 'ondemand')
261 auth_string = connection.get('authString')
262 identifier = connection.get('identifier')
263 server = connection.get('server')
264 formats.append({
265 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
266 'play_path': identifier,
267 'app': '%s?%s' % (application, auth_string),
268 'page_url': 'http://www.bbc.co.uk',
269 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
270 'rtmp_live': False,
271 'ext': 'flv',
272 'format_id': supplier,
273 })
274 return formats
275
276 def _extract_items(self, playlist):
e6174ee9
S
277 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
278
279 def _findall_ns(self, element, xpath):
280 elements = []
281 for ns in self._NAMESPACES:
282 elements.extend(element.findall(xpath % ns))
283 return elements
2e3fd9ec
S
284
285 def _extract_medias(self, media_selection):
e6174ee9
S
286 error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
287 if error is None:
288 media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
c056efa2 289 if error is not None:
d12a1a47 290 raise BBCCoUkIE.MediaSelectionError(error.get('id'))
e6174ee9 291 return self._findall_ns(media_selection, './{%s}media')
2e3fd9ec
S
292
293 def _extract_connections(self, media):
e6174ee9 294 return self._findall_ns(media, './{%s}connection')
2e3fd9ec
S
295
def _extract_video(self, media, programme_id):
    """Build the format dicts for one video ``media`` element.

    Every format produced from the media's connections receives the
    media-level width, height, bitrate (as ``vbr``), encoding (as
    ``vcodec``) and file size. When the media has a ``service`` attribute
    it is prefixed to each ``format_id``.
    """
    service = media.get('service')
    # Attributes shared by all formats derived from this media element.
    shared_fields = {
        'width': int_or_none(media.get('width')),
        'height': int_or_none(media.get('height')),
        'vbr': int_or_none(media.get('bitrate')),
        'vcodec': media.get('encoding'),
        'filesize': int_or_none(media.get('media_file_size')),
    }

    formats = []
    for connection in self._extract_connections(media):
        for fmt in self._extract_connection(connection, programme_id):
            fmt.update(shared_fields)
            if service:
                fmt['format_id'] = '%s_%s' % (service, fmt['format_id'])
            formats.append(fmt)
    return formats
318
def _extract_audio(self, media, programme_id):
    """Build the format dicts for one audio ``media`` element.

    Each format produced from the media's connections gets the media-level
    bitrate (as ``abr``) and encoding (as ``acodec``), and its ``format_id``
    is prefixed with the media's ``service`` attribute.
    """
    abr = int_or_none(media.get('bitrate'))
    acodec = media.get('encoding')
    service = media.get('service')

    formats = []
    for connection in self._extract_connections(media):
        for fmt in self._extract_connection(connection, programme_id):
            # Unlike the video variant, the service prefix is applied
            # unconditionally here.
            fmt['format_id'] = '%s_%s' % (service, fmt['format_id'])
            fmt['abr'] = abr
            fmt['acodec'] = acodec
            formats.append(fmt)
    return formats
334
f13b1e7d 335 def _get_subtitles(self, media, programme_id):
2e3fd9ec
S
336 subtitles = {}
337 for connection in self._extract_connections(media):
338 captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
339 lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
f13b1e7d
JMF
340 subtitles[lang] = [
341 {
342 'url': connection.get('href'),
343 'ext': 'ttml',
344 },
f13b1e7d 345 ]
2e3fd9ec 346 return subtitles
082c6c86 347
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Re-raise a media selection error as an expected ExtractorError whose
    message names this extractor and the error id."""
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
352
c056efa2 353 def _download_media_selector(self, programme_id):
d12a1a47
S
354 last_exception = None
355 for mediaselector_url in self._MEDIASELECTOR_URLS:
356 try:
357 return self._download_media_selector_url(
358 mediaselector_url % programme_id, programme_id)
359 except BBCCoUkIE.MediaSelectionError as e:
d781e293 360 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
d12a1a47
S
361 last_exception = e
362 continue
363 self._raise_extractor_error(e)
364 self._raise_extractor_error(last_exception)
9afa1770
S
365
366 def _download_media_selector_url(self, url, programme_id=None):
c056efa2
S
367 try:
368 media_selection = self._download_xml(
9afa1770 369 url, programme_id, 'Downloading media selection XML')
c056efa2 370 except ExtractorError as ee:
d781e293 371 if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404):
36e6f62c 372 media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))
2e3fd9ec 373 else:
c056efa2 374 raise
9afa1770 375 return self._process_media_selector(media_selection, programme_id)
082c6c86 376
9afa1770 377 def _process_media_selector(self, media_selection, programme_id):
082c6c86 378 formats = []
2e3fd9ec
S
379 subtitles = None
380
c056efa2
S
381 for media in self._extract_medias(media_selection):
382 kind = media.get('kind')
383 if kind == 'audio':
384 formats.extend(self._extract_audio(media, programme_id))
385 elif kind == 'video':
386 formats.extend(self._extract_video(media, programme_id))
387 elif kind == 'captions':
f13b1e7d 388 subtitles = self.extract_subtitles(media, programme_id)
c056efa2 389 return formats, subtitles
2e3fd9ec 390
ae6986fb
S
def _download_playlist(self, playlist_id):
    """Resolve ``playlist_id`` to programme metadata and formats.

    The JSON playlist at ``/programmes/<id>/playlist.json`` is tried first.
    When it has no ``defaultAvailableVersion`` or the download fails with an
    HTTP 404, the legacy playlist is used instead. Any other ExtractorError
    is re-raised.

    On the JSON path the result is the tuple
    ``(programme_id, title, description, duration, formats, subtitles)``.
    """
    # NOTE(review): the source this was recovered from had lost its
    # indentation. The nesting below (in particular the ``return`` placed
    # after the item loop rather than inside it) is a reconstruction --
    # confirm against the canonical file.
    try:
        playlist = self._download_json(
            'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
            playlist_id, 'Downloading playlist JSON')

        version = playlist.get('defaultAvailableVersion')
        if version:
            smp_config = version['smpConfig']
            title = smp_config['title']
            description = smp_config['summary']
            for item in smp_config['items']:
                kind = item['kind']
                # Only actual programmes carry playable media.
                if kind != 'programme' and kind != 'radioProgramme':
                    continue
                programme_id = item.get('vpid')
                duration = int_or_none(item.get('duration'))
                formats, subtitles = self._download_media_selector(programme_id)
            return programme_id, title, description, duration, formats, subtitles
    except ExtractorError as ee:
        # A 404 means there is no JSON playlist; fall through to the legacy one.
        if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
            raise

    # fallback to legacy playlist
    return self._process_legacy_playlist(playlist_id)
416
417 def _process_legacy_playlist_url(self, url, display_id):
418 playlist = self._download_legacy_playlist_url(url, display_id)
419 return self._extract_from_legacy_playlist(playlist, display_id)
420
421 def _process_legacy_playlist(self, playlist_id):
422 return self._process_legacy_playlist_url(
423 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
424
425 def _download_legacy_playlist_url(self, url, playlist_id=None):
426 return self._download_xml(
427 url, playlist_id, 'Downloading legacy playlist XML')
ae6986fb 428
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme info from an already downloaded legacy playlist.

    Raises an expected ExtractorError when the playlist contains a
    ``noItems`` element, with a message derived from its ``reason``
    attribute. Otherwise items of kind 'programme' or 'radioProgramme' are
    processed and the tuple
    ``(programme_id, title, description, duration, formats, subtitles)``
    is returned.
    """
    # NOTE(review): the source this was recovered from had lost its
    # indentation. The nesting below (in particular the final ``return``
    # placed after the item loop) is a reconstruction -- confirm against
    # the canonical file.
    no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    for item in self._extract_items(playlist):
        kind = item.get('kind')
        if kind != 'programme' and kind != 'radioProgramme':
            continue
        # Title and summary are read from the playlist, not from the item.
        title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
        description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
        description = description_el.text if description_el is not None else None

        def get_programme_id(item):
            # Returns an id taken from the item's ``mediator`` child, or
            # None when there is no mediator or it has no matching attribute.
            def get_from_attributes(item):
                # First of 'identifier' / 'group' that looks like a pid.
                for p in('identifier', 'group'):
                    value = item.get(p)
                    if value and re.match(r'^[pb][\da-z]{7}$', value):
                        return value
            # NOTE(review): the result of this call is discarded, so the
            # item's own attributes never supply the id -- presumably a
            # ``return`` on a match was intended; verify before changing,
            # since it would alter which id is used.
            get_from_attributes(item)
            mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
            if mediator is not None:
                return get_from_attributes(mediator)

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No programme id: treat the item itself as the media selection
            # and fall back to the playlist id as the programme id.
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

    return programme_id, title, description, duration, formats, subtitles
472
c056efa2
S
473 def _real_extract(self, url):
474 group_id = self._match_id(url)
475
476 webpage = self._download_webpage(url, group_id, 'Downloading video page')
477
8683b4d8 478 programme_id = None
679bacf0 479 duration = None
8683b4d8
S
480
481 tviplayer = self._search_regex(
482 r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
483 webpage, 'player', default=None)
484
485 if tviplayer:
486 player = self._parse_json(tviplayer, group_id).get('player', {})
487 duration = int_or_none(player.get('duration'))
488 programme_id = player.get('vpid')
489
490 if not programme_id:
491 programme_id = self._search_regex(
22d7368d 492 r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
8683b4d8 493
c056efa2 494 if programme_id:
c056efa2 495 formats, subtitles = self._download_media_selector(programme_id)
88fb59d9 496 title = self._og_search_title(webpage, default=None) or self._html_search_regex(
50e989e2
S
497 (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
498 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
8683b4d8 499 description = self._search_regex(
a8534274
S
500 (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
501 r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
679bacf0
YCH
502 webpage, 'description', default=None)
503 if not description:
504 description = self._html_search_meta('description', webpage)
c056efa2 505 else:
ae6986fb 506 programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
2e3fd9ec 507
082c6c86
S
508 self._sort_formats(formats)
509
510 return {
2e3fd9ec 511 'id': programme_id,
082c6c86
S
512 'title': title,
513 'description': description,
650cfd0c 514 'thumbnail': self._og_search_thumbnail(webpage, default=None),
082c6c86
S
515 'duration': duration,
516 'formats': formats,
2e3fd9ec 517 'subtitles': subtitles,
5f6a1245 518 }
10273d6e 519
520
9afa1770
S
521class BBCIE(BBCCoUkIE):
522 IE_NAME = 'bbc'
523 IE_DESC = 'BBC'
524 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 525
d12a1a47 526 _MEDIASELECTOR_URLS = [
55ebae26
S
527 # Provides HQ HLS streams but fails with geolocation in some cases even when
528 # it's not geo restricted at all
529 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
530 # Provides more formats, namely direct mp4 links, but fails on some videos with
531 # notukerror for non UK (?) users (e.g.
532 # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
533 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
534 # Provides fewer formats, but works everywhere for everybody (hopefully)
535 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
536 ]
10273d6e 537
538 _TESTS = [{
6a747190 539 # article with multiple videos embedded with data-playable containing vpids
10273d6e 540 'url': 'http://www.bbc.com/news/world-europe-32668511',
541 'info_dict': {
542 'id': 'world-europe-32668511',
543 'title': 'Russia stages massive WW2 parade despite Western boycott',
9afa1770 544 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 545 },
546 'playlist_count': 2,
a3bfddfa 547 }, {
6a747190 548 # article with multiple videos embedded with data-playable (more videos)
10273d6e 549 'url': 'http://www.bbc.com/news/business-28299555',
550 'info_dict': {
551 'id': 'business-28299555',
552 'title': 'Farnborough Airshow: Video highlights',
9afa1770 553 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 554 },
555 'playlist_count': 9,
9afa1770 556 'skip': 'Save time',
88ed52ae
S
557 }, {
558 # article with multiple videos embedded with `new SMP()`
6a747190 559 # broken
88ed52ae
S
560 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
561 'info_dict': {
562 'id': '3662a707-0af9-3149-963f-47bea720b460',
563 'title': 'BBC Blogs - Adam Curtis - BUGGER',
564 },
565 'playlist_count': 18,
a3bfddfa 566 }, {
6a747190 567 # single video embedded with data-playable containing vpid
10273d6e 568 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 569 'info_dict': {
570 'id': 'p02mprgb',
55ebae26 571 'ext': 'mp4',
10273d6e 572 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 573 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 574 'duration': 47,
9afa1770 575 'timestamp': 1427219242,
da92eeae 576 'upload_date': '20150324',
10273d6e 577 },
578 'params': {
9afa1770 579 # rtmp download
10273d6e 580 'skip_download': True,
581 }
a3bfddfa 582 }, {
6a747190
S
583 # article with single video embedded with data-playable containing XML playlist
584 # with direct video links as progressiveDownloadUrl (for now these are extracted)
585 # and playlist with f4m and m3u8 as streamingUrl
de939d89 586 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 587 'info_dict': {
9afa1770 588 'id': '150615_telabyad_kentin_cogu',
de939d89 589 'ext': 'mp4',
9afa1770 590 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
9afa1770 591 'timestamp': 1434397334,
da92eeae 592 'upload_date': '20150615',
de939d89 593 },
594 'params': {
595 'skip_download': True,
596 }
c936d8cc 597 }, {
6a747190 598 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 599 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 600 'info_dict': {
9afa1770 601 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 602 'ext': 'mp4',
9afa1770 603 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
9afa1770 604 'timestamp': 1434713142,
da92eeae 605 'upload_date': '20150619',
de939d89 606 },
607 'params': {
608 'skip_download': True,
609 }
a346b1ff
S
610 }, {
611 # single video from video playlist embedded with vxp-playlist-data JSON
612 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
613 'info_dict': {
614 'id': 'p02w6qjc',
55ebae26 615 'ext': 'mp4',
a346b1ff
S
616 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
617 'duration': 56,
0bc4ee60 618 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
619 },
620 'params': {
621 'skip_download': True,
622 }
9afa1770
S
623 }, {
624 # single video story with digitalData
625 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
626 'info_dict': {
627 'id': 'p02q6gc4',
628 'ext': 'flv',
629 'title': 'Sri Lanka’s spicy secret',
630 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
631 'timestamp': 1437674293,
632 'upload_date': '20150723',
633 },
634 'params': {
635 # rtmp download
636 'skip_download': True,
637 }
638 }, {
639 # single video story without digitalData
640 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
641 'info_dict': {
642 'id': 'p018zqqg',
55ebae26 643 'ext': 'mp4',
9afa1770
S
644 'title': 'Hyundai Santa Fe Sport: Rock star',
645 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
646 'timestamp': 1415867444,
647 'upload_date': '20141113',
9afa1770
S
648 },
649 'params': {
650 # rtmp download
651 'skip_download': True,
652 }
653 }, {
6a747190 654 # single video with playlist.sxml URL in playlist param
9afa1770
S
655 'url': 'http://www.bbc.com/sport/0/football/33653409',
656 'info_dict': {
657 'id': 'p02xycnp',
55ebae26 658 'ext': 'mp4',
9afa1770 659 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 660 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
661 'duration': 140,
662 },
663 'params': {
664 # rtmp download
665 'skip_download': True,
666 }
b5d48cb1 667 }, {
6a747190 668 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
669 'url': 'http://www.bbc.com/sport/0/football/34475836',
670 'info_dict': {
671 'id': '34475836',
672 'title': 'What Liverpool can expect from Klopp',
673 },
674 'playlist_count': 3,
9afa1770
S
675 }, {
676 # single video with playlist URL from weather section
677 'url': 'http://www.bbc.com/weather/features/33601775',
678 'only_matching': True,
679 }, {
680 # custom redirection to www.bbc.com
681 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
682 'only_matching': True,
10273d6e 683 }]
684
9afa1770
S
@classmethod
def suitable(cls, url):
    """Decline URLs that BBCCoUkIE or BBCCoUkArticleIE already claim;
    otherwise defer to the inherited URL matching."""
    if BBCCoUkIE.suitable(url) or BBCCoUkArticleIE.suitable(url):
        return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
688
689 def _extract_from_media_meta(self, media_meta, video_id):
690 # Direct links to media in media metadata (e.g.
691 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
692 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
693 source_files = media_meta.get('sourceFiles')
694 if source_files:
695 return [{
696 'url': f['url'],
697 'format_id': format_id,
698 'ext': f.get('encoding'),
699 'tbr': float_or_none(f.get('bitrate'), 1000),
700 'filesize': int_or_none(f.get('filesize')),
701 } for format_id, f in source_files.items() if f.get('url')], []
702
703 programme_id = media_meta.get('externalId')
704 if programme_id:
705 return self._download_media_selector(programme_id)
706
707 # Process playlist.sxml as legacy playlist
708 href = media_meta.get('href')
709 if href:
710 playlist = self._download_legacy_playlist_url(href)
711 _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
712 return formats, subtitles
713
714 return [], []
715
baf39a1a
S
716 def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
717 programme_id, title, description, duration, formats, subtitles = \
718 self._process_legacy_playlist_url(url, playlist_id)
719 self._sort_formats(formats)
720 return {
721 'id': programme_id,
722 'title': title,
723 'description': description,
724 'duration': duration,
725 'timestamp': timestamp,
726 'formats': formats,
727 'subtitles': subtitles,
728 }
729
10273d6e 730 def _real_extract(self, url):
9afa1770
S
731 playlist_id = self._match_id(url)
732
733 webpage = self._download_webpage(url, playlist_id)
734
350e02d4
YCH
735 json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)
736 timestamp = json_ld_info.get('timestamp')
737 playlist_title = json_ld_info.get('title')
738 playlist_description = json_ld_info.get('description')
ae8bdfd1
S
739
740 if not timestamp:
741 timestamp = parse_iso8601(self._search_regex(
742 [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
743 r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
6f789365 744 r'"datePublished":\s*"([^"]+)'],
ae8bdfd1 745 webpage, 'date', default=None))
9afa1770 746
78f9d843
S
747 entries = []
748
de665713
S
749 # article with multiple videos embedded with playlist.sxml (e.g.
750 # http://www.bbc.com/sport/0/football/34475836)
751 playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
222e11d4 752 playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
de665713 753 if playlists:
baf39a1a
S
754 entries = [
755 self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
756 for playlist_url in playlists]
de939d89 757
78f9d843
S
758 # news article with multiple videos embedded with data-playable
759 data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
760 if data_playables:
761 for _, data_playable_json in data_playables:
762 data_playable = self._parse_json(
763 unescapeHTML(data_playable_json), playlist_id, fatal=False)
764 if not data_playable:
765 continue
baf39a1a
S
766 settings = data_playable.get('settings', {})
767 if settings:
78f9d843
S
768 # data-playable with video vpid in settings.playlistObject.items (e.g.
769 # http://www.bbc.com/news/world-us-canada-34473351)
baf39a1a
S
770 playlist_object = settings.get('playlistObject', {})
771 if playlist_object:
772 items = playlist_object.get('items')
773 if items and isinstance(items, list):
78f9d843
S
774 title = playlist_object['title']
775 description = playlist_object.get('summary')
baf39a1a
S
776 duration = int_or_none(items[0].get('duration'))
777 programme_id = items[0].get('vpid')
78f9d843
S
778 formats, subtitles = self._download_media_selector(programme_id)
779 self._sort_formats(formats)
780 entries.append({
781 'id': programme_id,
782 'title': title,
783 'description': description,
784 'timestamp': timestamp,
785 'duration': duration,
786 'formats': formats,
787 'subtitles': subtitles,
788 })
789 else:
790 # data-playable without vpid but with a playlist.sxml URLs
791 # in otherSettings.playlist (e.g.
792 # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
793 playlist = data_playable.get('otherSettings', {}).get('playlist', {})
794 if playlist:
795 entries.append(self._extract_from_playlist_sxml(
796 playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))
797
8e4aa7bf
BC
798 playlist_title = self._og_search_title(webpage, default=None)
799 playlist_title = playlist_title or self._html_search_regex(
800 r'<title>(.*?)</title>', webpage, 'playlist title')
801
802 playlist_title = self._search_regex(r'(.+)\s*-\s*BBC', playlist_title, 'title', default=playlist_title)
803
804 playlist_description = self._og_search_description(webpage, default=None)
805
78f9d843 806 if entries:
78f9d843
S
807 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
808
809 # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
810 programme_id = self._search_regex(
22d7368d
S
811 [r'data-video-player-vpid="(%s)"' % self._ID_REGEX,
812 r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
813 r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
78f9d843 814 webpage, 'vpid', default=None)
dab062fb 815
9afa1770
S
816 if programme_id:
817 formats, subtitles = self._download_media_selector(programme_id)
818 self._sort_formats(formats)
819 # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
820 digital_data = self._parse_json(
821 self._search_regex(
822 r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
823 programme_id, fatal=False)
824 page_info = digital_data.get('page', {}).get('pageInfo', {})
825 title = page_info.get('pageName') or self._og_search_title(webpage)
826 description = page_info.get('description') or self._og_search_description(webpage)
827 timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
828 return {
829 'id': programme_id,
830 'title': title,
831 'description': description,
832 'timestamp': timestamp,
833 'formats': formats,
834 'subtitles': subtitles,
835 }
a3bfddfa 836
88ed52ae
S
837 def extract_all(pattern):
838 return list(filter(None, map(
839 lambda s: self._parse_json(s, playlist_id, fatal=False),
840 re.findall(pattern, webpage))))
841
842 # Multiple video article (e.g.
843 # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
22d7368d 844 EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
88ed52ae
S
845 entries = []
846 for match in extract_all(r'new\s+SMP\(({.+?})\)'):
847 embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
848 if embed_url and re.match(EMBED_URL, embed_url):
849 entries.append(embed_url)
850 entries.extend(re.findall(
851 r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
852 if entries:
853 return self.playlist_result(
854 [self.url_result(entry, 'BBCCoUk') for entry in entries],
855 playlist_id, playlist_title, playlist_description)
9afa1770
S
856
857 # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
88ed52ae 858 medias = extract_all(r"data-media-meta='({[^']+})'")
9afa1770
S
859
860 if not medias:
861 # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
a346b1ff
S
862 media_asset = self._search_regex(
863 r'mediaAssetPage\.init\(\s*({.+?}), "/',
864 webpage, 'media asset', default=None)
865 if media_asset:
866 media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
867 medias = []
868 for video in media_asset_page.get('videos', {}).values():
869 medias.extend(video.values())
870
871 if not medias:
872 # Multiple video playlist with single `now playing` entry (e.g.
873 # http://www.bbc.com/news/video_and_audio/must_see/33767813)
874 vxp_playlist = self._parse_json(
9afa1770 875 self._search_regex(
a346b1ff
S
876 r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
877 webpage, 'playlist data'),
9afa1770 878 playlist_id)
a346b1ff
S
879 playlist_medias = []
880 for item in vxp_playlist:
881 media = item.get('media')
882 if not media:
883 continue
884 playlist_medias.append(media)
885 # Download single video if found media with asset id matching the video id from URL
886 if item.get('advert', {}).get('assetId') == playlist_id:
887 medias = [media]
888 break
889 # Fallback to the whole playlist
890 if not medias:
891 medias = playlist_medias
9afa1770
S
892
893 entries = []
894 for num, media_meta in enumerate(medias, start=1):
895 formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
896 if not formats:
897 continue
10273d6e 898 self._sort_formats(formats)
899
9afa1770
S
900 video_id = media_meta.get('externalId')
901 if not video_id:
902 video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
903
904 title = media_meta.get('caption')
905 if not title:
906 title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
907
908 duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
da92eeae 909
9afa1770
S
910 images = []
911 for image in media_meta.get('images', {}).values():
912 images.extend(image.values())
913 if 'image' in media_meta:
914 images.append(media_meta['image'])
915
916 thumbnails = [{
917 'url': image.get('href'),
918 'width': int_or_none(image.get('width')),
919 'height': int_or_none(image.get('height')),
920 } for image in images]
921
922 entries.append({
923 'id': video_id,
10273d6e 924 'title': title,
9afa1770 925 'thumbnails': thumbnails,
10273d6e 926 'duration': duration,
9afa1770 927 'timestamp': timestamp,
10273d6e 928 'formats': formats,
929 'subtitles': subtitles,
a3bfddfa 930 })
10273d6e 931
9afa1770 932 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
933
934
class BBCCoUkArticleIE(InfoExtractor):
    """Extract the clips embedded in a bbc.co.uk/programmes/articles/ page.

    The article itself carries no media; each embedded clip is handed off
    as a URL result and the whole article is returned as a playlist.
    """

    # Raw string with escaped dots so '.' only matches a literal dot; accepts
    # both http and https and an optional "www." prefix, consistent with the
    # URL pattern of the main bbc.co.uk extractor in this file.
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist result made of the clips referenced by the article.

        url -- article URL matching _VALID_URL; its ``id`` group becomes the
               playlist id.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        description = self._og_search_description(webpage)
        # Only strip when a description was actually found, so that a page
        # without one does not fail with AttributeError on None.
        if description is not None:
            description = description.strip()

        # Every clip is marked up as <div typeof="Clip" resource="<programme URL>">
        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
962 return self.playlist_result(entries, playlist_id, title, description)