]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/bbc.py
[audiomack] Update the test
[yt-dlp.git] / youtube_dl / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
9afa1770 4import re
082c6c86 5
f13b1e7d 6from .common import InfoExtractor
8683b4d8
S
7from ..utils import (
8 ExtractorError,
9afa1770 9 float_or_none,
8683b4d8 10 int_or_none,
9afa1770
S
11 parse_duration,
12 parse_iso8601,
dab062fb 13 unescapeHTML,
8683b4d8 14)
36e6f62c
JMF
15from ..compat import (
16 compat_etree_fromstring,
17 compat_HTTPError,
18)
082c6c86 19
d12a1a47 20
f13b1e7d 21class BBCCoUkIE(InfoExtractor):
082c6c86 22 IE_NAME = 'bbc.co.uk'
2e3fd9ec 23 IE_DESC = 'BBC iPlayer'
22d7368d 24 _ID_REGEX = r'[pb][\da-z]{7}'
f20a11ed
S
25 _VALID_URL = r'''(?x)
26 https?://
27 (?:www\.)?bbc\.co\.uk/
28 (?:
29 programmes/(?!articles/)|
30 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
31 music/clips[/#]|
32 radio/player/
33 )
34 (?P<id>%s)
35 ''' % _ID_REGEX
082c6c86 36
d12a1a47 37 _MEDIASELECTOR_URLS = [
26ccc68b
S
38 # Provides HQ HLS streams with even better quality than pc mediaset but fails
39 # with geolocation in some cases even when it's not geo restricted at all (e.g.
d781e293 40 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
d1c694ea 41 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
42 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
43 ]
a8b081a0 44
e6174ee9
S
45 _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
46 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
47
48 _NAMESPACES = (
49 _MEDIASELECTION_NS,
50 _EMP_PLAYLIST_NS,
51 )
52
2e3fd9ec
S
53 _TESTS = [
54 {
f2d0fc68 55 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 56 'info_dict': {
f2d0fc68 57 'id': 'b039d07m',
2e3fd9ec 58 'ext': 'flv',
679bacf0 59 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4',
c4914185 60 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
61 },
62 'params': {
63 # rtmp download
64 'skip_download': True,
65 }
082c6c86 66 },
2e3fd9ec
S
67 {
68 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
69 'info_dict': {
70 'id': 'b00yng1d',
71 'ext': 'flv',
72 'title': 'The Man in Black: Series 3: The Printed Name',
73 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
74 'duration': 1800,
75 },
76 'params': {
77 # rtmp download
78 'skip_download': True,
c7f0177f
S
79 },
80 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
81 },
82 {
83 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
84 'info_dict': {
85 'id': 'b00yng1d',
86 'ext': 'flv',
17968e44 87 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 88 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 89 'duration': 5100,
2e3fd9ec
S
90 },
91 'params': {
92 # rtmp download
93 'skip_download': True,
94 },
95 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
96 },
97 {
98 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
99 'info_dict': {
100 'id': 'b03k3pb7',
101 'ext': 'flv',
102 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
103 'description': '2. Invasion',
104 'duration': 3600,
105 },
106 'params': {
107 # rtmp download
108 'skip_download': True,
109 },
110 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
111 }, {
112 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
113 'info_dict': {
114 'id': 'b04v209v',
115 'ext': 'flv',
116 'title': 'Pete Tong, The Essential New Tune Special',
117 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
118 'duration': 10800,
119 },
120 'params': {
121 # rtmp download
122 'skip_download': True,
a3ef0e1c
YCH
123 },
124 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 125 }, {
5aa535c3 126 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
127 'note': 'Audio',
128 'info_dict': {
5aa535c3 129 'id': 'p022h44j',
c7e67594 130 'ext': 'flv',
5aa535c3
S
131 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
132 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
133 'duration': 227,
c7e67594
S
134 },
135 'params': {
136 # rtmp download
137 'skip_download': True,
138 }
139 }, {
140 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
141 'note': 'Video',
142 'info_dict': {
143 'id': 'p025c103',
144 'ext': 'flv',
145 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
146 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
147 'duration': 226,
148 },
149 'params': {
150 # rtmp download
151 'skip_download': True,
152 }
e68ae99a
S
153 }, {
154 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
155 'info_dict': {
156 'id': 'p02n76xf',
157 'ext': 'flv',
158 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
159 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
160 'duration': 3540,
161 },
162 'params': {
163 # rtmp download
164 'skip_download': True,
165 },
166 'skip': 'geolocation',
25fa8d66
YCH
167 }, {
168 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
169 'info_dict': {
170 'id': 'b05zmgw1',
171 'ext': 'flv',
172 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
173 'title': 'Royal Academy Summer Exhibition',
174 'duration': 3540,
175 },
176 'params': {
177 # rtmp download
178 'skip_download': True,
179 },
180 'skip': 'geolocation',
54914380
S
181 }, {
182 # iptv-all mediaset fails with geolocation however there is no geo restriction
183 # for this programme at all
5aa535c3 184 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 185 'info_dict': {
5aa535c3 186 'id': 'b06rkms3',
54914380 187 'ext': 'flv',
5aa535c3
S
188 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
189 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
190 },
191 'params': {
192 # rtmp download
193 'skip_download': True,
194 },
1ac6e794
S
195 }, {
196 # compact player (https://github.com/rg3/youtube-dl/issues/8147)
197 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
198 'info_dict': {
199 'id': 'p028bfkj',
200 'ext': 'flv',
201 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
202 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
203 },
204 'params': {
205 # rtmp download
206 'skip_download': True,
207 },
31763975
S
208 }, {
209 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
210 'only_matching': True,
c7e67594
S
211 }, {
212 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
213 'only_matching': True,
0692ef86
S
214 }, {
215 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
216 'only_matching': True,
f20a11ed
S
217 }, {
218 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
219 'only_matching': True,
ae6986fb 220 }
2e3fd9ec
S
221 ]
222
d12a1a47
S
class MediaSelectionError(Exception):
    """Raised when a media selector response carries an <error> element.

    The error identifier reported by the service (e.g. 'geolocation') is
    kept on the ``id`` attribute and is also used as the exception message
    so that it shows up in ``str()`` and in tracebacks.
    """

    def __init__(self, id):
        # Explicit base call (not super()) keeps this valid on Python 2 for a
        # class that is nested inside another class body.
        Exception.__init__(self, id)
        self.id = id
226
2e3fd9ec
S
227 def _extract_asx_playlist(self, connection, programme_id):
228 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
229 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
230
231 def _extract_connection(self, connection, programme_id):
232 formats = []
e6174ee9 233 kind = connection.get('kind')
2e3fd9ec
S
234 protocol = connection.get('protocol')
235 supplier = connection.get('supplier')
236 if protocol == 'http':
237 href = connection.get('href')
7a896817 238 transfer_format = connection.get('transferFormat')
2e3fd9ec
S
239 # ASX playlist
240 if supplier == 'asx':
241 for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
242 formats.append({
243 'url': ref,
244 'format_id': 'ref%s_%s' % (i, supplier),
245 })
7a896817
S
246 # Skip DASH until supported
247 elif transfer_format == 'dash':
248 pass
d1c694ea 249 elif transfer_format == 'hls':
7e5edcfd 250 formats.extend(self._extract_m3u8_formats(
d1c694ea 251 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
7e5edcfd 252 m3u8_id=supplier, fatal=False))
2e3fd9ec
S
253 # Direct link
254 else:
255 formats.append({
256 'url': href,
e6174ee9 257 'format_id': supplier or kind or protocol,
2e3fd9ec
S
258 })
259 elif protocol == 'rtmp':
260 application = connection.get('application', 'ondemand')
261 auth_string = connection.get('authString')
262 identifier = connection.get('identifier')
263 server = connection.get('server')
264 formats.append({
265 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
266 'play_path': identifier,
267 'app': '%s?%s' % (application, auth_string),
268 'page_url': 'http://www.bbc.co.uk',
269 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
270 'rtmp_live': False,
271 'ext': 'flv',
272 'format_id': supplier,
273 })
274 return formats
275
276 def _extract_items(self, playlist):
e6174ee9
S
277 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
278
279 def _findall_ns(self, element, xpath):
280 elements = []
281 for ns in self._NAMESPACES:
282 elements.extend(element.findall(xpath % ns))
283 return elements
2e3fd9ec
S
284
285 def _extract_medias(self, media_selection):
e6174ee9
S
286 error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
287 if error is None:
288 media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
c056efa2 289 if error is not None:
d12a1a47 290 raise BBCCoUkIE.MediaSelectionError(error.get('id'))
e6174ee9 291 return self._findall_ns(media_selection, './{%s}media')
2e3fd9ec
S
292
293 def _extract_connections(self, media):
e6174ee9 294 return self._findall_ns(media, './{%s}connection')
2e3fd9ec
S
295
def _extract_video(self, media, programme_id):
    """Collect the formats of every connection of a video <media> element.

    Each format is annotated with the width, height, bitrate, encoding and
    file size attributes of *media*.  When *media* has a ``service``
    attribute it is prepended to the format id.
    """
    service = media.get('service')
    media_fields = {
        'width': int_or_none(media.get('width')),
        'height': int_or_none(media.get('height')),
        'vbr': int_or_none(media.get('bitrate')),
        'vcodec': media.get('encoding'),
        'filesize': int_or_none(media.get('media_file_size')),
    }

    video_formats = []
    for conn in self._extract_connections(media):
        for fmt in self._extract_connection(conn, programme_id):
            fmt.update(media_fields)
            if service:
                fmt['format_id'] = '%s_%s' % (service, fmt['format_id'])
            video_formats.append(fmt)
    return video_formats
318
def _extract_audio(self, media, programme_id):
    """Collect the formats of every connection of an audio <media> element.

    Each format is annotated with the audio bitrate and encoding of *media*
    and marked as having no video stream.  As in _extract_video, the
    ``service`` attribute is prepended to the format id only when present;
    previously a missing attribute produced ids such as 'None_<supplier>'.
    """
    service = media.get('service')
    media_fields = {
        'abr': int_or_none(media.get('bitrate')),
        'acodec': media.get('encoding'),
        'vcodec': 'none',
    }

    audio_formats = []
    for connection in self._extract_connections(media):
        for fmt in self._extract_connection(connection, programme_id):
            fmt.update(media_fields)
            if service:
                fmt['format_id'] = '%s_%s' % (service, fmt['format_id'])
            audio_formats.append(fmt)
    return audio_formats
335
f13b1e7d 336 def _get_subtitles(self, media, programme_id):
2e3fd9ec
S
337 subtitles = {}
338 for connection in self._extract_connections(media):
339 captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
340 lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
f13b1e7d
JMF
341 subtitles[lang] = [
342 {
343 'url': connection.get('href'),
344 'ext': 'ttml',
345 },
f13b1e7d 346 ]
2e3fd9ec 347 return subtitles
082c6c86 348
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Re-raise a media selection error as an expected ExtractorError."""
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
353
def _download_media_selector(self, programme_id):
    """Try each media selector URL in turn and return the first result.

    A selector that fails with 'notukerror', 'geolocation' or
    'selectionunavailable' is skipped in favour of the next one; any other
    media selection error is reported immediately.  When every selector was
    skipped, the last such error is reported.
    """
    skippable_ids = ('notukerror', 'geolocation', 'selectionunavailable')
    last_error = None
    for url_template in self._MEDIASELECTOR_URLS:
        try:
            return self._download_media_selector_url(
                url_template % programme_id, programme_id)
        except BBCCoUkIE.MediaSelectionError as err:
            if err.id in skippable_ids:
                # Keep a reference: the except target is unbound on exit.
                last_error = err
            else:
                self._raise_extractor_error(err)
    self._raise_extractor_error(last_error)
9afa1770
S
366
def _download_media_selector_url(self, url, programme_id=None):
    """Download and process the media selection XML found at *url*.

    HTTP 403 and 404 responses are not treated as failures: their body is
    parsed as the media selection document instead, so that it goes through
    the same processing as a successful response.  Any other download error
    propagates unchanged.
    """
    try:
        media_selection = self._download_xml(
            url, programme_id, 'Downloading media selection XML')
    except ExtractorError as ee:
        cause = ee.cause
        has_xml_body = isinstance(cause, compat_HTTPError) and cause.code in (403, 404)
        if not has_xml_body:
            raise
        media_selection = compat_etree_fromstring(cause.read().decode('utf-8'))
    return self._process_media_selector(media_selection, programme_id)
082c6c86 377
9afa1770 378 def _process_media_selector(self, media_selection, programme_id):
082c6c86 379 formats = []
2e3fd9ec
S
380 subtitles = None
381
c056efa2
S
382 for media in self._extract_medias(media_selection):
383 kind = media.get('kind')
384 if kind == 'audio':
385 formats.extend(self._extract_audio(media, programme_id))
386 elif kind == 'video':
387 formats.extend(self._extract_video(media, programme_id))
388 elif kind == 'captions':
f13b1e7d 389 subtitles = self.extract_subtitles(media, programme_id)
c056efa2 390 return formats, subtitles
2e3fd9ec 391
ae6986fb
S
def _download_playlist(self, playlist_id):
    """Resolve *playlist_id* to programme metadata, formats and subtitles.

    Tries the JSON playlist at /programmes/<id>/playlist.json first and
    returns a ``(programme_id, title, description, duration, formats,
    subtitles)`` tuple built from its default available version.  When the
    playlist has no default available version, or the JSON request fails
    with HTTP 404, the legacy XML playlist is used instead.  Any other
    ExtractorError propagates.
    """
    try:
        playlist = self._download_json(
            'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
            playlist_id, 'Downloading playlist JSON')

        version = playlist.get('defaultAvailableVersion')
        if version:
            smp_config = version['smpConfig']
            title = smp_config['title']
            description = smp_config['summary']
            for item in smp_config['items']:
                kind = item['kind']
                # Only actual programmes carry playable media
                if kind != 'programme' and kind != 'radioProgramme':
                    continue
                programme_id = item.get('vpid')
                duration = int_or_none(item.get('duration'))
                formats, subtitles = self._download_media_selector(programme_id)
            # NOTE(review): nesting reconstructed from flattened text; this
            # return is assumed to follow the loop, in which case the last
            # matching item wins and a playlist without any matching item
            # leaves programme_id/formats unbound here - confirm.
            return programme_id, title, description, duration, formats, subtitles
    except ExtractorError as ee:
        # A 404 only means there is no JSON playlist for this id
        if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
            raise

    # fallback to legacy playlist
    return self._process_legacy_playlist(playlist_id)
417
418 def _process_legacy_playlist_url(self, url, display_id):
419 playlist = self._download_legacy_playlist_url(url, display_id)
420 return self._extract_from_legacy_playlist(playlist, display_id)
421
422 def _process_legacy_playlist(self, playlist_id):
423 return self._process_legacy_playlist_url(
424 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
425
426 def _download_legacy_playlist_url(self, url, playlist_id=None):
427 return self._download_xml(
428 url, playlist_id, 'Downloading legacy playlist XML')
ae6986fb 429
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme metadata and formats from a legacy XML playlist.

    Returns a ``(programme_id, title, description, duration, formats,
    subtitles)`` tuple.  Raises an expected ExtractorError when the playlist
    carries a <noItems> element, with a message derived from its ``reason``
    attribute.

    The programme id is looked up on the item itself first and then on its
    <mediator> child.  Previously the item lookup result was discarded, so
    an id present only on the item was never used.
    """
    def get_from_attributes(element):
        # Only values that look like a BBC pid/vpid are accepted
        for attr in ('identifier', 'group'):
            value = element.get(attr)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value

    def get_programme_id(item):
        programme_id = get_from_attributes(item)
        if programme_id:
            return programme_id
        mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
        if mediator is not None:
            return get_from_attributes(mediator)

    no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    for item in self._extract_items(playlist):
        kind = item.get('kind')
        if kind != 'programme' and kind != 'radioProgramme':
            continue
        title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
        description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No usable id: the item itself embeds the media elements
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

    # NOTE(review): nesting reconstructed from flattened text; this return is
    # assumed to follow the loop (last matching item wins) - confirm.
    return programme_id, title, description, duration, formats, subtitles
473
c056efa2
S
def _real_extract(self, url):
    """Extract a single programme from a bbc.co.uk programme or iPlayer page.

    The vpid is taken from the embedded ``mediator.bind(...)`` player JSON
    or, failing that, from a ``"vpid"`` key anywhere in the page.  With a
    vpid the formats come from the media selector and the metadata from the
    page; without one everything comes from the playlist of the id in the
    URL.
    """
    group_id = self._match_id(url)
    webpage = self._download_webpage(url, group_id, 'Downloading video page')

    duration = None
    programme_id = None

    player_json = self._search_regex(
        r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
        webpage, 'player', default=None)
    if player_json:
        player = self._parse_json(player_json, group_id).get('player', {})
        duration = int_or_none(player.get('duration'))
        programme_id = player.get('vpid')

    if not programme_id:
        programme_id = self._search_regex(
            r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)

    if not programme_id:
        (programme_id, title, description,
         duration, formats, subtitles) = self._download_playlist(group_id)
    else:
        formats, subtitles = self._download_media_selector(programme_id)
        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
            (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
             r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
        description = self._search_regex(
            (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
             r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
            webpage, 'description', default=None)
        if not description:
            description = self._html_search_meta('description', webpage)

    self._sort_formats(formats)

    return {
        'id': programme_id,
        'title': title,
        'description': description,
        'thumbnail': self._og_search_thumbnail(webpage, default=None),
        'duration': duration,
        'formats': formats,
        'subtitles': subtitles,
    }
10273d6e 520
521
9afa1770
S
522class BBCIE(BBCCoUkIE):
523 IE_NAME = 'bbc'
524 IE_DESC = 'BBC'
525 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 526
d12a1a47 527 _MEDIASELECTOR_URLS = [
55ebae26
S
528 # Provides HQ HLS streams but fails with geolocation in some cases even when
529 # the programme is not geo restricted at all
530 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
531 # Provides more formats, namely direct mp4 links, but fails on some videos with
532 # notukerror for non UK (?) users (e.g.
533 # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
534 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
535 # Provides fewer formats, but works everywhere for everybody (hopefully)
536 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
537 ]
10273d6e 538
539 _TESTS = [{
6a747190 540 # article with multiple videos embedded with data-playable containing vpids
10273d6e 541 'url': 'http://www.bbc.com/news/world-europe-32668511',
542 'info_dict': {
543 'id': 'world-europe-32668511',
544 'title': 'Russia stages massive WW2 parade despite Western boycott',
9afa1770 545 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 546 },
547 'playlist_count': 2,
a3bfddfa 548 }, {
6a747190 549 # article with multiple videos embedded with data-playable (more videos)
10273d6e 550 'url': 'http://www.bbc.com/news/business-28299555',
551 'info_dict': {
552 'id': 'business-28299555',
553 'title': 'Farnborough Airshow: Video highlights',
9afa1770 554 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 555 },
556 'playlist_count': 9,
9afa1770 557 'skip': 'Save time',
88ed52ae
S
558 }, {
559 # article with multiple videos embedded with `new SMP()`
6a747190 560 # broken
88ed52ae
S
561 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
562 'info_dict': {
563 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 564 'title': 'BUGGER',
88ed52ae
S
565 },
566 'playlist_count': 18,
a3bfddfa 567 }, {
6a747190 568 # single video embedded with data-playable containing vpid
10273d6e 569 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 570 'info_dict': {
571 'id': 'p02mprgb',
55ebae26 572 'ext': 'mp4',
10273d6e 573 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 574 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 575 'duration': 47,
9afa1770 576 'timestamp': 1427219242,
da92eeae 577 'upload_date': '20150324',
10273d6e 578 },
579 'params': {
9afa1770 580 # rtmp download
10273d6e 581 'skip_download': True,
582 }
a3bfddfa 583 }, {
6a747190
S
584 # article with single video embedded with data-playable containing XML playlist
585 # with direct video links as progressiveDownloadUrl (for now these are extracted)
586 # and playlist with f4m and m3u8 as streamingUrl
de939d89 587 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 588 'info_dict': {
9afa1770 589 'id': '150615_telabyad_kentin_cogu',
de939d89 590 'ext': 'mp4',
9afa1770 591 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
9afa1770 592 'timestamp': 1434397334,
da92eeae 593 'upload_date': '20150615',
de939d89 594 },
595 'params': {
596 'skip_download': True,
597 }
c936d8cc 598 }, {
6a747190 599 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 600 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 601 'info_dict': {
9afa1770 602 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 603 'ext': 'mp4',
9afa1770 604 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
9afa1770 605 'timestamp': 1434713142,
da92eeae 606 'upload_date': '20150619',
de939d89 607 },
608 'params': {
609 'skip_download': True,
610 }
a346b1ff
S
611 }, {
612 # single video from video playlist embedded with vxp-playlist-data JSON
613 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
614 'info_dict': {
615 'id': 'p02w6qjc',
55ebae26 616 'ext': 'mp4',
a346b1ff
S
617 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
618 'duration': 56,
0bc4ee60 619 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
620 },
621 'params': {
622 'skip_download': True,
623 }
9afa1770
S
624 }, {
625 # single video story with digitalData
626 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
627 'info_dict': {
628 'id': 'p02q6gc4',
629 'ext': 'flv',
630 'title': 'Sri Lanka’s spicy secret',
631 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
632 'timestamp': 1437674293,
633 'upload_date': '20150723',
634 },
635 'params': {
636 # rtmp download
637 'skip_download': True,
638 }
639 }, {
640 # single video story without digitalData
641 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
642 'info_dict': {
643 'id': 'p018zqqg',
55ebae26 644 'ext': 'mp4',
9afa1770
S
645 'title': 'Hyundai Santa Fe Sport: Rock star',
646 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
647 'timestamp': 1415867444,
648 'upload_date': '20141113',
9afa1770
S
649 },
650 'params': {
651 # rtmp download
652 'skip_download': True,
653 }
654 }, {
6a747190 655 # single video with playlist.sxml URL in playlist param
9afa1770
S
656 'url': 'http://www.bbc.com/sport/0/football/33653409',
657 'info_dict': {
658 'id': 'p02xycnp',
55ebae26 659 'ext': 'mp4',
9afa1770 660 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 661 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
662 'duration': 140,
663 },
664 'params': {
665 # rtmp download
666 'skip_download': True,
667 }
b5d48cb1 668 }, {
6a747190 669 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
670 'url': 'http://www.bbc.com/sport/0/football/34475836',
671 'info_dict': {
672 'id': '34475836',
450b233c 673 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
b5d48cb1
S
674 },
675 'playlist_count': 3,
450b233c
S
676 }, {
677 # school report article with single video
678 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
679 'info_dict': {
680 'id': '35744779',
681 'title': 'School which breaks down barriers in Jerusalem',
682 },
683 'playlist_count': 1,
9afa1770
S
684 }, {
685 # single video with playlist URL from weather section
686 'url': 'http://www.bbc.com/weather/features/33601775',
687 'only_matching': True,
688 }, {
689 # custom redirection to www.bbc.com
690 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
691 'only_matching': True,
a1cf3e38
S
692 }, {
693 # single video article embedded with data-media-vpid
694 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
695 'only_matching': True,
10273d6e 696 }]
697
9afa1770
S
@classmethod
def suitable(cls, url):
    """Match *url* only when no more specific BBC extractor claims it."""
    if BBCCoUkIE.suitable(url) or BBCCoUkArticleIE.suitable(url):
        return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
701
702 def _extract_from_media_meta(self, media_meta, video_id):
703 # Direct links to media in media metadata (e.g.
704 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
705 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
706 source_files = media_meta.get('sourceFiles')
707 if source_files:
708 return [{
709 'url': f['url'],
710 'format_id': format_id,
711 'ext': f.get('encoding'),
712 'tbr': float_or_none(f.get('bitrate'), 1000),
713 'filesize': int_or_none(f.get('filesize')),
714 } for format_id, f in source_files.items() if f.get('url')], []
715
716 programme_id = media_meta.get('externalId')
717 if programme_id:
718 return self._download_media_selector(programme_id)
719
720 # Process playlist.sxml as legacy playlist
721 href = media_meta.get('href')
722 if href:
723 playlist = self._download_legacy_playlist_url(href)
724 _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
725 return formats, subtitles
726
727 return [], []
728
baf39a1a
S
729 def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
730 programme_id, title, description, duration, formats, subtitles = \
731 self._process_legacy_playlist_url(url, playlist_id)
732 self._sort_formats(formats)
733 return {
734 'id': programme_id,
735 'title': title,
736 'description': description,
737 'duration': duration,
738 'timestamp': timestamp,
739 'formats': formats,
740 'subtitles': subtitles,
741 }
742
10273d6e 743 def _real_extract(self, url):
9afa1770
S
744 playlist_id = self._match_id(url)
745
746 webpage = self._download_webpage(url, playlist_id)
747
350e02d4
YCH
748 json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)
749 timestamp = json_ld_info.get('timestamp')
0e832c2c 750
350e02d4 751 playlist_title = json_ld_info.get('title')
0e832c2c
S
752 if not playlist_title:
753 playlist_title = self._og_search_title(
754 webpage, default=None) or self._html_search_regex(
755 r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
756 if playlist_title:
757 playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
758
759 playlist_description = json_ld_info.get(
760 'description') or self._og_search_description(webpage, default=None)
ae8bdfd1
S
761
762 if not timestamp:
763 timestamp = parse_iso8601(self._search_regex(
764 [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
765 r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
6f789365 766 r'"datePublished":\s*"([^"]+)'],
ae8bdfd1 767 webpage, 'date', default=None))
9afa1770 768
78f9d843
S
769 entries = []
770
de665713
S
771 # article with multiple videos embedded with playlist.sxml (e.g.
772 # http://www.bbc.com/sport/0/football/34475836)
773 playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
222e11d4 774 playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
de665713 775 if playlists:
baf39a1a
S
776 entries = [
777 self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
778 for playlist_url in playlists]
de939d89 779
78f9d843
S
780 # news article with multiple videos embedded with data-playable
781 data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
782 if data_playables:
783 for _, data_playable_json in data_playables:
784 data_playable = self._parse_json(
785 unescapeHTML(data_playable_json), playlist_id, fatal=False)
786 if not data_playable:
787 continue
baf39a1a
S
788 settings = data_playable.get('settings', {})
789 if settings:
78f9d843
S
790 # data-playable with video vpid in settings.playlistObject.items (e.g.
791 # http://www.bbc.com/news/world-us-canada-34473351)
baf39a1a
S
792 playlist_object = settings.get('playlistObject', {})
793 if playlist_object:
794 items = playlist_object.get('items')
795 if items and isinstance(items, list):
78f9d843
S
796 title = playlist_object['title']
797 description = playlist_object.get('summary')
baf39a1a
S
798 duration = int_or_none(items[0].get('duration'))
799 programme_id = items[0].get('vpid')
78f9d843
S
800 formats, subtitles = self._download_media_selector(programme_id)
801 self._sort_formats(formats)
802 entries.append({
803 'id': programme_id,
804 'title': title,
805 'description': description,
806 'timestamp': timestamp,
807 'duration': duration,
808 'formats': formats,
809 'subtitles': subtitles,
810 })
811 else:
812 # data-playable without vpid but with playlist.sxml URLs
813 # in otherSettings.playlist (e.g.
814 # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
815 playlist = data_playable.get('otherSettings', {}).get('playlist', {})
816 if playlist:
817 entries.append(self._extract_from_playlist_sxml(
818 playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))
819
820 if entries:
78f9d843
S
821 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
822
823 # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
824 programme_id = self._search_regex(
a1cf3e38 825 [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
22d7368d
S
826 r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
827 r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
78f9d843 828 webpage, 'vpid', default=None)
dab062fb 829
9afa1770
S
830 if programme_id:
831 formats, subtitles = self._download_media_selector(programme_id)
832 self._sort_formats(formats)
833 # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
834 digital_data = self._parse_json(
835 self._search_regex(
836 r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
837 programme_id, fatal=False)
838 page_info = digital_data.get('page', {}).get('pageInfo', {})
839 title = page_info.get('pageName') or self._og_search_title(webpage)
840 description = page_info.get('description') or self._og_search_description(webpage)
841 timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
842 return {
843 'id': programme_id,
844 'title': title,
845 'description': description,
846 'timestamp': timestamp,
847 'formats': formats,
848 'subtitles': subtitles,
849 }
a3bfddfa 850
88ed52ae
S
851 def extract_all(pattern):
852 return list(filter(None, map(
853 lambda s: self._parse_json(s, playlist_id, fatal=False),
854 re.findall(pattern, webpage))))
855
856 # Multiple video article (e.g.
857 # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
22d7368d 858 EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
88ed52ae
S
859 entries = []
860 for match in extract_all(r'new\s+SMP\(({.+?})\)'):
861 embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
862 if embed_url and re.match(EMBED_URL, embed_url):
863 entries.append(embed_url)
864 entries.extend(re.findall(
865 r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
866 if entries:
867 return self.playlist_result(
868 [self.url_result(entry, 'BBCCoUk') for entry in entries],
869 playlist_id, playlist_title, playlist_description)
9afa1770
S
870
871 # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
88ed52ae 872 medias = extract_all(r"data-media-meta='({[^']+})'")
9afa1770
S
873
874 if not medias:
875 # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
a346b1ff
S
876 media_asset = self._search_regex(
877 r'mediaAssetPage\.init\(\s*({.+?}), "/',
878 webpage, 'media asset', default=None)
879 if media_asset:
880 media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
881 medias = []
882 for video in media_asset_page.get('videos', {}).values():
883 medias.extend(video.values())
884
885 if not medias:
886 # Multiple video playlist with single `now playing` entry (e.g.
887 # http://www.bbc.com/news/video_and_audio/must_see/33767813)
888 vxp_playlist = self._parse_json(
9afa1770 889 self._search_regex(
a346b1ff
S
890 r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
891 webpage, 'playlist data'),
9afa1770 892 playlist_id)
a346b1ff
S
893 playlist_medias = []
894 for item in vxp_playlist:
895 media = item.get('media')
896 if not media:
897 continue
898 playlist_medias.append(media)
899 # Download single video if found media with asset id matching the video id from URL
900 if item.get('advert', {}).get('assetId') == playlist_id:
901 medias = [media]
902 break
903 # Fallback to the whole playlist
904 if not medias:
905 medias = playlist_medias
9afa1770
S
906
907 entries = []
908 for num, media_meta in enumerate(medias, start=1):
909 formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
910 if not formats:
911 continue
10273d6e 912 self._sort_formats(formats)
913
9afa1770
S
914 video_id = media_meta.get('externalId')
915 if not video_id:
916 video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
917
918 title = media_meta.get('caption')
919 if not title:
920 title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
921
922 duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
da92eeae 923
9afa1770
S
924 images = []
925 for image in media_meta.get('images', {}).values():
926 images.extend(image.values())
927 if 'image' in media_meta:
928 images.append(media_meta['image'])
929
930 thumbnails = [{
931 'url': image.get('href'),
932 'width': int_or_none(image.get('width')),
933 'height': int_or_none(image.get('height')),
934 } for image in images]
935
936 entries.append({
937 'id': video_id,
10273d6e 938 'title': title,
9afa1770 939 'thumbnails': thumbnails,
10273d6e 940 'duration': duration,
9afa1770 941 'timestamp': timestamp,
10273d6e 942 'formats': formats,
943 'subtitles': subtitles,
a3bfddfa 944 })
10273d6e 945
9afa1770 946 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
947
948
class BBCCoUkArticleIE(InfoExtractor):
    """Extractor for bbc.co.uk programme article pages.

    An article page is turned into a playlist with one URL entry per
    ``<div typeof="Clip" resource="...">`` element found in the page markup.
    """

    # Dots are escaped so they match only a literal '.', and the ``www.``
    # prefix is optional, consistent with BBCCoUkIE._VALID_URL.
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Build a playlist from the clips referenced by an article page.

        url -- article URL matching ``_VALID_URL``; its ``id`` group is used
               as the playlist id.

        Returns the result of ``self.playlist_result`` for the collected
        clip URLs, titled and described from the page's Open Graph metadata.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)

        # NOTE(review): the description lookup may yield no value (None);
        # strip only when something was found so a missing og:description
        # does not raise AttributeError -- confirm against InfoExtractor.
        description = self._og_search_description(webpage)
        if description:
            description = description.strip()

        # Each clip is advertised via a <div typeof="Clip" resource="URL">.
        entries = [
            self.url_result(programme_url)
            for programme_url in re.findall(
                r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)