]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/bbc.py
Merge branch 'pr-bbcnews' of https://github.com/atomicdryad/youtube-dl into atomicdry...
[yt-dlp.git] / youtube_dl / extractor / bbc.py
CommitLineData
082c6c86
S
1from __future__ import unicode_literals
2
c056efa2 3import xml.etree.ElementTree
082c6c86 4
f13b1e7d 5from .common import InfoExtractor
8683b4d8
S
6from ..utils import (
7 ExtractorError,
10273d6e 8 parse_duration,
8683b4d8
S
9 int_or_none,
10)
c056efa2 11from ..compat import compat_HTTPError
10273d6e 12import re
082c6c86
S
13
14
class BBCCoUkIE(InfoExtractor):
    """Extractor for bbc.co.uk programme, iPlayer episode/playlist and music clip pages."""

    IE_NAME = 'bbc.co.uk'
    IE_DESC = 'BBC iPlayer'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'

    # Media selector endpoint; '%s' is filled with the programme (version) id.
    # Subclasses override this to point at a different media selector.
    mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s'

    _TESTS = [
        {
            'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
            'info_dict': {
                'id': 'b039d07m',
                'ext': 'flv',
                'title': 'Kaleidoscope, Leonard Cohen',
                'description': 'The Canadian poet and songwriter reflects on his musical career.',
                'duration': 1740,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
            'info_dict': {
                'id': 'b00yng1d',
                'ext': 'flv',
                'title': 'The Man in Black: Series 3: The Printed Name',
                'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
                'duration': 1800,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Episode is no longer available on BBC iPlayer Radio',
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
            'info_dict': {
                'id': 'b00yng1d',
                'ext': 'flv',
                'title': 'The Voice UK: Series 3: Blind Auditions 5',
                'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.",
                'duration': 5100,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
            'info_dict': {
                'id': 'b03k3pb7',
                'ext': 'flv',
                'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
                'description': '2. Invasion',
                'duration': 3600,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
        }, {
            'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
            'info_dict': {
                'id': 'b04v209v',
                'ext': 'flv',
                'title': 'Pete Tong, The Essential New Tune Special',
                'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
                'duration': 10800,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        }, {
            'url': 'http://www.bbc.co.uk/music/clips/p02frcc3',
            'note': 'Audio',
            'info_dict': {
                'id': 'p02frcch',
                'ext': 'flv',
                'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix',
                'description': 'French house superstar Madeon takes us out of the club and onto the after party.',
                'duration': 3507,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        }, {
            'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
            'note': 'Video',
            'info_dict': {
                'id': 'p025c103',
                'ext': 'flv',
                'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
                'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
                'duration': 226,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
            'info_dict': {
                'id': 'p02n76xf',
                'ext': 'flv',
                'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
                'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
                'duration': 3540,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'geolocation',
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
            'info_dict': {
                'id': 'b05zmgw1',
                'ext': 'flv',
                'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
                'title': 'Royal Academy Summer Exhibition',
                'duration': 3540,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'geolocation',
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
            'only_matching': True,
        }
    ]

    def _extract_asx_playlist(self, connection, programme_id):
        """Download the ASX playlist a connection points at and return its ref hrefs."""
        asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
        return [ref.get('href') for ref in asx.findall('./Entry/ref')]

    def _extract_connection(self, connection, programme_id):
        """Turn one media selector <connection> element into a list of format dicts.

        Only the 'http' and 'rtmp' protocols are handled; any other protocol
        yields an empty list.
        """
        formats = []
        protocol = connection.get('protocol')
        supplier = connection.get('supplier')
        if protocol == 'http':
            href = connection.get('href')
            # ASX playlist
            if supplier == 'asx':
                for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
                    formats.append({
                        'url': ref,
                        'format_id': 'ref%s_%s' % (i, supplier),
                    })
            # Direct link
            else:
                formats.append({
                    'url': href,
                    'format_id': supplier,
                })
        elif protocol == 'rtmp':
            application = connection.get('application', 'ondemand')
            auth_string = connection.get('authString')
            identifier = connection.get('identifier')
            server = connection.get('server')
            formats.append({
                'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                'play_path': identifier,
                'app': '%s?%s' % (application, auth_string),
                'page_url': 'http://www.bbc.co.uk',
                'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                'rtmp_live': False,
                'ext': 'flv',
                'format_id': supplier,
            })
        return formats

    def _extract_items(self, playlist):
        """Return the <item> elements of a legacy (EMP) playlist document."""
        return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item')

    def _extract_medias(self, media_selection):
        """Return the <media> elements of a media selection document.

        Raises ExtractorError (expected) when the document carries an <error>
        element, reporting that element's 'id' attribute.
        """
        error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error')
        if error is not None:
            raise ExtractorError(
                '%s returned error: %s' % (self.IE_NAME, error.get('id')), expected=True)
        return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')

    def _extract_connections(self, media):
        """Return the <connection> elements of a <media> element."""
        return media.findall('./{http://bbc.co.uk/2008/mp/mediaselection}connection')

    def _extract_video(self, media, programme_id):
        """Build format dicts for every connection of a video <media> element."""
        formats = []
        vbr = int_or_none(media.get('bitrate'))
        vcodec = media.get('encoding')
        service = media.get('service')
        width = int_or_none(media.get('width'))
        height = int_or_none(media.get('height'))
        file_size = int_or_none(media.get('media_file_size'))
        for connection in self._extract_connections(media):
            conn_formats = self._extract_connection(connection, programme_id)
            for fmt in conn_formats:
                # Prefix the connection-level format id with the media service
                fmt.update({
                    'format_id': '%s_%s' % (service, fmt['format_id']),
                    'width': width,
                    'height': height,
                    'vbr': vbr,
                    'vcodec': vcodec,
                    'filesize': file_size,
                })
            formats.extend(conn_formats)
        return formats

    def _extract_audio(self, media, programme_id):
        """Build format dicts for every connection of an audio <media> element."""
        formats = []
        abr = int_or_none(media.get('bitrate'))
        acodec = media.get('encoding')
        service = media.get('service')
        for connection in self._extract_connections(media):
            conn_formats = self._extract_connection(connection, programme_id)
            for fmt in conn_formats:
                # Prefix the connection-level format id with the media service
                fmt.update({
                    'format_id': '%s_%s' % (service, fmt['format_id']),
                    'abr': abr,
                    'acodec': acodec,
                })
            formats.extend(conn_formats)
        return formats

    def _get_subtitles(self, media, programme_id):
        """Return a {lang: [{'url', 'ext'}]} dict for a captions <media> element.

        Each connection's caption document is downloaded to read its xml:lang
        attribute ('en' when absent); a later connection with the same
        language replaces the earlier entry.
        """
        subtitles = {}
        for connection in self._extract_connections(media):
            captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
            lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
            subtitles[lang] = [
                {
                    'url': connection.get('href'),
                    'ext': 'ttml',
                },
            ]
        return subtitles

    def _download_media_selector(self, programme_id):
        """Download the media selection XML and return (formats, subtitles).

        subtitles is None when the selection has no 'captions' media.
        """
        try:
            media_selection = self._download_xml(
                self.mediaselector_url % programme_id,
                programme_id, 'Downloading media selection XML')
        except ExtractorError as ee:
            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
                # The 403 response body is itself parsed as a media selection
                # document so that _extract_medias can report its <error>.
                media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8'))
            else:
                raise

        formats = []
        subtitles = None

        for media in self._extract_medias(media_selection):
            kind = media.get('kind')
            if kind == 'audio':
                formats.extend(self._extract_audio(media, programme_id))
            elif kind == 'video':
                formats.extend(self._extract_video(media, programme_id))
            elif kind == 'captions':
                subtitles = self.extract_subtitles(media, programme_id)

        return formats, subtitles

    def _download_playlist(self, playlist_id):
        """Resolve a playlist id to its first programme.

        Tries the JSON playlist first and falls back to the legacy XML
        playlist when the JSON request fails with HTTP 404 or yields no
        programme item.

        Returns a (programme_id, title, description, duration, formats,
        subtitles) tuple. Raises ExtractorError when the episode is reported
        unavailable or no programme item can be found.
        """
        try:
            playlist = self._download_json(
                'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
                playlist_id, 'Downloading playlist JSON')

            version = playlist.get('defaultAvailableVersion')
            if version:
                smp_config = version['smpConfig']
                title = smp_config['title']
                description = smp_config['summary']
                for item in smp_config['items']:
                    kind = item['kind']
                    if kind != 'programme' and kind != 'radioProgramme':
                        continue
                    programme_id = item.get('vpid')
                    duration = int_or_none(item.get('duration'))
                    formats, subtitles = self._download_media_selector(programme_id)
                    return programme_id, title, description, duration, formats, subtitles
        except ExtractorError as ee:
            if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
                raise

        # fallback to legacy playlist
        playlist = self._download_xml(
            'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id,
            playlist_id, 'Downloading legacy playlist XML')

        no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
        if no_items is not None:
            reason = no_items.get('reason')
            if reason == 'preAvailability':
                msg = 'Episode %s is not yet available' % playlist_id
            elif reason == 'postAvailability':
                msg = 'Episode %s is no longer available' % playlist_id
            elif reason == 'noMedia':
                msg = 'Episode %s is not currently available' % playlist_id
            else:
                msg = 'Episode %s is not available: %s' % (playlist_id, reason)
            raise ExtractorError(msg, expected=True)

        for item in self._extract_items(playlist):
            kind = item.get('kind')
            if kind != 'programme' and kind != 'radioProgramme':
                continue
            title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
            description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
            programme_id = item.get('identifier')
            duration = int_or_none(item.get('duration'))
            formats, subtitles = self._download_media_selector(programme_id)

            return programme_id, title, description, duration, formats, subtitles

        # Without this the method would implicitly return None and the
        # caller's tuple unpacking would fail with an opaque TypeError.
        raise ExtractorError('No programme found in playlist %s' % playlist_id)

    def _real_extract(self, url):
        """Extract the info dict for a single bbc.co.uk page."""
        group_id = self._match_id(url)

        webpage = self._download_webpage(url, group_id, 'Downloading video page')

        programme_id = None
        # Must be pre-set: when the vpid is only found by the fallback regex
        # below, no duration is available from the page.
        duration = None

        tviplayer = self._search_regex(
            r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
            webpage, 'player', default=None)

        if tviplayer:
            player = self._parse_json(tviplayer, group_id).get('player', {})
            duration = int_or_none(player.get('duration'))
            programme_id = player.get('vpid')

        if not programme_id:
            programme_id = self._search_regex(
                r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None)

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
            title = self._og_search_title(webpage)
            description = self._search_regex(
                r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
                webpage, 'description', fatal=False)
        else:
            # No vpid on the page: resolve the URL id through the playlist API
            programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)

        self._sort_formats(formats)

        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'thumbnail': self._og_search_thumbnail(webpage, default=None),
            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
        }
10273d6e 384
385
class BBCNewsIE(BBCCoUkIE):
    """Extractor for bbc.com pages; every page is returned as a playlist of its media entries."""

    IE_NAME = 'bbc.com'
    IE_DESC = 'BBC news'
    _VALID_URL = r'https?://(?:www\.)?bbc\.com/.+?/(?P<id>[^/]+)$'

    # Overrides the media selector endpoint inherited from BBCCoUkIE
    mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s'

    _TESTS = [{
        'url': 'http://www.bbc.com/news/world-europe-32668511',
        'info_dict': {
            'id': 'world-europe-32668511',
            'title': 'Russia stages massive WW2 parade despite Western boycott',
        },
        'playlist_count': 2,
    }, {
        'url': 'http://www.bbc.com/news/business-28299555',
        'info_dict': {
            'id': 'business-28299555',
            'title': 'Farnborough Airshow: Video highlights',
        },
        'playlist_count': 9,
    }, {
        'url': 'http://www.bbc.com/news/world-europe-32041533',
        'note': 'Video',
        'info_dict': {
            'id': 'p02mprgb',
            'ext': 'mp4',
            'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
            'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
            'duration': 47,
            'upload_date': '20150324',
            'uploader': 'BBC News',
        },
        'params': {
            'skip_download': True,
        }
    }, {
        'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
        'note': 'Video',
        'info_dict': {
            'id': 'NA',
            'ext': 'mp4',
            'title': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde',
            'description': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde',
            'duration': 47,
            'upload_date': '20150615',
            'uploader': 'BBC News',
        },
        'params': {
            'skip_download': True,
        }
    }, {
        'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
        'note': 'Video',
        'info_dict': {
            'id': '39275083',
            'ext': 'mp4',
            'title': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n',
            'description': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n',
            'duration': 87,
            'upload_date': '20150619',
            'uploader': 'BBC News',
        },
        'params': {
            'skip_download': True,
        }
    }]

    def _real_extract(self, url):
        """Extract all media entries embedded in a bbc.com page as a playlist.

        Media metadata is looked up in three places, in order: the
        data-media-meta attributes, the mediaAssetPage.init(...) JSON, and a
        generic regex for JSON objects carrying an "image" member.

        Raises ExtractorError when no media entry is found or when an entry
        yields no formats.
        """
        list_id = self._match_id(url)
        webpage = self._download_webpage(url, list_id)

        list_title = self._html_search_regex(r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'list title')

        pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None)
        if pubdate:
            pubdate = pubdate.replace('-', '')

        # works with bbc.com/news/something-something-123456 articles
        # (built as a real list rather than map(): on Python 3 map() returns
        # an iterator that supports neither len() nor append())
        jsent = [
            self._parse_json(meta, list_id)
            for meta in re.findall(r"data-media-meta='({[^']+})'", webpage)]

        if not jsent:
            # http://www.bbc.com/news/video_and_audio/international
            # and single-video articles
            masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None)
            if masset:
                jmasset = self._parse_json(masset, list_id)
                for val in jmasset.get('videos', {}).values():
                    for skey, sval in val.items():
                        sval['id'] = skey
                        jsent.append(sval)

        if not jsent:
            # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc}
            # in http://www.bbc.com/news/video_and_audio/international
            # prone to breaking if entries have sourceFiles list
            jsent = [
                self._parse_json(meta, list_id)
                for meta in re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage)]

        if not jsent:
            raise ExtractorError('No video found', expected=True)

        entries = []
        for jent in jsent:
            programme_id = jent.get('externalId')
            xml_url = jent.get('href')

            # A missing, empty or null caption falls back to the page title
            caption = jent.get('caption')
            title = caption or list_title

            duration = parse_duration(jent.get('duration'))
            description = list_title
            if caption:
                description += ' - ' + caption
            thumbnail = None
            if jent.get('image') is not None:
                thumbnail = jent['image'].get('href')

            formats = []
            # A dict, matching what the inherited _get_subtitles builds
            subtitles = {}

            if programme_id:
                formats, subtitles = self._download_media_selector(programme_id)
            elif jent.get('sourceFiles') is not None:
                # mediaselector not used at
                # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu
                for key, val in jent['sourceFiles'].items():
                    formats.append({
                        'ext': val.get('encoding'),
                        'url': val.get('url'),
                        # int_or_none tolerates a missing filesize
                        'filesize': int_or_none(val.get('filesize')),
                        'format_id': key
                    })
            elif xml_url:
                # Cheap fallback
                # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml
                # programme_id is empty in this branch, so the page id is
                # used as the id for the download
                playlist_sxml = self._download_webpage(xml_url, list_id, 'Downloading playlist.sxml for externalId (fallback)')
                programme_id = self._search_regex(r'<mediator [^>]*identifier="(.+?)"', playlist_sxml, 'playlist.sxml (externalId fallback)')
                formats, subtitles = self._download_media_selector(programme_id)

            if not formats:
                raise ExtractorError('unsupported json media entry.\n ' + str(jent) + '\n')

            self._sort_formats(formats)

            video_id = jent.get('id') if programme_id is None else programme_id
            if video_id is None:
                video_id = 'NA'

            entries.append({
                'id': video_id,
                'uploader': 'BBC News',
                'upload_date': pubdate,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'formats': formats,
                'subtitles': subtitles,
            })

        # entries is never empty here: jsent is non-empty and every iteration
        # above either appends an entry or raises.
        return self.playlist_result(entries, list_id, list_title)