]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/bbc.py
Add support for https for all extractors as preventive and future-proof measure
[yt-dlp.git] / youtube_dl / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
9afa1770 4import re
082c6c86 5
f13b1e7d 6from .common import InfoExtractor
8683b4d8
S
7from ..utils import (
8 ExtractorError,
9afa1770 9 float_or_none,
8683b4d8 10 int_or_none,
9afa1770
S
11 parse_duration,
12 parse_iso8601,
dab062fb 13 unescapeHTML,
8683b4d8 14)
36e6f62c
JMF
15from ..compat import (
16 compat_etree_fromstring,
17 compat_HTTPError,
18)
082c6c86 19
d12a1a47 20
f13b1e7d 21class BBCCoUkIE(InfoExtractor):
082c6c86 22 IE_NAME = 'bbc.co.uk'
2e3fd9ec 23 IE_DESC = 'BBC iPlayer'
22d7368d 24 _ID_REGEX = r'[pb][\da-z]{7}'
f20a11ed
S
25 _VALID_URL = r'''(?x)
26 https?://
27 (?:www\.)?bbc\.co\.uk/
28 (?:
29 programmes/(?!articles/)|
30 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
31 music/clips[/#]|
32 radio/player/
33 )
34 (?P<id>%s)
35 ''' % _ID_REGEX
082c6c86 36
d12a1a47 37 _MEDIASELECTOR_URLS = [
26ccc68b
S
38 # Provides HQ HLS streams with even better quality than pc mediaset but fails
39 # with geolocation in some cases even when it's not geo restricted at all (e.g.
d781e293 40 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
d1c694ea 41 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
42 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
43 ]
a8b081a0 44
e6174ee9
S
45 _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
46 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
47
48 _NAMESPACES = (
49 _MEDIASELECTION_NS,
50 _EMP_PLAYLIST_NS,
51 )
52
2e3fd9ec
S
53 _TESTS = [
54 {
f2d0fc68 55 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 56 'info_dict': {
f2d0fc68 57 'id': 'b039d07m',
2e3fd9ec 58 'ext': 'flv',
679bacf0 59 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4',
c4914185 60 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
61 },
62 'params': {
63 # rtmp download
64 'skip_download': True,
65 }
082c6c86 66 },
2e3fd9ec
S
67 {
68 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
69 'info_dict': {
70 'id': 'b00yng1d',
71 'ext': 'flv',
72 'title': 'The Man in Black: Series 3: The Printed Name',
73 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
74 'duration': 1800,
75 },
76 'params': {
77 # rtmp download
78 'skip_download': True,
c7f0177f
S
79 },
80 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
81 },
82 {
83 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
84 'info_dict': {
85 'id': 'b00yng1d',
86 'ext': 'flv',
17968e44 87 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 88 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 89 'duration': 5100,
2e3fd9ec
S
90 },
91 'params': {
92 # rtmp download
93 'skip_download': True,
94 },
95 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
96 },
97 {
98 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
99 'info_dict': {
100 'id': 'b03k3pb7',
101 'ext': 'flv',
102 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
103 'description': '2. Invasion',
104 'duration': 3600,
105 },
106 'params': {
107 # rtmp download
108 'skip_download': True,
109 },
110 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
111 }, {
112 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
113 'info_dict': {
114 'id': 'b04v209v',
115 'ext': 'flv',
116 'title': 'Pete Tong, The Essential New Tune Special',
117 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
118 'duration': 10800,
119 },
120 'params': {
121 # rtmp download
122 'skip_download': True,
a3ef0e1c
YCH
123 },
124 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 125 }, {
5aa535c3 126 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
127 'note': 'Audio',
128 'info_dict': {
5aa535c3 129 'id': 'p022h44j',
c7e67594 130 'ext': 'flv',
5aa535c3
S
131 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
132 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
133 'duration': 227,
c7e67594
S
134 },
135 'params': {
136 # rtmp download
137 'skip_download': True,
138 }
139 }, {
140 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
141 'note': 'Video',
142 'info_dict': {
143 'id': 'p025c103',
144 'ext': 'flv',
145 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
146 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
147 'duration': 226,
148 },
149 'params': {
150 # rtmp download
151 'skip_download': True,
152 }
e68ae99a
S
153 }, {
154 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
155 'info_dict': {
156 'id': 'p02n76xf',
157 'ext': 'flv',
158 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
159 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
160 'duration': 3540,
161 },
162 'params': {
163 # rtmp download
164 'skip_download': True,
165 },
166 'skip': 'geolocation',
25fa8d66
YCH
167 }, {
168 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
169 'info_dict': {
170 'id': 'b05zmgw1',
171 'ext': 'flv',
172 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
173 'title': 'Royal Academy Summer Exhibition',
174 'duration': 3540,
175 },
176 'params': {
177 # rtmp download
178 'skip_download': True,
179 },
180 'skip': 'geolocation',
54914380
S
181 }, {
182 # iptv-all mediaset fails with geolocation however there is no geo restriction
183 # for this programme at all
5aa535c3 184 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 185 'info_dict': {
5aa535c3 186 'id': 'b06rkms3',
54914380 187 'ext': 'flv',
5aa535c3
S
188 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
189 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
190 },
191 'params': {
192 # rtmp download
193 'skip_download': True,
194 },
1ac6e794
S
195 }, {
196 # compact player (https://github.com/rg3/youtube-dl/issues/8147)
197 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
198 'info_dict': {
199 'id': 'p028bfkj',
200 'ext': 'flv',
201 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
202 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
203 },
204 'params': {
205 # rtmp download
206 'skip_download': True,
207 },
31763975
S
208 }, {
209 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
210 'only_matching': True,
c7e67594
S
211 }, {
212 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
213 'only_matching': True,
0692ef86
S
214 }, {
215 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
216 'only_matching': True,
f20a11ed
S
217 }, {
218 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
219 'only_matching': True,
ae6986fb 220 }
2e3fd9ec
S
221 ]
222
d12a1a47
S
class MediaSelectionError(Exception):
    """Signals that a media selection document contained an error element.

    The ``id`` attribute carries the error identifier taken from that
    element; callers inspect it to decide whether to retry or give up.
    """

    def __init__(self, id):
        # Initialise the base class with the identifier so that str(exc),
        # exc.args and tracebacks are informative rather than empty.
        Exception.__init__(self, id)
        self.id = id
226
2e3fd9ec
S
def _extract_asx_playlist(self, connection, programme_id):
    """List the stream URLs referenced by a connection's ASX playlist.

    Downloads the XML document found at the connection's ``href``
    attribute and returns the ``href`` of every ``./Entry/ref`` element,
    in document order.
    """
    playlist_url = connection.get('href')
    asx_doc = self._download_xml(playlist_url, programme_id, 'Downloading ASX playlist')
    stream_urls = []
    for ref_el in asx_doc.findall('./Entry/ref'):
        stream_urls.append(ref_el.get('href'))
    return stream_urls
230
def _extract_connection(self, connection, programme_id):
    """Convert one ``connection`` element into a list of format dicts.

    Dispatches on the element's ``protocol`` attribute:

    * ``rtmp`` yields a single FLV format built from the ``server``,
      ``application`` (default ``'ondemand'``), ``authString`` and
      ``identifier`` attributes.
    * ``http`` yields, depending on ``supplier``/``transferFormat``: one
      format per ASX playlist reference, nothing for DASH (skipped),
      whatever ``_extract_m3u8_formats`` returns for HLS, or a single
      direct link otherwise.
    * any other protocol yields an empty list.
    """
    protocol = connection.get('protocol')
    supplier = connection.get('supplier')
    kind = connection.get('kind')

    if protocol == 'rtmp':
        app_name = connection.get('application', 'ondemand')
        auth = connection.get('authString')
        return [{
            'url': '%s://%s/%s?%s' % (protocol, connection.get('server'), app_name, auth),
            'play_path': connection.get('identifier'),
            'app': '%s?%s' % (app_name, auth),
            'page_url': 'http://www.bbc.co.uk',
            'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
            'rtmp_live': False,
            'ext': 'flv',
            'format_id': supplier,
        }]

    if protocol != 'http':
        return []

    href = connection.get('href')
    transfer_format = connection.get('transferFormat')

    # ASX playlist: one format per referenced stream
    if supplier == 'asx':
        asx_refs = self._extract_asx_playlist(connection, programme_id)
        return [{
            'url': ref,
            'format_id': 'ref%s_%s' % (index, supplier),
        } for index, ref in enumerate(asx_refs)]

    # Skip DASH until supported
    if transfer_format == 'dash':
        return []

    if transfer_format == 'hls':
        return list(self._extract_m3u8_formats(
            href, programme_id, ext='mp4', entry_protocol='m3u8_native',
            m3u8_id=supplier, fatal=False))

    # Direct link
    return [{
        'url': href,
        'format_id': supplier or kind or protocol,
    }]
275
def _extract_items(self, playlist):
    """Return the direct ``item`` children of *playlist* in the EMP playlist namespace."""
    item_path = './{%s}item' % self._EMP_PLAYLIST_NS
    items = playlist.findall(item_path)
    return items
278
def _findall_ns(self, element, xpath):
    """Run *xpath* against *element* once per known namespace.

    *xpath* is a ``%``-format template with a single placeholder for the
    namespace URI. Matches are concatenated in ``_NAMESPACES`` order.
    """
    return [
        match
        for namespace in self._NAMESPACES
        for match in element.findall(xpath % namespace)
    ]
2e3fd9ec
S
284
def _extract_medias(self, media_selection):
    """Return the ``media`` children of a media selection document.

    Raises ``BBCCoUkIE.MediaSelectionError`` (carrying the error element's
    ``id`` attribute) when the document contains an ``error`` child in
    either the media selection or the EMP playlist namespace.
    """
    error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
    if error is None:
        # The fallback lookup must be assigned: previously its result was
        # discarded, so errors reported in the EMP playlist namespace were
        # silently ignored.
        error = media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
    if error is not None:
        raise BBCCoUkIE.MediaSelectionError(error.get('id'))
    return self._findall_ns(media_selection, './{%s}media')
2e3fd9ec
S
292
def _extract_connections(self, media):
    """Return the ``connection`` children of *media* across all known namespaces."""
    connection_xpath = './{%s}connection'
    return self._findall_ns(media, connection_xpath)
2e3fd9ec
S
295
def _extract_video(self, media, programme_id):
    """Build format dicts for every connection of a video ``media`` element.

    Each format produced by ``_extract_connection`` is annotated with the
    media element's ``width``, ``height``, ``bitrate`` (as ``vbr``),
    ``encoding`` (as ``vcodec``) and ``media_file_size`` (as ``filesize``).
    When the element has a ``service`` attribute, it is prepended to the
    format id as ``<service>_<format_id>``.
    """
    service = media.get('service')
    video_fields = {
        'width': int_or_none(media.get('width')),
        'height': int_or_none(media.get('height')),
        'vbr': int_or_none(media.get('bitrate')),
        'vcodec': media.get('encoding'),
        'filesize': int_or_none(media.get('media_file_size')),
    }
    collected = []
    for connection in self._extract_connections(media):
        for fmt in self._extract_connection(connection, programme_id):
            fmt.update(video_fields)
            if service:
                fmt['format_id'] = '%s_%s' % (service, fmt['format_id'])
            collected.append(fmt)
    return collected
318
def _extract_audio(self, media, programme_id):
    """Build format dicts for every connection of an audio ``media`` element.

    Each format produced by ``_extract_connection`` is annotated with the
    media element's ``bitrate`` (as ``abr``) and ``encoding`` (as
    ``acodec``). When the element has a ``service`` attribute, it is
    prepended to the format id as ``<service>_<format_id>``.
    """
    formats = []
    abr = int_or_none(media.get('bitrate'))
    acodec = media.get('encoding')
    service = media.get('service')
    for connection in self._extract_connections(media):
        conn_formats = self._extract_connection(connection, programme_id)
        for fmt in conn_formats:
            fmt.update({
                'abr': abr,
                'acodec': acodec,
            })
            # Only prefix when a service is actually present, as
            # _extract_video does; otherwise the id became 'None_<format_id>'.
            if service:
                fmt['format_id'] = '%s_%s' % (service, fmt['format_id'])
        formats.extend(conn_formats)
    return formats
334
def _get_subtitles(self, media, programme_id):
    """Collect TTML subtitle tracks from a captions ``media`` element.

    For every connection the captions document at its ``href`` is
    downloaded to read its ``xml:lang`` attribute (``'en'`` when absent).
    The result maps each language to a one-element list holding the
    caption URL; a later connection with the same language replaces an
    earlier one.
    """
    xml_lang_attr = '{http://www.w3.org/XML/1998/namespace}lang'
    tracks = {}
    for connection in self._extract_connections(media):
        captions_url = connection.get('href')
        captions_doc = self._download_xml(captions_url, programme_id, 'Downloading captions')
        language = captions_doc.get(xml_lang_attr, 'en')
        tracks[language] = [{
            'url': connection.get('href'),
            'ext': 'ttml',
        }]
    return tracks
082c6c86 347
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Re-raise a media selection error as an expected ``ExtractorError``.

    The message combines this extractor's ``IE_NAME`` with the error's
    ``id``.
    """
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
352
def _download_media_selector(self, programme_id):
    """Try each media selector URL in turn and return the first success.

    A ``MediaSelectionError`` whose id is ``notukerror``, ``geolocation``
    or ``selectionunavailable`` makes the next URL be tried; any other id
    is reported immediately through ``_raise_extractor_error``. When every
    URL has failed, the last recorded error is reported the same way.
    """
    retryable_ids = ('notukerror', 'geolocation', 'selectionunavailable')
    last_error = None
    for url_template in self._MEDIASELECTOR_URLS:
        selector_url = url_template % programme_id
        try:
            return self._download_media_selector_url(selector_url, programme_id)
        except BBCCoUkIE.MediaSelectionError as err:
            if err.id not in retryable_ids:
                self._raise_extractor_error(err)
            else:
                last_error = err
    self._raise_extractor_error(last_error)
9afa1770
S
365
def _download_media_selector_url(self, url, programme_id=None):
    """Download one media selector document and process it.

    When the download fails with an HTTP 403 or 404, the response body is
    parsed as the media selection XML instead; any other failure
    propagates unchanged.
    """
    try:
        media_selection = self._download_xml(
            url, programme_id, 'Downloading media selection XML')
    except ExtractorError as ee:
        http_failure = isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404)
        if not http_failure:
            raise
        error_body = ee.cause.read().decode('utf-8')
        media_selection = compat_etree_fromstring(error_body)
    return self._process_media_selector(media_selection, programme_id)
082c6c86 376
def _process_media_selector(self, media_selection, programme_id):
    """Split a media selection document into formats and subtitles.

    Returns a ``(formats, subtitles)`` tuple. Audio and video media
    contribute formats; a captions media sets the subtitles (the last one
    wins). ``subtitles`` stays ``None`` when no captions media is present.
    """
    collected_formats = []
    subtitles = None

    for media in self._extract_medias(media_selection):
        media_kind = media.get('kind')
        if media_kind == 'captions':
            subtitles = self.extract_subtitles(media, programme_id)
        elif media_kind == 'video':
            collected_formats += self._extract_video(media, programme_id)
        elif media_kind == 'audio':
            collected_formats += self._extract_audio(media, programme_id)
    return collected_formats, subtitles
2e3fd9ec 390
ae6986fb
S
def _download_playlist(self, playlist_id):
    """Resolve a playlist id to programme metadata and formats.

    Returns ``(programme_id, title, description, duration, formats,
    subtitles)``. The JSON playlist is tried first and the first item of
    kind ``programme`` or ``radioProgramme`` is used. If the JSON has no
    default available version or no such item, or if an HTTP 404 is hit
    along the way, the legacy XML playlist is used instead.
    """
    playable_kinds = ('programme', 'radioProgramme')
    try:
        playlist = self._download_json(
            'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
            playlist_id, 'Downloading playlist JSON')

        version = playlist.get('defaultAvailableVersion')
        if version:
            smp_config = version['smpConfig']
            title = smp_config['title']
            description = smp_config['summary']
            for entry in smp_config['items']:
                if entry['kind'] not in playable_kinds:
                    continue
                programme_id = entry.get('vpid')
                duration = int_or_none(entry.get('duration'))
                formats, subtitles = self._download_media_selector(programme_id)
                return programme_id, title, description, duration, formats, subtitles
    except ExtractorError as ee:
        # Only a 404 is a signal to fall back; everything else is fatal
        if not isinstance(ee.cause, compat_HTTPError) or ee.cause.code != 404:
            raise

    # fallback to legacy playlist
    return self._process_legacy_playlist(playlist_id)
416
def _process_legacy_playlist_url(self, url, display_id):
    """Download the legacy XML playlist at *url* and extract programme data from it."""
    return self._extract_from_legacy_playlist(
        self._download_legacy_playlist_url(url, display_id), display_id)
420
def _process_legacy_playlist(self, playlist_id):
    """Process the legacy iPlayer XML playlist belonging to *playlist_id*."""
    legacy_url = 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id
    return self._process_legacy_playlist_url(legacy_url, playlist_id)
424
def _download_legacy_playlist_url(self, url, playlist_id=None):
    """Download and return the legacy playlist XML document at *url*."""
    progress_note = 'Downloading legacy playlist XML'
    return self._download_xml(url, playlist_id, progress_note)
ae6986fb 428
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme data from a legacy (EMP) XML playlist.

    Returns ``(programme_id, title, description, duration, formats,
    subtitles)`` for the last item whose ``kind`` is ``programme`` or
    ``radioProgramme``.

    Raises ``ExtractorError`` when the playlist contains a ``noItems``
    element (with a message derived from its ``reason``), or when it
    contains no playable item at all.
    """
    ns = self._EMP_PLAYLIST_NS

    no_items = playlist.find('./{%s}noItems' % ns)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First attribute that looks like a programme id, else None
        for attr in ('identifier', 'group'):
            value = element.get(attr)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value
        return None

    def get_programme_id(item):
        # NOTE(review): the previous code also ran the attribute lookup on
        # the item itself but discarded the result, so only the mediator
        # has ever supplied the id. That effective behaviour is kept here;
        # confirm whether item-level ids should be honoured as well.
        mediator = item.find('./{%s}mediator' % ns)
        if mediator is not None:
            return get_from_attributes(mediator)
        return None

    result = None
    for item in self._extract_items(playlist):
        if item.get('kind') not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % ns).text
        description_el = playlist.find('./{%s}summary' % ns)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No usable id: the item itself is expected to carry the media
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

        result = (programme_id, title, description, duration, formats, subtitles)

    if result is None:
        # Previously this fell through to an UnboundLocalError on the
        # return statement; report the condition explicitly instead.
        raise ExtractorError(
            'No playable items found in playlist %s' % playlist_id)
    return result
472
c056efa2
S
def _real_extract(self, url):
    """Extract a single programme from a bbc.co.uk page.

    The programme (version) id is looked up in the page's
    ``mediator.bind(...)`` player JSON, then in a ``"vpid"`` key anywhere
    in the page. When found, formats come from the media selector and
    metadata from the page; otherwise everything comes from the playlist
    for the id matched in the URL.
    """
    group_id = self._match_id(url)
    webpage = self._download_webpage(url, group_id, 'Downloading video page')

    duration = None
    programme_id = None

    player_json = self._search_regex(
        r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
        webpage, 'player', default=None)
    if player_json:
        player = self._parse_json(player_json, group_id).get('player', {})
        duration = int_or_none(player.get('duration'))
        programme_id = player.get('vpid')

    if not programme_id:
        programme_id = self._search_regex(
            r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)

    if not programme_id:
        # Nothing usable in the page itself: resolve through the playlist
        programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
    else:
        formats, subtitles = self._download_media_selector(programme_id)
        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
            (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
             r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
        description = self._search_regex(
            (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
             r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
            webpage, 'description', default=None)
        if not description:
            description = self._html_search_meta('description', webpage)

    self._sort_formats(formats)
    thumbnail = self._og_search_thumbnail(webpage, default=None)

    return {
        'id': programme_id,
        'title': title,
        'description': description,
        'thumbnail': thumbnail,
        'duration': duration,
        'formats': formats,
        'subtitles': subtitles,
    }
10273d6e 519
520
9afa1770
S
521class BBCIE(BBCCoUkIE):
522 IE_NAME = 'bbc'
523 IE_DESC = 'BBC'
524 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 525
d12a1a47 526 _MEDIASELECTOR_URLS = [
55ebae26
S
527 # Provides HQ HLS streams but fails with geolocation in some cases even when
528 # it's not geo restricted at all
529 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
530 # Provides more formats, namely direct mp4 links, but fails on some videos with
531 # notukerror for non UK (?) users (e.g.
532 # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
533 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
534 # Provides fewer formats, but works everywhere for everybody (hopefully)
535 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
536 ]
10273d6e 537
538 _TESTS = [{
6a747190 539 # article with multiple videos embedded with data-playable containing vpids
10273d6e 540 'url': 'http://www.bbc.com/news/world-europe-32668511',
541 'info_dict': {
542 'id': 'world-europe-32668511',
543 'title': 'Russia stages massive WW2 parade despite Western boycott',
9afa1770 544 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 545 },
546 'playlist_count': 2,
a3bfddfa 547 }, {
6a747190 548 # article with multiple videos embedded with data-playable (more videos)
10273d6e 549 'url': 'http://www.bbc.com/news/business-28299555',
550 'info_dict': {
551 'id': 'business-28299555',
552 'title': 'Farnborough Airshow: Video highlights',
9afa1770 553 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 554 },
555 'playlist_count': 9,
9afa1770 556 'skip': 'Save time',
88ed52ae
S
557 }, {
558 # article with multiple videos embedded with `new SMP()`
6a747190 559 # broken
88ed52ae
S
560 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
561 'info_dict': {
562 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 563 'title': 'BUGGER',
88ed52ae
S
564 },
565 'playlist_count': 18,
a3bfddfa 566 }, {
6a747190 567 # single video embedded with data-playable containing vpid
10273d6e 568 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 569 'info_dict': {
570 'id': 'p02mprgb',
55ebae26 571 'ext': 'mp4',
10273d6e 572 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 573 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 574 'duration': 47,
9afa1770 575 'timestamp': 1427219242,
da92eeae 576 'upload_date': '20150324',
10273d6e 577 },
578 'params': {
9afa1770 579 # rtmp download
10273d6e 580 'skip_download': True,
581 }
a3bfddfa 582 }, {
6a747190
S
583 # article with single video embedded with data-playable containing XML playlist
584 # with direct video links as progressiveDownloadUrl (for now these are extracted)
585 # and playlist with f4m and m3u8 as streamingUrl
de939d89 586 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 587 'info_dict': {
9afa1770 588 'id': '150615_telabyad_kentin_cogu',
de939d89 589 'ext': 'mp4',
9afa1770 590 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
9afa1770 591 'timestamp': 1434397334,
da92eeae 592 'upload_date': '20150615',
de939d89 593 },
594 'params': {
595 'skip_download': True,
596 }
c936d8cc 597 }, {
6a747190 598 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 599 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 600 'info_dict': {
9afa1770 601 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 602 'ext': 'mp4',
9afa1770 603 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
9afa1770 604 'timestamp': 1434713142,
da92eeae 605 'upload_date': '20150619',
de939d89 606 },
607 'params': {
608 'skip_download': True,
609 }
a346b1ff
S
610 }, {
611 # single video from video playlist embedded with vxp-playlist-data JSON
612 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
613 'info_dict': {
614 'id': 'p02w6qjc',
55ebae26 615 'ext': 'mp4',
a346b1ff
S
616 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
617 'duration': 56,
0bc4ee60 618 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
619 },
620 'params': {
621 'skip_download': True,
622 }
9afa1770
S
623 }, {
624 # single video story with digitalData
625 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
626 'info_dict': {
627 'id': 'p02q6gc4',
628 'ext': 'flv',
629 'title': 'Sri Lanka’s spicy secret',
630 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
631 'timestamp': 1437674293,
632 'upload_date': '20150723',
633 },
634 'params': {
635 # rtmp download
636 'skip_download': True,
637 }
638 }, {
639 # single video story without digitalData
640 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
641 'info_dict': {
642 'id': 'p018zqqg',
55ebae26 643 'ext': 'mp4',
9afa1770
S
644 'title': 'Hyundai Santa Fe Sport: Rock star',
645 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
646 'timestamp': 1415867444,
647 'upload_date': '20141113',
9afa1770
S
648 },
649 'params': {
650 # rtmp download
651 'skip_download': True,
652 }
653 }, {
6a747190 654 # single video with playlist.sxml URL in playlist param
9afa1770
S
655 'url': 'http://www.bbc.com/sport/0/football/33653409',
656 'info_dict': {
657 'id': 'p02xycnp',
55ebae26 658 'ext': 'mp4',
9afa1770 659 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 660 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
661 'duration': 140,
662 },
663 'params': {
664 # rtmp download
665 'skip_download': True,
666 }
b5d48cb1 667 }, {
6a747190 668 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
669 'url': 'http://www.bbc.com/sport/0/football/34475836',
670 'info_dict': {
671 'id': '34475836',
450b233c 672 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
b5d48cb1
S
673 },
674 'playlist_count': 3,
450b233c
S
675 }, {
676 # school report article with single video
677 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
678 'info_dict': {
679 'id': '35744779',
680 'title': 'School which breaks down barriers in Jerusalem',
681 },
682 'playlist_count': 1,
9afa1770
S
683 }, {
684 # single video with playlist URL from weather section
685 'url': 'http://www.bbc.com/weather/features/33601775',
686 'only_matching': True,
687 }, {
688 # custom redirection to www.bbc.com
689 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
690 'only_matching': True,
10273d6e 691 }]
692
9afa1770
S
@classmethod
def suitable(cls, url):
    """Decline URLs that ``BBCCoUkIE`` or ``BBCCoUkArticleIE`` already handle.

    For every other URL the decision is delegated to the parent
    implementation.
    """
    if BBCCoUkIE.suitable(url) or BBCCoUkArticleIE.suitable(url):
        return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
696
def _extract_from_media_meta(self, media_meta, video_id):
    """Derive ``(formats, subtitles)`` from a media metadata dict.

    Sources are tried in order: direct links under ``sourceFiles``, a
    programme id under ``externalId`` (resolved through the media
    selector), then a playlist URL under ``href`` (processed as a legacy
    playlist). Returns two empty lists when none is present.
    """
    # Direct links to media in media metadata (e.g.
    # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
    # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
    source_files = media_meta.get('sourceFiles')
    if source_files:
        direct_formats = []
        for format_id, source in source_files.items():
            if not source.get('url'):
                continue
            direct_formats.append({
                'url': source['url'],
                'format_id': format_id,
                'ext': source.get('encoding'),
                'tbr': float_or_none(source.get('bitrate'), 1000),
                'filesize': int_or_none(source.get('filesize')),
            })
        return direct_formats, []

    programme_id = media_meta.get('externalId')
    if programme_id:
        return self._download_media_selector(programme_id)

    # Process playlist.sxml as legacy playlist
    href = media_meta.get('href')
    if not href:
        return [], []

    legacy_playlist = self._download_legacy_playlist_url(href)
    _id, _title, _description, _duration, formats, subtitles = \
        self._extract_from_legacy_playlist(legacy_playlist, video_id)
    return formats, subtitles
723
baf39a1a
S
def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
    """Build an info dict from the legacy playlist document at *url*.

    Formats are sorted with ``_sort_formats``; *timestamp* is passed
    through unchanged into the result.
    """
    extracted = self._process_legacy_playlist_url(url, playlist_id)
    programme_id, title, description, duration, formats, subtitles = extracted
    self._sort_formats(formats)
    info = {
        'id': programme_id,
        'title': title,
        'description': description,
        'duration': duration,
        'timestamp': timestamp,
        'formats': formats,
        'subtitles': subtitles,
    }
    return info
737
10273d6e 738 def _real_extract(self, url):
9afa1770
S
739 playlist_id = self._match_id(url)
740
741 webpage = self._download_webpage(url, playlist_id)
742
350e02d4
YCH
743 json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)
744 timestamp = json_ld_info.get('timestamp')
0e832c2c 745
350e02d4 746 playlist_title = json_ld_info.get('title')
0e832c2c
S
747 if not playlist_title:
748 playlist_title = self._og_search_title(
749 webpage, default=None) or self._html_search_regex(
750 r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
751 if playlist_title:
752 playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
753
754 playlist_description = json_ld_info.get(
755 'description') or self._og_search_description(webpage, default=None)
ae8bdfd1
S
756
757 if not timestamp:
758 timestamp = parse_iso8601(self._search_regex(
759 [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
760 r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
6f789365 761 r'"datePublished":\s*"([^"]+)'],
ae8bdfd1 762 webpage, 'date', default=None))
9afa1770 763
78f9d843
S
764 entries = []
765
de665713
S
766 # article with multiple videos embedded with playlist.sxml (e.g.
767 # http://www.bbc.com/sport/0/football/34475836)
768 playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
222e11d4 769 playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
de665713 770 if playlists:
baf39a1a
S
771 entries = [
772 self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
773 for playlist_url in playlists]
de939d89 774
78f9d843
S
775 # news article with multiple videos embedded with data-playable
776 data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
777 if data_playables:
778 for _, data_playable_json in data_playables:
779 data_playable = self._parse_json(
780 unescapeHTML(data_playable_json), playlist_id, fatal=False)
781 if not data_playable:
782 continue
baf39a1a
S
783 settings = data_playable.get('settings', {})
784 if settings:
78f9d843
S
785 # data-playable with video vpid in settings.playlistObject.items (e.g.
786 # http://www.bbc.com/news/world-us-canada-34473351)
baf39a1a
S
787 playlist_object = settings.get('playlistObject', {})
788 if playlist_object:
789 items = playlist_object.get('items')
790 if items and isinstance(items, list):
78f9d843
S
791 title = playlist_object['title']
792 description = playlist_object.get('summary')
baf39a1a
S
793 duration = int_or_none(items[0].get('duration'))
794 programme_id = items[0].get('vpid')
78f9d843
S
795 formats, subtitles = self._download_media_selector(programme_id)
796 self._sort_formats(formats)
797 entries.append({
798 'id': programme_id,
799 'title': title,
800 'description': description,
801 'timestamp': timestamp,
802 'duration': duration,
803 'formats': formats,
804 'subtitles': subtitles,
805 })
806 else:
807 # data-playable without vpid but with a playlist.sxml URLs
808 # in otherSettings.playlist (e.g.
809 # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
810 playlist = data_playable.get('otherSettings', {}).get('playlist', {})
811 if playlist:
812 entries.append(self._extract_from_playlist_sxml(
813 playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))
814
815 if entries:
78f9d843
S
816 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
817
818 # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
819 programme_id = self._search_regex(
22d7368d
S
820 [r'data-video-player-vpid="(%s)"' % self._ID_REGEX,
821 r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
822 r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
78f9d843 823 webpage, 'vpid', default=None)
dab062fb 824
9afa1770
S
825 if programme_id:
826 formats, subtitles = self._download_media_selector(programme_id)
827 self._sort_formats(formats)
828 # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
829 digital_data = self._parse_json(
830 self._search_regex(
831 r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
832 programme_id, fatal=False)
833 page_info = digital_data.get('page', {}).get('pageInfo', {})
834 title = page_info.get('pageName') or self._og_search_title(webpage)
835 description = page_info.get('description') or self._og_search_description(webpage)
836 timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
837 return {
838 'id': programme_id,
839 'title': title,
840 'description': description,
841 'timestamp': timestamp,
842 'formats': formats,
843 'subtitles': subtitles,
844 }
a3bfddfa 845
88ed52ae
S
846 def extract_all(pattern):
847 return list(filter(None, map(
848 lambda s: self._parse_json(s, playlist_id, fatal=False),
849 re.findall(pattern, webpage))))
850
851 # Multiple video article (e.g.
852 # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
22d7368d 853 EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
88ed52ae
S
854 entries = []
855 for match in extract_all(r'new\s+SMP\(({.+?})\)'):
856 embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
857 if embed_url and re.match(EMBED_URL, embed_url):
858 entries.append(embed_url)
859 entries.extend(re.findall(
860 r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
861 if entries:
862 return self.playlist_result(
863 [self.url_result(entry, 'BBCCoUk') for entry in entries],
864 playlist_id, playlist_title, playlist_description)
9afa1770
S
865
866 # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
88ed52ae 867 medias = extract_all(r"data-media-meta='({[^']+})'")
9afa1770
S
868
869 if not medias:
870 # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
a346b1ff
S
871 media_asset = self._search_regex(
872 r'mediaAssetPage\.init\(\s*({.+?}), "/',
873 webpage, 'media asset', default=None)
874 if media_asset:
875 media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
876 medias = []
877 for video in media_asset_page.get('videos', {}).values():
878 medias.extend(video.values())
879
880 if not medias:
881 # Multiple video playlist with single `now playing` entry (e.g.
882 # http://www.bbc.com/news/video_and_audio/must_see/33767813)
883 vxp_playlist = self._parse_json(
9afa1770 884 self._search_regex(
a346b1ff
S
885 r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
886 webpage, 'playlist data'),
9afa1770 887 playlist_id)
a346b1ff
S
888 playlist_medias = []
889 for item in vxp_playlist:
890 media = item.get('media')
891 if not media:
892 continue
893 playlist_medias.append(media)
894 # Download single video if found media with asset id matching the video id from URL
895 if item.get('advert', {}).get('assetId') == playlist_id:
896 medias = [media]
897 break
898 # Fallback to the whole playlist
899 if not medias:
900 medias = playlist_medias
9afa1770
S
901
902 entries = []
903 for num, media_meta in enumerate(medias, start=1):
904 formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
905 if not formats:
906 continue
10273d6e 907 self._sort_formats(formats)
908
9afa1770
S
909 video_id = media_meta.get('externalId')
910 if not video_id:
911 video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
912
913 title = media_meta.get('caption')
914 if not title:
915 title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
916
917 duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
da92eeae 918
9afa1770
S
919 images = []
920 for image in media_meta.get('images', {}).values():
921 images.extend(image.values())
922 if 'image' in media_meta:
923 images.append(media_meta['image'])
924
925 thumbnails = [{
926 'url': image.get('href'),
927 'width': int_or_none(image.get('width')),
928 'height': int_or_none(image.get('height')),
929 } for image in images]
930
931 entries.append({
932 'id': video_id,
10273d6e 933 'title': title,
9afa1770 934 'thumbnails': thumbnails,
10273d6e 935 'duration': duration,
9afa1770 936 'timestamp': timestamp,
10273d6e 937 'formats': formats,
938 'subtitles': subtitles,
a3bfddfa 939 })
10273d6e 940
9afa1770 941 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
942
943
class BBCCoUkArticleIE(InfoExtractor):
    """Extractor for bbc.co.uk programme article pages.

    An article page may embed several clips. Every clip URL found on the page
    is delegated to another extractor via ``url_result`` and the collection is
    returned as one playlist carrying the article's title and description.
    """

    # The host dots are escaped so that only the literal ``bbc.co.uk`` domain
    # matches (an unescaped ``.`` matches any character), and the ``www.``
    # prefix is optional, in line with BBCCoUkIE._VALID_URL.
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist of the clips embedded in the article at ``url``.

        The playlist id is the ``id`` group of ``_VALID_URL``; title and
        description are taken from the page's Open Graph metadata.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)

        # NOTE(review): the description lookup is presumably non-fatal and can
        # yield None when the page has no og:description; guard before calling
        # strip() so such pages do not crash with AttributeError.
        description = self._og_search_description(webpage)
        if description:
            description = description.strip()

        # Each clip is marked up as <div typeof="Clip" resource="<clip URL>">.
        entries = [
            self.url_result(programme_url)
            for programme_url in re.findall(
                r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)