]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/bbc.py
[youtube] Relax URL expansion in description
[yt-dlp.git] / youtube_dl / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
9afa1770 4import re
082c6c86 5
f13b1e7d 6from .common import InfoExtractor
8683b4d8
S
7from ..utils import (
8 ExtractorError,
9afa1770 9 float_or_none,
8683b4d8 10 int_or_none,
9afa1770
S
11 parse_duration,
12 parse_iso8601,
dab062fb 13 unescapeHTML,
8683b4d8 14)
36e6f62c
JMF
15from ..compat import (
16 compat_etree_fromstring,
17 compat_HTTPError,
18)
082c6c86 19
d12a1a47 20
f13b1e7d 21class BBCCoUkIE(InfoExtractor):
082c6c86 22 IE_NAME = 'bbc.co.uk'
2e3fd9ec 23 IE_DESC = 'BBC iPlayer'
22d7368d 24 _ID_REGEX = r'[pb][\da-z]{7}'
f20a11ed
S
25 _VALID_URL = r'''(?x)
26 https?://
27 (?:www\.)?bbc\.co\.uk/
28 (?:
29 programmes/(?!articles/)|
30 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
31 music/clips[/#]|
32 radio/player/
33 )
ded7511a 34 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
f20a11ed 35 ''' % _ID_REGEX
082c6c86 36
d12a1a47 37 _MEDIASELECTOR_URLS = [
26ccc68b
S
38 # Provides HQ HLS streams with even better quality than the pc mediaset but fails
39 # with geolocation in some cases even when it's not geo restricted at all (e.g.
d781e293 40 # http://www.bbc.co.uk/programmes/b06bp7lf). May also fail with selectionunavailable.
d1c694ea 41 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
42 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
43 ]
a8b081a0 44
e6174ee9
S
45 _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
46 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
47
48 _NAMESPACES = (
49 _MEDIASELECTION_NS,
50 _EMP_PLAYLIST_NS,
51 )
52
2e3fd9ec
S
53 _TESTS = [
54 {
f2d0fc68 55 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 56 'info_dict': {
f2d0fc68 57 'id': 'b039d07m',
2e3fd9ec 58 'ext': 'flv',
679bacf0 59 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4',
c4914185 60 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
61 },
62 'params': {
63 # rtmp download
64 'skip_download': True,
65 }
082c6c86 66 },
2e3fd9ec
S
67 {
68 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
69 'info_dict': {
70 'id': 'b00yng1d',
71 'ext': 'flv',
72 'title': 'The Man in Black: Series 3: The Printed Name',
73 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
74 'duration': 1800,
75 },
76 'params': {
77 # rtmp download
78 'skip_download': True,
c7f0177f
S
79 },
80 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
81 },
82 {
83 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
84 'info_dict': {
85 'id': 'b00yng1d',
86 'ext': 'flv',
17968e44 87 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 88 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 89 'duration': 5100,
2e3fd9ec
S
90 },
91 'params': {
92 # rtmp download
93 'skip_download': True,
94 },
95 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
96 },
97 {
98 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
99 'info_dict': {
100 'id': 'b03k3pb7',
101 'ext': 'flv',
102 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
103 'description': '2. Invasion',
104 'duration': 3600,
105 },
106 'params': {
107 # rtmp download
108 'skip_download': True,
109 },
110 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
111 }, {
112 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
113 'info_dict': {
114 'id': 'b04v209v',
115 'ext': 'flv',
116 'title': 'Pete Tong, The Essential New Tune Special',
117 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
118 'duration': 10800,
119 },
120 'params': {
121 # rtmp download
122 'skip_download': True,
a3ef0e1c
YCH
123 },
124 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 125 }, {
5aa535c3 126 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
127 'note': 'Audio',
128 'info_dict': {
5aa535c3 129 'id': 'p022h44j',
c7e67594 130 'ext': 'flv',
5aa535c3
S
131 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
132 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
133 'duration': 227,
c7e67594
S
134 },
135 'params': {
136 # rtmp download
137 'skip_download': True,
138 }
139 }, {
140 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
141 'note': 'Video',
142 'info_dict': {
143 'id': 'p025c103',
144 'ext': 'flv',
145 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
146 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
147 'duration': 226,
148 },
149 'params': {
150 # rtmp download
151 'skip_download': True,
152 }
e68ae99a
S
153 }, {
154 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
155 'info_dict': {
156 'id': 'p02n76xf',
157 'ext': 'flv',
158 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
159 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
160 'duration': 3540,
161 },
162 'params': {
163 # rtmp download
164 'skip_download': True,
165 },
166 'skip': 'geolocation',
25fa8d66
YCH
167 }, {
168 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
169 'info_dict': {
170 'id': 'b05zmgw1',
171 'ext': 'flv',
172 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
173 'title': 'Royal Academy Summer Exhibition',
174 'duration': 3540,
175 },
176 'params': {
177 # rtmp download
178 'skip_download': True,
179 },
180 'skip': 'geolocation',
54914380
S
181 }, {
182 # iptv-all mediaset fails with geolocation however there is no geo restriction
183 # for this programme at all
5aa535c3 184 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 185 'info_dict': {
5aa535c3 186 'id': 'b06rkms3',
54914380 187 'ext': 'flv',
5aa535c3
S
188 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
189 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
190 },
191 'params': {
192 # rtmp download
193 'skip_download': True,
194 },
573c3527 195 'skip': 'Now it\'s really geo-restricted',
1ac6e794
S
196 }, {
197 # compact player (https://github.com/rg3/youtube-dl/issues/8147)
198 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
199 'info_dict': {
200 'id': 'p028bfkj',
201 'ext': 'flv',
202 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
203 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
204 },
205 'params': {
206 # rtmp download
207 'skip_download': True,
208 },
31763975
S
209 }, {
210 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
211 'only_matching': True,
c7e67594
S
212 }, {
213 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
214 'only_matching': True,
0692ef86
S
215 }, {
216 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
217 'only_matching': True,
f20a11ed
S
218 }, {
219 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
220 'only_matching': True,
ae6986fb 221 }
2e3fd9ec
S
222 ]
223
d12a1a47
S
class MediaSelectionError(Exception):
    """Error reported inside a media selection document.

    ``id`` carries the identifier of the reported error so that callers
    can decide whether another media selector is worth trying.
    """

    def __init__(self, id):
        # The base class already keeps ``id`` in ``args``; initialising it
        # explicitly only makes that relationship visible.
        Exception.__init__(self, id)
        self.id = id
227
2e3fd9ec
S
228 def _extract_asx_playlist(self, connection, programme_id):
229 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
230 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
231
232 def _extract_connection(self, connection, programme_id):
233 formats = []
e6174ee9 234 kind = connection.get('kind')
2e3fd9ec
S
235 protocol = connection.get('protocol')
236 supplier = connection.get('supplier')
237 if protocol == 'http':
238 href = connection.get('href')
7a896817 239 transfer_format = connection.get('transferFormat')
2e3fd9ec
S
240 # ASX playlist
241 if supplier == 'asx':
242 for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
243 formats.append({
244 'url': ref,
245 'format_id': 'ref%s_%s' % (i, supplier),
246 })
7a896817
S
247 # Skip DASH until supported
248 elif transfer_format == 'dash':
249 pass
d1c694ea 250 elif transfer_format == 'hls':
7e5edcfd 251 formats.extend(self._extract_m3u8_formats(
d1c694ea 252 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
7e5edcfd 253 m3u8_id=supplier, fatal=False))
2e3fd9ec
S
254 # Direct link
255 else:
256 formats.append({
257 'url': href,
e6174ee9 258 'format_id': supplier or kind or protocol,
2e3fd9ec
S
259 })
260 elif protocol == 'rtmp':
261 application = connection.get('application', 'ondemand')
262 auth_string = connection.get('authString')
263 identifier = connection.get('identifier')
264 server = connection.get('server')
265 formats.append({
266 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
267 'play_path': identifier,
268 'app': '%s?%s' % (application, auth_string),
269 'page_url': 'http://www.bbc.co.uk',
270 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
271 'rtmp_live': False,
272 'ext': 'flv',
273 'format_id': supplier,
274 })
275 return formats
276
277 def _extract_items(self, playlist):
e6174ee9
S
278 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
279
280 def _findall_ns(self, element, xpath):
281 elements = []
282 for ns in self._NAMESPACES:
283 elements.extend(element.findall(xpath % ns))
284 return elements
2e3fd9ec
S
285
286 def _extract_medias(self, media_selection):
e6174ee9
S
287 error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
288 if error is None:
289 media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
c056efa2 290 if error is not None:
d12a1a47 291 raise BBCCoUkIE.MediaSelectionError(error.get('id'))
e6174ee9 292 return self._findall_ns(media_selection, './{%s}media')
2e3fd9ec
S
293
294 def _extract_connections(self, media):
e6174ee9 295 return self._findall_ns(media, './{%s}connection')
2e3fd9ec
S
296
def _extract_video(self, media, programme_id):
    """Build video format dicts for every connection of *media*.

    Each format produced by ``_extract_connection`` receives the media's
    width, height, bitrate (``vbr``), encoding (``vcodec``) and file size.
    When the media has a ``service`` attribute it is prepended to the
    format's ``format_id``.
    """
    service = media.get('service')
    media_fields = {
        'width': int_or_none(media.get('width')),
        'height': int_or_none(media.get('height')),
        'vbr': int_or_none(media.get('bitrate')),
        'vcodec': media.get('encoding'),
        'filesize': int_or_none(media.get('media_file_size')),
    }

    video_formats = []
    for conn in self._extract_connections(media):
        for fmt in self._extract_connection(conn, programme_id):
            fmt.update(media_fields)
            if service:
                fmt['format_id'] = '%s_%s' % (service, fmt['format_id'])
            video_formats.append(fmt)
    return video_formats
319
def _extract_audio(self, media, programme_id):
    """Build audio-only format dicts for every connection of *media*.

    Each format produced by ``_extract_connection`` is tagged with the
    media's bitrate (``abr``) and encoding (``acodec``) and marked as having
    no video stream (``vcodec`` set to ``'none'``).

    The ``service`` attribute is prepended to ``format_id`` only when it is
    present, the same way ``_extract_video`` does it.  Previously a media
    element without ``service`` produced ids such as ``None_<format_id>``.
    """
    formats = []
    abr = int_or_none(media.get('bitrate'))
    acodec = media.get('encoding')
    service = media.get('service')
    for connection in self._extract_connections(media):
        conn_formats = self._extract_connection(connection, programme_id)
        for fmt in conn_formats:
            fmt.update({
                'abr': abr,
                'acodec': acodec,
                'vcodec': 'none',
            })
            if service:
                fmt['format_id'] = '%s_%s' % (service, fmt['format_id'])
        formats.extend(conn_formats)
    return formats
336
f13b1e7d 337 def _get_subtitles(self, media, programme_id):
2e3fd9ec
S
338 subtitles = {}
339 for connection in self._extract_connections(media):
340 captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
341 lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
f13b1e7d
JMF
342 subtitles[lang] = [
343 {
344 'url': connection.get('href'),
345 'ext': 'ttml',
346 },
f13b1e7d 347 ]
2e3fd9ec 348 return subtitles
082c6c86 349
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Re-raise a media selection error as an expected ``ExtractorError``.

    The message names this extractor (``IE_NAME``) and the error id.
    """
    message = '%s returned error: %s' % (
        self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
354
c056efa2 355 def _download_media_selector(self, programme_id):
d12a1a47
S
356 last_exception = None
357 for mediaselector_url in self._MEDIASELECTOR_URLS:
358 try:
359 return self._download_media_selector_url(
360 mediaselector_url % programme_id, programme_id)
361 except BBCCoUkIE.MediaSelectionError as e:
d781e293 362 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
d12a1a47
S
363 last_exception = e
364 continue
365 self._raise_extractor_error(e)
366 self._raise_extractor_error(last_exception)
9afa1770
S
367
368 def _download_media_selector_url(self, url, programme_id=None):
c056efa2
S
369 try:
370 media_selection = self._download_xml(
9afa1770 371 url, programme_id, 'Downloading media selection XML')
c056efa2 372 except ExtractorError as ee:
d781e293 373 if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404):
36e6f62c 374 media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))
2e3fd9ec 375 else:
c056efa2 376 raise
9afa1770 377 return self._process_media_selector(media_selection, programme_id)
082c6c86 378
9afa1770 379 def _process_media_selector(self, media_selection, programme_id):
082c6c86 380 formats = []
2e3fd9ec
S
381 subtitles = None
382
c056efa2
S
383 for media in self._extract_medias(media_selection):
384 kind = media.get('kind')
385 if kind == 'audio':
386 formats.extend(self._extract_audio(media, programme_id))
387 elif kind == 'video':
388 formats.extend(self._extract_video(media, programme_id))
389 elif kind == 'captions':
f13b1e7d 390 subtitles = self.extract_subtitles(media, programme_id)
c056efa2 391 return formats, subtitles
2e3fd9ec 392
ae6986fb
S
393 def _download_playlist(self, playlist_id):
394 try:
395 playlist = self._download_json(
396 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
397 playlist_id, 'Downloading playlist JSON')
398
399 version = playlist.get('defaultAvailableVersion')
400 if version:
401 smp_config = version['smpConfig']
402 title = smp_config['title']
403 description = smp_config['summary']
404 for item in smp_config['items']:
405 kind = item['kind']
406 if kind != 'programme' and kind != 'radioProgramme':
407 continue
408 programme_id = item.get('vpid')
d97f5cd7 409 duration = int_or_none(item.get('duration'))
ae6986fb
S
410 formats, subtitles = self._download_media_selector(programme_id)
411 return programme_id, title, description, duration, formats, subtitles
412 except ExtractorError as ee:
f813928e 413 if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
ae6986fb
S
414 raise
415
416 # fallback to legacy playlist
9afa1770
S
417 return self._process_legacy_playlist(playlist_id)
418
419 def _process_legacy_playlist_url(self, url, display_id):
420 playlist = self._download_legacy_playlist_url(url, display_id)
421 return self._extract_from_legacy_playlist(playlist, display_id)
422
423 def _process_legacy_playlist(self, playlist_id):
424 return self._process_legacy_playlist_url(
425 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
426
427 def _download_legacy_playlist_url(self, url, playlist_id=None):
428 return self._download_xml(
429 url, playlist_id, 'Downloading legacy playlist XML')
ae6986fb 430
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract media info from a parsed legacy (EMP) playlist document.

    Returns a ``(programme_id, title, description, duration, formats,
    subtitles)`` tuple built from the ``programme`` / ``radioProgramme``
    items of the playlist.  When an item yields a programme id the media
    selector is queried for it; otherwise the item itself is processed as a
    media selection and *playlist_id* is used as the id.

    Raises:
        ExtractorError: when the playlist contains a ``noItems`` element;
            the message depends on its ``reason`` attribute.
    """
    no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First of the 'identifier' / 'group' attributes that looks like a
        # programme id, or None.
        for attr in ('identifier', 'group'):
            value = element.get(attr)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value
        return None

    def get_programme_id(item):
        # The original called get_from_attributes(item) and discarded the
        # result, so the item's own attributes were never used.  The
        # mediator keeps precedence (the behaviour that was effectively in
        # place); the item's attributes now act as the fallback.
        # NOTE(review): confirm this precedence is the intended one.
        mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
        if mediator is not None:
            mediator_id = get_from_attributes(mediator)
            if mediator_id:
                return mediator_id
        return get_from_attributes(item)

    for item in self._extract_items(playlist):
        kind = item.get('kind')
        if kind != 'programme' and kind != 'radioProgramme':
            continue
        title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
        description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

    # NOTE(review): the blame dump this was recovered from has lost its
    # indentation; confirm this return belongs after the loop.
    return programme_id, title, description, duration, formats, subtitles
474
c056efa2
S
475 def _real_extract(self, url):
476 group_id = self._match_id(url)
477
478 webpage = self._download_webpage(url, group_id, 'Downloading video page')
479
8683b4d8 480 programme_id = None
679bacf0 481 duration = None
8683b4d8
S
482
483 tviplayer = self._search_regex(
484 r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
485 webpage, 'player', default=None)
486
487 if tviplayer:
488 player = self._parse_json(tviplayer, group_id).get('player', {})
489 duration = int_or_none(player.get('duration'))
490 programme_id = player.get('vpid')
491
492 if not programme_id:
493 programme_id = self._search_regex(
22d7368d 494 r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
8683b4d8 495
c056efa2 496 if programme_id:
c056efa2 497 formats, subtitles = self._download_media_selector(programme_id)
88fb59d9 498 title = self._og_search_title(webpage, default=None) or self._html_search_regex(
50e989e2
S
499 (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
500 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
8683b4d8 501 description = self._search_regex(
a8534274
S
502 (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
503 r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
679bacf0
YCH
504 webpage, 'description', default=None)
505 if not description:
506 description = self._html_search_meta('description', webpage)
c056efa2 507 else:
ae6986fb 508 programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
2e3fd9ec 509
082c6c86
S
510 self._sort_formats(formats)
511
512 return {
2e3fd9ec 513 'id': programme_id,
082c6c86
S
514 'title': title,
515 'description': description,
650cfd0c 516 'thumbnail': self._og_search_thumbnail(webpage, default=None),
082c6c86
S
517 'duration': duration,
518 'formats': formats,
2e3fd9ec 519 'subtitles': subtitles,
5f6a1245 520 }
10273d6e 521
522
9afa1770
S
523class BBCIE(BBCCoUkIE):
524 IE_NAME = 'bbc'
525 IE_DESC = 'BBC'
526 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 527
d12a1a47 528 _MEDIASELECTOR_URLS = [
55ebae26
S
529 # Provides HQ HLS streams but fails with geolocation in some cases even when
530 # it's not geo restricted at all
531 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
532 # Provides more formats, namely direct mp4 links, but fails on some videos with
533 # notukerror for non UK (?) users (e.g.
534 # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
535 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
536 # Provides fewer formats, but works everywhere for everybody (hopefully)
537 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
538 ]
10273d6e 539
540 _TESTS = [{
6a747190 541 # article with multiple videos embedded with data-playable containing vpids
10273d6e 542 'url': 'http://www.bbc.com/news/world-europe-32668511',
543 'info_dict': {
544 'id': 'world-europe-32668511',
545 'title': 'Russia stages massive WW2 parade despite Western boycott',
9afa1770 546 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 547 },
548 'playlist_count': 2,
a3bfddfa 549 }, {
6a747190 550 # article with multiple videos embedded with data-playable (more videos)
10273d6e 551 'url': 'http://www.bbc.com/news/business-28299555',
552 'info_dict': {
553 'id': 'business-28299555',
554 'title': 'Farnborough Airshow: Video highlights',
9afa1770 555 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 556 },
557 'playlist_count': 9,
9afa1770 558 'skip': 'Save time',
88ed52ae
S
559 }, {
560 # article with multiple videos embedded with `new SMP()`
6a747190 561 # broken
88ed52ae
S
562 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
563 'info_dict': {
564 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 565 'title': 'BUGGER',
88ed52ae
S
566 },
567 'playlist_count': 18,
a3bfddfa 568 }, {
6a747190 569 # single video embedded with data-playable containing vpid
10273d6e 570 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 571 'info_dict': {
572 'id': 'p02mprgb',
55ebae26 573 'ext': 'mp4',
10273d6e 574 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 575 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 576 'duration': 47,
9afa1770 577 'timestamp': 1427219242,
da92eeae 578 'upload_date': '20150324',
10273d6e 579 },
580 'params': {
9afa1770 581 # rtmp download
10273d6e 582 'skip_download': True,
583 }
a3bfddfa 584 }, {
6a747190
S
585 # article with single video embedded with data-playable containing XML playlist
586 # with direct video links as progressiveDownloadUrl (for now these are extracted)
587 # and playlist with f4m and m3u8 as streamingUrl
de939d89 588 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 589 'info_dict': {
9afa1770 590 'id': '150615_telabyad_kentin_cogu',
de939d89 591 'ext': 'mp4',
9afa1770 592 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
9afa1770 593 'timestamp': 1434397334,
da92eeae 594 'upload_date': '20150615',
de939d89 595 },
596 'params': {
597 'skip_download': True,
598 }
c936d8cc 599 }, {
6a747190 600 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 601 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 602 'info_dict': {
9afa1770 603 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 604 'ext': 'mp4',
9afa1770 605 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
9afa1770 606 'timestamp': 1434713142,
da92eeae 607 'upload_date': '20150619',
de939d89 608 },
609 'params': {
610 'skip_download': True,
611 }
a346b1ff
S
612 }, {
613 # single video from video playlist embedded with vxp-playlist-data JSON
614 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
615 'info_dict': {
616 'id': 'p02w6qjc',
55ebae26 617 'ext': 'mp4',
a346b1ff
S
618 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
619 'duration': 56,
0bc4ee60 620 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
621 },
622 'params': {
623 'skip_download': True,
624 }
9afa1770
S
625 }, {
626 # single video story with digitalData
627 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
628 'info_dict': {
629 'id': 'p02q6gc4',
630 'ext': 'flv',
631 'title': 'Sri Lanka’s spicy secret',
632 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
633 'timestamp': 1437674293,
634 'upload_date': '20150723',
635 },
636 'params': {
637 # rtmp download
638 'skip_download': True,
639 }
640 }, {
641 # single video story without digitalData
642 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
643 'info_dict': {
644 'id': 'p018zqqg',
55ebae26 645 'ext': 'mp4',
9afa1770
S
646 'title': 'Hyundai Santa Fe Sport: Rock star',
647 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
648 'timestamp': 1415867444,
649 'upload_date': '20141113',
9afa1770
S
650 },
651 'params': {
652 # rtmp download
653 'skip_download': True,
654 }
655 }, {
6a747190 656 # single video with playlist.sxml URL in playlist param
9afa1770
S
657 'url': 'http://www.bbc.com/sport/0/football/33653409',
658 'info_dict': {
659 'id': 'p02xycnp',
55ebae26 660 'ext': 'mp4',
9afa1770 661 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 662 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
663 'duration': 140,
664 },
665 'params': {
666 # rtmp download
667 'skip_download': True,
668 }
b5d48cb1 669 }, {
6a747190 670 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
671 'url': 'http://www.bbc.com/sport/0/football/34475836',
672 'info_dict': {
673 'id': '34475836',
450b233c 674 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 675 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
676 },
677 'playlist_count': 3,
450b233c
S
678 }, {
679 # school report article with single video
680 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
681 'info_dict': {
682 'id': '35744779',
683 'title': 'School which breaks down barriers in Jerusalem',
684 },
685 'playlist_count': 1,
9afa1770
S
686 }, {
687 # single video with playlist URL from weather section
688 'url': 'http://www.bbc.com/weather/features/33601775',
689 'only_matching': True,
690 }, {
691 # custom redirection to www.bbc.com
692 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
693 'only_matching': True,
a1cf3e38
S
694 }, {
695 # single video article embedded with data-media-vpid
696 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
697 'only_matching': True,
10273d6e 698 }]
699
9afa1770
S
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    URLs claimed by any of the more specific BBC extractors listed below
    are rejected; everything else is decided by the inherited check.
    """
    more_specific = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
    for extractor in more_specific:
        if extractor.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
705
706 def _extract_from_media_meta(self, media_meta, video_id):
707 # Direct links to media in media metadata (e.g.
708 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
709 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
710 source_files = media_meta.get('sourceFiles')
711 if source_files:
712 return [{
713 'url': f['url'],
714 'format_id': format_id,
715 'ext': f.get('encoding'),
716 'tbr': float_or_none(f.get('bitrate'), 1000),
717 'filesize': int_or_none(f.get('filesize')),
718 } for format_id, f in source_files.items() if f.get('url')], []
719
720 programme_id = media_meta.get('externalId')
721 if programme_id:
722 return self._download_media_selector(programme_id)
723
724 # Process playlist.sxml as legacy playlist
725 href = media_meta.get('href')
726 if href:
727 playlist = self._download_legacy_playlist_url(href)
728 _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
729 return formats, subtitles
730
731 return [], []
732
baf39a1a
S
733 def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
734 programme_id, title, description, duration, formats, subtitles = \
735 self._process_legacy_playlist_url(url, playlist_id)
736 self._sort_formats(formats)
737 return {
738 'id': programme_id,
739 'title': title,
740 'description': description,
741 'duration': duration,
742 'timestamp': timestamp,
743 'formats': formats,
744 'subtitles': subtitles,
745 }
746
10273d6e 747 def _real_extract(self, url):
9afa1770
S
748 playlist_id = self._match_id(url)
749
750 webpage = self._download_webpage(url, playlist_id)
751
350e02d4
YCH
752 json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)
753 timestamp = json_ld_info.get('timestamp')
0e832c2c 754
350e02d4 755 playlist_title = json_ld_info.get('title')
0e832c2c
S
756 if not playlist_title:
757 playlist_title = self._og_search_title(
758 webpage, default=None) or self._html_search_regex(
759 r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
760 if playlist_title:
761 playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
762
763 playlist_description = json_ld_info.get(
764 'description') or self._og_search_description(webpage, default=None)
ae8bdfd1
S
765
766 if not timestamp:
767 timestamp = parse_iso8601(self._search_regex(
768 [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
769 r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
6f789365 770 r'"datePublished":\s*"([^"]+)'],
ae8bdfd1 771 webpage, 'date', default=None))
9afa1770 772
78f9d843
S
773 entries = []
774
de665713
S
775 # article with multiple videos embedded with playlist.sxml (e.g.
776 # http://www.bbc.com/sport/0/football/34475836)
777 playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
222e11d4 778 playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
de665713 779 if playlists:
baf39a1a
S
780 entries = [
781 self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
782 for playlist_url in playlists]
de939d89 783
78f9d843
S
784 # news article with multiple videos embedded with data-playable
785 data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
786 if data_playables:
787 for _, data_playable_json in data_playables:
788 data_playable = self._parse_json(
789 unescapeHTML(data_playable_json), playlist_id, fatal=False)
790 if not data_playable:
791 continue
baf39a1a
S
792 settings = data_playable.get('settings', {})
793 if settings:
78f9d843
S
794 # data-playable with video vpid in settings.playlistObject.items (e.g.
795 # http://www.bbc.com/news/world-us-canada-34473351)
baf39a1a
S
796 playlist_object = settings.get('playlistObject', {})
797 if playlist_object:
798 items = playlist_object.get('items')
799 if items and isinstance(items, list):
78f9d843
S
800 title = playlist_object['title']
801 description = playlist_object.get('summary')
baf39a1a
S
802 duration = int_or_none(items[0].get('duration'))
803 programme_id = items[0].get('vpid')
78f9d843
S
804 formats, subtitles = self._download_media_selector(programme_id)
805 self._sort_formats(formats)
806 entries.append({
807 'id': programme_id,
808 'title': title,
809 'description': description,
810 'timestamp': timestamp,
811 'duration': duration,
812 'formats': formats,
813 'subtitles': subtitles,
814 })
815 else:
816 # data-playable without vpid but with a playlist.sxml URLs
817 # in otherSettings.playlist (e.g.
818 # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
819 playlist = data_playable.get('otherSettings', {}).get('playlist', {})
820 if playlist:
821 entries.append(self._extract_from_playlist_sxml(
822 playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))
823
824 if entries:
78f9d843
S
825 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
826
827 # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
828 programme_id = self._search_regex(
a1cf3e38 829 [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
22d7368d
S
830 r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
831 r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
78f9d843 832 webpage, 'vpid', default=None)
dab062fb 833
9afa1770
S
834 if programme_id:
835 formats, subtitles = self._download_media_selector(programme_id)
836 self._sort_formats(formats)
837 # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
838 digital_data = self._parse_json(
839 self._search_regex(
840 r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
841 programme_id, fatal=False)
842 page_info = digital_data.get('page', {}).get('pageInfo', {})
843 title = page_info.get('pageName') or self._og_search_title(webpage)
844 description = page_info.get('description') or self._og_search_description(webpage)
845 timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
846 return {
847 'id': programme_id,
848 'title': title,
849 'description': description,
850 'timestamp': timestamp,
851 'formats': formats,
852 'subtitles': subtitles,
853 }
a3bfddfa 854
88ed52ae
S
855 def extract_all(pattern):
856 return list(filter(None, map(
857 lambda s: self._parse_json(s, playlist_id, fatal=False),
858 re.findall(pattern, webpage))))
859
860 # Multiple video article (e.g.
861 # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
22d7368d 862 EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
88ed52ae
S
863 entries = []
864 for match in extract_all(r'new\s+SMP\(({.+?})\)'):
865 embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
866 if embed_url and re.match(EMBED_URL, embed_url):
867 entries.append(embed_url)
868 entries.extend(re.findall(
869 r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
870 if entries:
871 return self.playlist_result(
872 [self.url_result(entry, 'BBCCoUk') for entry in entries],
873 playlist_id, playlist_title, playlist_description)
9afa1770
S
874
875 # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
88ed52ae 876 medias = extract_all(r"data-media-meta='({[^']+})'")
9afa1770
S
877
878 if not medias:
879 # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
a346b1ff
S
880 media_asset = self._search_regex(
881 r'mediaAssetPage\.init\(\s*({.+?}), "/',
882 webpage, 'media asset', default=None)
883 if media_asset:
884 media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
885 medias = []
886 for video in media_asset_page.get('videos', {}).values():
887 medias.extend(video.values())
888
889 if not medias:
890 # Multiple video playlist with single `now playing` entry (e.g.
891 # http://www.bbc.com/news/video_and_audio/must_see/33767813)
892 vxp_playlist = self._parse_json(
9afa1770 893 self._search_regex(
a346b1ff
S
894 r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
895 webpage, 'playlist data'),
9afa1770 896 playlist_id)
a346b1ff
S
897 playlist_medias = []
898 for item in vxp_playlist:
899 media = item.get('media')
900 if not media:
901 continue
902 playlist_medias.append(media)
903 # Download single video if found media with asset id matching the video id from URL
904 if item.get('advert', {}).get('assetId') == playlist_id:
905 medias = [media]
906 break
907 # Fallback to the whole playlist
908 if not medias:
909 medias = playlist_medias
9afa1770
S
910
911 entries = []
912 for num, media_meta in enumerate(medias, start=1):
913 formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
914 if not formats:
915 continue
10273d6e 916 self._sort_formats(formats)
917
9afa1770
S
918 video_id = media_meta.get('externalId')
919 if not video_id:
920 video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
921
922 title = media_meta.get('caption')
923 if not title:
924 title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
925
926 duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
da92eeae 927
9afa1770
S
928 images = []
929 for image in media_meta.get('images', {}).values():
930 images.extend(image.values())
931 if 'image' in media_meta:
932 images.append(media_meta['image'])
933
934 thumbnails = [{
935 'url': image.get('href'),
936 'width': int_or_none(image.get('width')),
937 'height': int_or_none(image.get('height')),
938 } for image in images]
939
940 entries.append({
941 'id': video_id,
10273d6e 942 'title': title,
9afa1770 943 'thumbnails': thumbnails,
10273d6e 944 'duration': duration,
9afa1770 945 'timestamp': timestamp,
10273d6e 946 'formats': formats,
947 'subtitles': subtitles,
a3bfddfa 948 })
10273d6e 949
9afa1770 950 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
951
952
class BBCCoUkArticleIE(InfoExtractor):
    """Turn a bbc.co.uk/programmes/articles page into a playlist of its clips."""

    # The hostname dots are escaped so they match literal dots only, and the
    # "www." prefix is optional like in the other bbc.co.uk extractors of
    # this file.
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Download the article page and return a playlist of its clips.

        The playlist id is the article id matched by _VALID_URL; title and
        description come from the page's Open Graph metadata.  Every
        <div typeof="Clip" resource="..."> element contributes one URL entry.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # NOTE(review): presumably the og:description lookup is non-fatal and
        # may yield None; guard so a missing description does not raise
        # AttributeError on .strip().
        description = self._og_search_description(webpage)
        if description is not None:
            description = description.strip()

        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
981
982
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Common playlist extraction for bbc.co.uk listing pages.

    This class reads self._URL_TEMPLATE and self._VIDEO_ID_TEMPLATE and calls
    self._extract_title_and_description(webpage); none of them are defined
    here, so subclasses have to supply them.
    """

    def _real_extract(self, url):
        """Collect every video id found on the page into a playlist result."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        # Fill the id placeholder of the subclass template with the shared
        # BBC programme id regex once, before scanning the page.
        video_id_pattern = self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX

        entries = []
        for video_id in re.findall(video_id_pattern, webpage):
            video_url = self._URL_TEMPLATE % video_id
            entries.append(self.url_result(video_url, BBCCoUkIE.ie_key()))

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(entries, playlist_id, title, description)
997
998
class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for bbc.co.uk/iplayer/episodes/<id> pages."""

    IE_NAME = 'bbc.co.uk:iplayer:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/episodes/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
    # Each id captured with _VIDEO_ID_TEMPLATE is substituted into
    # _URL_TEMPLATE by the base class to build the entry URL.
    _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
    _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
    _TEST = {
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 6,
    }

    def _extract_title_and_description(self, webpage):
        """Return a (title, description) tuple scraped from the page markup.

        The title is the text of the first <h1>; the description is the text
        of the <p class="subtitle"> element.  Both lookups are requested with
        fatal=False.
        """
        page_title = self._search_regex(
            r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
        page_description = self._search_regex(
            r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
            webpage, 'description', fatal=False, group='value')
        return page_title, page_description
1020
1021
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for the episodes/broadcasts/clips listings of a programme."""

    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Each id captured with _VIDEO_ID_TEMPLATE is substituted into
    # _URL_TEMPLATE by the base class to build the entry URL.
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [
        {
            'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
            'info_dict': {
                'id': 'b05rcz9v',
                'title': 'The Disappearance - Clips - BBC Four',
                'description': 'French thriller serial about a missing teenager.',
            },
            'playlist_mincount': 7,
        },
        {
            'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
            'only_matching': True,
        },
        {
            'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
            'only_matching': True,
        },
        {
            'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
            'only_matching': True,
        },
    ]

    def _extract_title_and_description(self, webpage):
        """Return a (title, description) tuple from the page's Open Graph metadata.

        The title lookup is explicitly requested with fatal=False.
        """
        og_title = self._og_search_title(webpage, fatal=False)
        og_description = self._og_search_description(webpage)
        return og_title, og_description