]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/bbc.py
[compat] Introduce compat_etree_Element
[yt-dlp.git] / youtube_dl / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
254e64a2 4import itertools
f0228f56
S
5import re
6import xml
082c6c86 7
f13b1e7d 8from .common import InfoExtractor
8683b4d8 9from ..utils import (
97067db2 10 clean_html,
9fb64c04 11 dict_get,
8683b4d8 12 ExtractorError,
9afa1770 13 float_or_none,
97067db2 14 get_element_by_class,
8683b4d8 15 int_or_none,
6d155707 16 js_to_json,
9afa1770
S
17 parse_duration,
18 parse_iso8601,
9fb64c04 19 try_get,
dab062fb 20 unescapeHTML,
f0228f56 21 url_or_none,
97067db2
S
22 urlencode_postdata,
23 urljoin,
8683b4d8 24)
36e6f62c 25from ..compat import (
36e6f62c 26 compat_HTTPError,
254e64a2 27 compat_urlparse,
36e6f62c 28)
082c6c86 29
d12a1a47 30
f13b1e7d 31class BBCCoUkIE(InfoExtractor):
082c6c86 32 IE_NAME = 'bbc.co.uk'
2e3fd9ec 33 IE_DESC = 'BBC iPlayer'
6f356cbb 34 _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})'
f20a11ed
S
35 _VALID_URL = r'''(?x)
36 https?://
37 (?:www\.)?bbc\.co\.uk/
38 (?:
39 programmes/(?!articles/)|
40 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
72d256c4 41 music/(?:clips|audiovideo/popular)[/#]|
d3d45e0a
S
42 radio/player/|
43 events/[^/]+/play/[^/]+/
f20a11ed 44 )
ded7511a 45 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
f20a11ed 46 ''' % _ID_REGEX
082c6c86 47
97067db2
S
48 _LOGIN_URL = 'https://account.bbc.com/signin'
49 _NETRC_MACHINE = 'bbc'
50
d12a1a47 51 _MEDIASELECTOR_URLS = [
26ccc68b
S
52 # Provides HQ HLS streams with even better quality than pc mediaset but fails
53 # with geolocation in some cases when it's not even geo restricted at all (e.g.
d781e293 54 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
d1c694ea 55 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
56 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
57 ]
a8b081a0 58
e6174ee9
S
59 _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
60 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
61
62 _NAMESPACES = (
63 _MEDIASELECTION_NS,
64 _EMP_PLAYLIST_NS,
65 )
66
2e3fd9ec
S
67 _TESTS = [
68 {
f2d0fc68 69 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 70 'info_dict': {
f2d0fc68 71 'id': 'b039d07m',
b1ea6802 72 'ext': 'flv',
679bacf0 73 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4',
c4914185 74 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
75 },
76 'params': {
b1ea6802 77 # rtmp download
2e3fd9ec
S
78 'skip_download': True,
79 }
082c6c86 80 },
2e3fd9ec
S
81 {
82 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
83 'info_dict': {
84 'id': 'b00yng1d',
85 'ext': 'flv',
86 'title': 'The Man in Black: Series 3: The Printed Name',
87 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
88 'duration': 1800,
89 },
90 'params': {
91 # rtmp download
92 'skip_download': True,
c7f0177f
S
93 },
94 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
95 },
96 {
97 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
98 'info_dict': {
99 'id': 'b00yng1d',
100 'ext': 'flv',
17968e44 101 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 102 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 103 'duration': 5100,
2e3fd9ec
S
104 },
105 'params': {
106 # rtmp download
107 'skip_download': True,
108 },
b1ea6802 109 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
110 },
111 {
112 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
113 'info_dict': {
114 'id': 'b03k3pb7',
115 'ext': 'flv',
116 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
117 'description': '2. Invasion',
118 'duration': 3600,
119 },
120 'params': {
121 # rtmp download
122 'skip_download': True,
123 },
b1ea6802 124 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
125 }, {
126 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
127 'info_dict': {
128 'id': 'b04v209v',
129 'ext': 'flv',
130 'title': 'Pete Tong, The Essential New Tune Special',
131 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
132 'duration': 10800,
133 },
134 'params': {
135 # rtmp download
136 'skip_download': True,
a3ef0e1c
YCH
137 },
138 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 139 }, {
5aa535c3 140 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
141 'note': 'Audio',
142 'info_dict': {
5aa535c3 143 'id': 'p022h44j',
b1ea6802 144 'ext': 'flv',
5aa535c3
S
145 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
146 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
147 'duration': 227,
c7e67594
S
148 },
149 'params': {
b1ea6802 150 # rtmp download
c7e67594
S
151 'skip_download': True,
152 }
153 }, {
154 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
155 'note': 'Video',
156 'info_dict': {
157 'id': 'p025c103',
b1ea6802 158 'ext': 'flv',
c7e67594
S
159 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
160 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
161 'duration': 226,
162 },
163 'params': {
b1ea6802 164 # rtmp download
c7e67594
S
165 'skip_download': True,
166 }
e68ae99a
S
167 }, {
168 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
169 'info_dict': {
170 'id': 'p02n76xf',
171 'ext': 'flv',
172 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
173 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
174 'duration': 3540,
175 },
176 'params': {
177 # rtmp download
178 'skip_download': True,
179 },
b1ea6802 180 'skip': 'geolocation',
25fa8d66
YCH
181 }, {
182 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
183 'info_dict': {
184 'id': 'b05zmgw1',
185 'ext': 'flv',
186 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
187 'title': 'Royal Academy Summer Exhibition',
188 'duration': 3540,
189 },
190 'params': {
191 # rtmp download
192 'skip_download': True,
193 },
b1ea6802 194 'skip': 'geolocation',
54914380
S
195 }, {
196 # iptv-all mediaset fails with geolocation however there is no geo restriction
197 # for this programme at all
5aa535c3 198 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 199 'info_dict': {
5aa535c3 200 'id': 'b06rkms3',
54914380 201 'ext': 'flv',
5aa535c3
S
202 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
203 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
204 },
205 'params': {
206 # rtmp download
207 'skip_download': True,
208 },
b1ea6802 209 'skip': 'Now it\'s really geo-restricted',
1ac6e794
S
210 }, {
211 # compact player (https://github.com/rg3/youtube-dl/issues/8147)
212 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
213 'info_dict': {
214 'id': 'p028bfkj',
b1ea6802 215 'ext': 'flv',
1ac6e794
S
216 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
217 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
218 },
219 'params': {
b1ea6802 220 # rtmp download
1ac6e794
S
221 'skip_download': True,
222 },
31763975
S
223 }, {
224 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
225 'only_matching': True,
c7e67594
S
226 }, {
227 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
228 'only_matching': True,
0692ef86
S
229 }, {
230 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
231 'only_matching': True,
f20a11ed
S
232 }, {
233 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
234 'only_matching': True,
72d256c4
S
235 }, {
236 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
237 'only_matching': True,
53647dfd
S
238 }, {
239 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
240 'only_matching': True,
6f356cbb
S
241 }, {
242 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
243 'only_matching': True,
244 }, {
245 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
246 'only_matching': True,
72d256c4 247 }]
2e3fd9ec 248
97eb9bd2
RA
249 _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
250
97067db2
S
def _login(self):
    """Sign in to the BBC account service when credentials are configured.

    Does nothing if ``_get_login_info`` yields no username. Otherwise the
    sign-in page is fetched, its hidden inputs are merged with the
    credentials and the result is posted to the form's action URL.

    Raises:
        ExtractorError: if the final URL after posting still contains
            ``_LOGIN_URL``; the message includes the page's
            ``form-message`` text when one is present.
    """
    username, password = self._get_login_info()
    if username is None:
        return

    signin_url = self._LOGIN_URL
    signin_page = self._download_webpage(
        signin_url, None, 'Downloading signin page')

    form_data = self._hidden_inputs(signin_page)
    form_data['username'] = username
    form_data['password'] = password

    # Fall back to the sign-in URL itself when the form has no action.
    action = self._search_regex(
        r'<form[^>]+action=(["\'])(?P<url>.+?)\1', signin_page,
        'post url', default=signin_url, group='url')
    post_url = urljoin(signin_url, action)

    response, urlh = self._download_webpage_handle(
        post_url, None, 'Logging in', data=urlencode_postdata(form_data),
        headers={'Referer': signin_url})

    # A final URL that no longer contains the sign-in URL counts as success.
    if signin_url not in urlh.geturl():
        return

    error = clean_html(get_element_by_class('form-message', response))
    if error:
        raise ExtractorError(
            'Unable to login: %s' % error, expected=True)
    raise ExtractorError('Unable to log in')
280
def _real_initialize(self):
    """Initialization hook: perform the (optional) account login."""
    self._login()
283
d12a1a47
S
class MediaSelectionError(Exception):
    """Error reported by a media selection document.

    Attributes:
        id: the identifier passed to the constructor.
    """

    def __init__(self, id):
        # ``id`` shadows the builtin but is the established signature.
        Exception.__init__(self, id)
        self.id = id
287
2e3fd9ec
S
def _extract_asx_playlist(self, connection, programme_id):
    """Download the ASX playlist a connection points to and list its refs.

    Returns:
        list with the ``href`` attribute of every ``./Entry/ref`` element
        of the downloaded document, in document order.
    """
    playlist_url = connection.get('href')
    asx = self._download_xml(
        playlist_url, programme_id, 'Downloading ASX playlist')
    hrefs = []
    for ref in asx.findall('./Entry/ref'):
        hrefs.append(ref.get('href'))
    return hrefs
291
2e3fd9ec 292 def _extract_items(self, playlist):
e6174ee9
S
293 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
294
295 def _findall_ns(self, element, xpath):
296 elements = []
297 for ns in self._NAMESPACES:
298 elements.extend(element.findall(xpath % ns))
299 return elements
2e3fd9ec
S
300
301 def _extract_medias(self, media_selection):
e6174ee9
S
302 error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
303 if error is None:
304 media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
c056efa2 305 if error is not None:
d12a1a47 306 raise BBCCoUkIE.MediaSelectionError(error.get('id'))
e6174ee9 307 return self._findall_ns(media_selection, './{%s}media')
2e3fd9ec
S
308
309 def _extract_connections(self, media):
e6174ee9 310 return self._findall_ns(media, './{%s}connection')
2e3fd9ec 311
def _get_subtitles(self, media, programme_id):
    """Build the subtitles mapping for a captions *media* element.

    Every connection whose ``href`` passes ``url_or_none`` is downloaded
    non-fatally so the caption document's ``xml:lang`` attribute can be
    read (default ``'en'``).

    Args:
        media: XML element whose connections point at caption documents.
        programme_id: identifier passed to the download helper.

    Returns:
        dict mapping a language code to a one-element list holding
        ``{'url': ..., 'ext': 'ttml'}``. A later connection with the same
        language replaces an earlier one.
    """
    # Importing only the ``xml`` package does not guarantee that the
    # ``xml.etree.ElementTree`` submodule is loaded; import it explicitly
    # instead of relying on some other module having done so.
    import xml.etree.ElementTree

    subtitles = {}
    for connection in self._extract_connections(media):
        cc_url = url_or_none(connection.get('href'))
        if not cc_url:
            continue
        captions = self._download_xml(
            cc_url, programme_id, 'Downloading captions', fatal=False)
        # NOTE(review): presumably a failed non-fatal download yields a
        # non-element value; anything that is not an element is skipped.
        if not isinstance(captions, xml.etree.ElementTree.Element):
            continue
        lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
        subtitles[lang] = [
            {
                # Emit the validated URL that was actually downloaded
                # rather than the raw attribute value.
                'url': cc_url,
                'ext': 'ttml',
            },
        ]
    return subtitles
082c6c86 330
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Raise an expected ExtractorError describing a media selection error.

    The message combines ``IE_NAME`` with the error's ``id``.
    """
    message = '%s returned error: %s' % (
        self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
335
def _download_media_selector(self, programme_id):
    """Try each media selector endpoint in order and return the first result.

    A MediaSelectionError whose id is ``notukerror``, ``geolocation`` or
    ``selectionunavailable`` moves on to the next URL template in
    ``_MEDIASELECTOR_URLS``; any other id is raised immediately as an
    ExtractorError. When every endpoint fails, the last such error is
    raised as an ExtractorError.
    """
    retryable_ids = ('notukerror', 'geolocation', 'selectionunavailable')
    last_error = None
    for url_template in self._MEDIASELECTOR_URLS:
        selector_url = url_template % programme_id
        try:
            return self._download_media_selector_url(selector_url, programme_id)
        except BBCCoUkIE.MediaSelectionError as err:
            if err.id not in retryable_ids:
                self._raise_extractor_error(err)
            last_error = err
    self._raise_extractor_error(last_error)
9afa1770
S
348
def _download_media_selector_url(self, url, programme_id=None):
    """Download one media selection XML document and process it.

    HTTP statuses 403 and 404 are passed to the downloader as expected
    statuses. Returns whatever ``_process_media_selector`` returns for the
    downloaded document.
    """
    selection_xml = self._download_xml(
        url, programme_id, 'Downloading media selection XML',
        expected_status=(403, 404))
    return self._process_media_selector(selection_xml, programme_id)
082c6c86 354
def _process_media_selector(self, media_selection, programme_id):
    """Turn a media selection element into formats and subtitles.

    Args:
        media_selection: XML element whose ``media`` children are read.
        programme_id: identifier passed on to the download helpers.

    Returns:
        tuple ``(formats, subtitles)``. ``formats`` is a list of format
        dicts; ``subtitles`` is the result of ``extract_subtitles`` for the
        last ``captions`` media seen, or None when there was none.

    Raises:
        MediaSelectionError: propagated from ``_extract_medias``.
    """
    formats = []
    subtitles = None
    # hrefs already handled, used to skip duplicate connections
    urls = []

    for media in self._extract_medias(media_selection):
        kind = media.get('kind')
        if kind in ('video', 'audio'):
            bitrate = int_or_none(media.get('bitrate'))
            encoding = media.get('encoding')
            service = media.get('service')
            width = int_or_none(media.get('width'))
            height = int_or_none(media.get('height'))
            file_size = int_or_none(media.get('media_file_size'))
            for connection in self._extract_connections(media):
                href = connection.get('href')
                if href in urls:
                    continue
                # Connections without an href are never recorded, so they
                # are not de-duplicated against each other.
                if href:
                    urls.append(href)
                conn_kind = connection.get('kind')
                protocol = connection.get('protocol')
                supplier = connection.get('supplier')
                transfer_format = connection.get('transferFormat')
                # First available of supplier / connection kind / protocol,
                # prefixed with the media's service when there is one.
                format_id = supplier or conn_kind or protocol
                if service:
                    format_id = '%s_%s' % (service, format_id)
                # ASX playlist
                if supplier == 'asx':
                    for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
                        formats.append({
                            'url': ref,
                            'format_id': 'ref%s_%s' % (i, format_id),
                        })
                elif transfer_format == 'dash':
                    formats.extend(self._extract_mpd_formats(
                        href, programme_id, mpd_id=format_id, fatal=False))
                elif transfer_format == 'hls':
                    formats.extend(self._extract_m3u8_formats(
                        href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                        m3u8_id=format_id, fatal=False))
                    # For URLs matching _USP_RE, also try the rewritten
                    # '/<name>.ism/<name>.m3u8' playlist.
                    if re.search(self._USP_RE, href):
                        usp_formats = self._extract_m3u8_formats(
                            re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href),
                            programme_id, ext='mp4', entry_protocol='m3u8_native',
                            m3u8_id=format_id, fatal=False)
                        for f in usp_formats:
                            # Drop variants from the rewritten playlist that
                            # are taller than 720 pixels.
                            if f.get('height') and f['height'] > 720:
                                continue
                            formats.append(f)
                elif transfer_format == 'hds':
                    formats.extend(self._extract_f4m_formats(
                        href, programme_id, f4m_id=format_id, fatal=False))
                else:
                    # No service or supplier in the id: append the bitrate
                    # to the format id.
                    if not service and not supplier and bitrate:
                        format_id += '-%d' % bitrate
                    fmt = {
                        'format_id': format_id,
                        'filesize': file_size,
                    }
                    if kind == 'video':
                        fmt.update({
                            'width': width,
                            'height': height,
                            'tbr': bitrate,
                            'vcodec': encoding,
                        })
                    else:
                        fmt.update({
                            'abr': bitrate,
                            'acodec': encoding,
                            'vcodec': 'none',
                        })
                    if protocol in ('http', 'https'):
                        # Direct link
                        fmt.update({
                            'url': href,
                        })
                    elif protocol == 'rtmp':
                        # RTMP connections are described by separate
                        # attributes instead of an href.
                        application = connection.get('application', 'ondemand')
                        auth_string = connection.get('authString')
                        identifier = connection.get('identifier')
                        server = connection.get('server')
                        fmt.update({
                            'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                            'play_path': identifier,
                            'app': '%s?%s' % (application, auth_string),
                            'page_url': 'http://www.bbc.co.uk',
                            'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                            'rtmp_live': False,
                            'ext': 'flv',
                        })
                    else:
                        # Any other protocol: skip this connection.
                        continue
                    formats.append(fmt)
        elif kind == 'captions':
            subtitles = self.extract_subtitles(media, programme_id)
    return formats, subtitles
2e3fd9ec 453
ae6986fb
S
def _download_playlist(self, playlist_id):
    """Resolve a programme through its playlist JSON, else the legacy playlist.

    The JSON playlist's ``defaultAvailableVersion`` is scanned for items of
    kind ``programme`` or ``radioProgramme``; the media selector is queried
    for each one and the last one wins. The legacy XML playlist is used
    when the JSON request fails with HTTP 404, when there is no default
    available version, or when no item of a supported kind exists.
    Previously the last case failed on unbound local variables.

    Args:
        playlist_id: programme/playlist identifier.

    Returns:
        tuple ``(programme_id, title, description, duration, formats,
        subtitles)``.

    Raises:
        ExtractorError: any download error other than an HTTP 404.
    """
    try:
        playlist = self._download_json(
            'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
            playlist_id, 'Downloading playlist JSON')

        version = playlist.get('defaultAvailableVersion')
        if version:
            smp_config = version['smpConfig']
            title = smp_config['title']
            description = smp_config['summary']
            result = None
            for item in smp_config['items']:
                kind = item['kind']
                if kind not in ('programme', 'radioProgramme'):
                    continue
                programme_id = item.get('vpid')
                duration = int_or_none(item.get('duration'))
                formats, subtitles = self._download_media_selector(programme_id)
                result = (
                    programme_id, title, description, duration,
                    formats, subtitles)
            # Only return when a playable item was actually found;
            # otherwise fall through to the legacy playlist below.
            if result is not None:
                return result
    except ExtractorError as ee:
        if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
            raise

    # fallback to legacy playlist
    return self._process_legacy_playlist(playlist_id)
479
480 def _process_legacy_playlist_url(self, url, display_id):
481 playlist = self._download_legacy_playlist_url(url, display_id)
482 return self._extract_from_legacy_playlist(playlist, display_id)
483
484 def _process_legacy_playlist(self, playlist_id):
485 return self._process_legacy_playlist_url(
486 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
487
def _download_legacy_playlist_url(self, url, playlist_id=None):
    """Fetch *url* as XML, reported as the legacy playlist download."""
    note = 'Downloading legacy playlist XML'
    return self._download_xml(url, playlist_id, note)
ae6986fb 491
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme data from a legacy (EMP namespace) playlist.

    Args:
        playlist: XML element of the legacy playlist.
        playlist_id: identifier used in messages and as the fallback
            programme id.

    Returns:
        tuple ``(programme_id, title, description, duration, formats,
        subtitles)`` for the last item of kind ``programme`` or
        ``radioProgramme``.

    Raises:
        ExtractorError: (expected) when the playlist has a ``noItems``
            element; the message depends on its ``reason`` attribute.
    """
    no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First of the 'identifier' / 'group' attributes that looks like
        # a programme id, or None.
        for p in ('identifier', 'group'):
            value = element.get(p)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value
        return None

    def get_programme_id(item):
        # Prefer the item's own attributes. The original computed this
        # value and then discarded it, so only the mediator was used.
        value = get_from_attributes(item)
        if value:
            return value
        mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
        if mediator is not None:
            return get_from_attributes(mediator)
        return None

    # NOTE(review): when no item has a supported kind, the names returned
    # below are never bound and the return raises UnboundLocalError.
    for item in self._extract_items(playlist):
        kind = item.get('kind')
        if kind not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
        description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No usable id: treat the item itself as the media selection.
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

    return programme_id, title, description, duration, formats, subtitles
535
c056efa2
S
def _real_extract(self, url):
    """Extract a single programme from a bbc.co.uk page.

    The programme id (vpid) is taken from the ``mediator.bind`` player
    JSON when present, otherwise from a ``"vpid"`` string in the page. If
    neither yields an id, the playlist for the URL's id is used instead.

    Raises:
        ExtractorError: (expected) with the page's ``smp__message delta``
            text when the page shows one.
    """
    group_id = self._match_id(url)

    webpage = self._download_webpage(url, group_id, 'Downloading video page')

    page_error = self._search_regex(
        r'<div\b[^>]+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<',
        webpage, 'error', default=None)
    if page_error:
        raise ExtractorError(page_error, expected=True)

    programme_id = duration = None

    player_json = self._search_regex(
        r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
        webpage, 'player', default=None)
    if player_json:
        player = self._parse_json(player_json, group_id).get('player', {})
        duration = int_or_none(player.get('duration'))
        programme_id = player.get('vpid')

    if not programme_id:
        vpid_re = r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX
        programme_id = self._search_regex(
            vpid_re, webpage, 'vpid', fatal=False, default=None)

    if not programme_id:
        # No vpid on the page: resolve everything through the playlist.
        (programme_id, title, description,
         duration, formats, subtitles) = self._download_playlist(group_id)
    else:
        formats, subtitles = self._download_media_selector(programme_id)

        title = self._og_search_title(webpage, default=None)
        if not title:
            title = self._html_search_regex(
                (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
                 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')

        description = self._search_regex(
            (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
             r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
            webpage, 'description', default=None)
        if not description:
            description = self._html_search_meta('description', webpage)

    self._sort_formats(formats)

    thumbnail = self._og_search_thumbnail(webpage, default=None)
    return {
        'id': programme_id,
        'title': title,
        'description': description,
        'thumbnail': thumbnail,
        'duration': duration,
        'formats': formats,
        'subtitles': subtitles,
    }
10273d6e 588
589
9afa1770
S
590class BBCIE(BBCCoUkIE):
591 IE_NAME = 'bbc'
592 IE_DESC = 'BBC'
593 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 594
d12a1a47 595 _MEDIASELECTOR_URLS = [
55ebae26
S
596 # Provides HQ HLS streams but fails with geolocation in some cases when it's
597 # not even geo restricted at all
598 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
599 # Provides more formats, namely direct mp4 links, but fails on some videos with
600 # notukerror for non UK (?) users (e.g.
601 # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
602 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
603 # Provides fewer formats, but works everywhere for everybody (hopefully)
604 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
605 ]
10273d6e 606
607 _TESTS = [{
6a747190 608 # article with multiple videos embedded with data-playable containing vpids
10273d6e 609 'url': 'http://www.bbc.com/news/world-europe-32668511',
610 'info_dict': {
611 'id': 'world-europe-32668511',
612 'title': 'Russia stages massive WW2 parade despite Western boycott',
9afa1770 613 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 614 },
615 'playlist_count': 2,
a3bfddfa 616 }, {
6a747190 617 # article with multiple videos embedded with data-playable (more videos)
10273d6e 618 'url': 'http://www.bbc.com/news/business-28299555',
619 'info_dict': {
620 'id': 'business-28299555',
621 'title': 'Farnborough Airshow: Video highlights',
9afa1770 622 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 623 },
624 'playlist_count': 9,
9afa1770 625 'skip': 'Save time',
88ed52ae
S
626 }, {
627 # article with multiple videos embedded with `new SMP()`
6a747190 628 # broken
88ed52ae
S
629 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
630 'info_dict': {
631 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 632 'title': 'BUGGER',
88ed52ae
S
633 },
634 'playlist_count': 18,
a3bfddfa 635 }, {
6a747190 636 # single video embedded with data-playable containing vpid
10273d6e 637 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 638 'info_dict': {
639 'id': 'p02mprgb',
55ebae26 640 'ext': 'mp4',
10273d6e 641 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 642 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 643 'duration': 47,
9afa1770 644 'timestamp': 1427219242,
da92eeae 645 'upload_date': '20150324',
10273d6e 646 },
647 'params': {
9afa1770 648 # rtmp download
10273d6e 649 'skip_download': True,
650 }
a3bfddfa 651 }, {
6a747190
S
652 # article with single video embedded with data-playable containing XML playlist
653 # with direct video links as progressiveDownloadUrl (for now these are extracted)
654 # and playlist with f4m and m3u8 as streamingUrl
de939d89 655 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 656 'info_dict': {
9afa1770 657 'id': '150615_telabyad_kentin_cogu',
de939d89 658 'ext': 'mp4',
ad152e2d 659 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 660 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 661 'timestamp': 1434397334,
da92eeae 662 'upload_date': '20150615',
de939d89 663 },
664 'params': {
665 'skip_download': True,
666 }
c936d8cc 667 }, {
6a747190 668 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 669 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 670 'info_dict': {
9afa1770 671 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 672 'ext': 'mp4',
9afa1770 673 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 674 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 675 'timestamp': 1434713142,
da92eeae 676 'upload_date': '20150619',
de939d89 677 },
678 'params': {
679 'skip_download': True,
680 }
a346b1ff
S
681 }, {
682 # single video from video playlist embedded with vxp-playlist-data JSON
683 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
684 'info_dict': {
685 'id': 'p02w6qjc',
55ebae26 686 'ext': 'mp4',
a346b1ff
S
687 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
688 'duration': 56,
0bc4ee60 689 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
690 },
691 'params': {
692 'skip_download': True,
693 }
9afa1770
S
694 }, {
695 # single video story with digitalData
696 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
697 'info_dict': {
698 'id': 'p02q6gc4',
699 'ext': 'flv',
700 'title': 'Sri Lanka’s spicy secret',
701 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
702 'timestamp': 1437674293,
703 'upload_date': '20150723',
704 },
705 'params': {
706 # rtmp download
707 'skip_download': True,
708 }
709 }, {
710 # single video story without digitalData
711 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
712 'info_dict': {
713 'id': 'p018zqqg',
55ebae26 714 'ext': 'mp4',
9afa1770
S
715 'title': 'Hyundai Santa Fe Sport: Rock star',
716 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
717 'timestamp': 1415867444,
718 'upload_date': '20141113',
9afa1770
S
719 },
720 'params': {
721 # rtmp download
722 'skip_download': True,
723 }
9fb64c04
S
724 }, {
725 # single video embedded with Morph
726 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
727 'info_dict': {
728 'id': 'p041vhd0',
729 'ext': 'mp4',
730 'title': "Nigeria v Japan - Men's First Round",
731 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
732 'duration': 7980,
733 'uploader': 'BBC Sport',
734 'uploader_id': 'bbc_sport',
735 },
736 'params': {
737 # m3u8 download
738 'skip_download': True,
9fb64c04
S
739 },
740 'skip': 'Georestricted to UK',
9afa1770 741 }, {
6a747190 742 # single video with playlist.sxml URL in playlist param
9afa1770
S
743 'url': 'http://www.bbc.com/sport/0/football/33653409',
744 'info_dict': {
745 'id': 'p02xycnp',
55ebae26 746 'ext': 'mp4',
9afa1770 747 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 748 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
749 'duration': 140,
750 },
751 'params': {
752 # rtmp download
753 'skip_download': True,
754 }
b5d48cb1 755 }, {
6a747190 756 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
757 'url': 'http://www.bbc.com/sport/0/football/34475836',
758 'info_dict': {
759 'id': '34475836',
450b233c 760 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 761 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
762 },
763 'playlist_count': 3,
450b233c
S
764 }, {
765 # school report article with single video
766 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
767 'info_dict': {
768 'id': '35744779',
769 'title': 'School which breaks down barriers in Jerusalem',
770 },
771 'playlist_count': 1,
9afa1770
S
772 }, {
773 # single video with playlist URL from weather section
774 'url': 'http://www.bbc.com/weather/features/33601775',
775 'only_matching': True,
776 }, {
777 # custom redirection to www.bbc.com
778 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
779 'only_matching': True,
a1cf3e38
S
780 }, {
781 # single video article embedded with data-media-vpid
782 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
783 'only_matching': True,
6d155707
S
784 }, {
785 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
786 'info_dict': {
787 'id': 'p06556y7',
788 'ext': 'mp4',
789 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
790 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd',
791 },
792 'params': {
793 'skip_download': True,
794 }
b96b4be4
RA
795 }, {
796 # window.__PRELOADED_STATE__
797 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
798 'info_dict': {
799 'id': 'b0b9z4vz',
800 'ext': 'mp4',
801 'title': 'Prom 6: An American in Paris and Turangalila',
802 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
803 'uploader': 'Radio 3',
804 'uploader_id': 'bbc_radio_three',
805 },
373941c5
S
806 }, {
807 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
808 'info_dict': {
809 'id': 'p06w9tws',
810 'ext': 'mp4',
811 'title': 'md5:2fabf12a726603193a2879a055f72514',
812 'description': 'Learn English words and phrases from this story',
813 },
814 'add_ie': [BBCCoUkIE.ie_key()],
10273d6e 815 }]
816
9afa1770
S
@classmethod
def suitable(cls, url):
    """Decline URLs that a more specific BBC extractor accepts.

    Returns False when any of the dedicated bbc.co.uk extractors reports
    *url* as suitable; otherwise defers to the inherited ``suitable``.
    """
    more_specific = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
    for ie in more_specific:
        if ie.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
822
def _extract_from_media_meta(self, media_meta, video_id):
    """Resolve ``(formats, subtitles)`` from an embedded media metadata dict.

    Sources are tried in this order: direct ``sourceFiles`` links, the
    media selector for ``externalId``, then the legacy playlist at
    ``href``. When none is present, two empty lists are returned.
    """
    # Direct links to media in media metadata (e.g.
    # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
    # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
    source_files = media_meta.get('sourceFiles')
    if source_files:
        formats = []
        for format_id, f in source_files.items():
            # Entries without a URL cannot be downloaded; leave them out.
            if not f.get('url'):
                continue
            formats.append({
                'url': f['url'],
                'format_id': format_id,
                'ext': f.get('encoding'),
                'tbr': float_or_none(f.get('bitrate'), 1000),
                'filesize': int_or_none(f.get('filesize')),
            })
        return formats, []

    programme_id = media_meta.get('externalId')
    if programme_id:
        return self._download_media_selector(programme_id)

    # Process playlist.sxml as legacy playlist
    href = media_meta.get('href')
    if not href:
        return [], []

    playlist = self._download_legacy_playlist_url(href)
    extracted = self._extract_from_legacy_playlist(playlist, video_id)
    _, _, _, _, formats, subtitles = extracted
    return formats, subtitles
849
baf39a1a
S
def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
    """Turn a legacy playlist URL into a video info dict.

    The playlist at *url* is processed via
    ``_process_legacy_playlist_url``, its formats are sorted, and the
    result is combined with the caller-supplied *timestamp*.
    """
    legacy_info = self._process_legacy_playlist_url(url, playlist_id)
    programme_id, title, description, duration, formats, subtitles = legacy_info
    self._sort_formats(formats)
    info = {
        'id': programme_id,
        'title': title,
        'description': description,
        'duration': duration,
        'timestamp': timestamp,
        'formats': formats,
        'subtitles': subtitles,
    }
    return info
863
def _real_extract(self, url):
    """Extract the video(s) embedded in a BBC web page.

    The page is downloaded once and then probed with a sequence of
    strategies; the first one that finds media determines the result:

    1. ``playlist.sxml`` embeds and ``data-playable`` attributes
       (returned together as a playlist),
    2. a ``data-pid`` video group, delegated to ``BBCCoUkIE``,
    3. a single vpid found in the markup,
    4. a Morph payload with lead media,
    5. ``window.__PRELOADED_STATE__``,
    6. ``bbcthreeConfig``,
    7. SMP / ``setPlaylist`` embed URLs, delegated to ``BBCCoUkIE``,
    8. ``data-media-meta``, ``mediaAssetPage.init`` or the
       ``vxp-playlist-data`` script.

    Returns a single video info dict, a URL result or a playlist result,
    depending on which strategy matched.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title')
    if not playlist_title:
        playlist_title = self._og_search_title(
            webpage, default=None) or self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
        if playlist_title:
            # Drop the trailing "- BBC ..." site suffix from the page title
            playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
    if data_playables:
        for _, data_playable_json in data_playables:
            data_playable = self._parse_json(
                unescapeHTML(data_playable_json), playlist_id, fatal=False)
            if not data_playable:
                continue
            settings = data_playable.get('settings', {})
            if settings:
                # data-playable with video vpid in settings.playlistObject.items (e.g.
                # http://www.bbc.com/news/world-us-canada-34473351)
                playlist_object = settings.get('playlistObject', {})
                if playlist_object:
                    items = playlist_object.get('items')
                    if items and isinstance(items, list):
                        title = playlist_object['title']
                        description = playlist_object.get('summary')
                        duration = int_or_none(items[0].get('duration'))
                        programme_id = items[0].get('vpid')
                        formats, subtitles = self._download_media_selector(programme_id)
                        self._sort_formats(formats)
                        entries.append({
                            'id': programme_id,
                            'title': title,
                            'description': description,
                            'timestamp': timestamp,
                            'duration': duration,
                            'formats': formats,
                            'subtitles': subtitles,
                        })
                else:
                    # data-playable without vpid but with a playlist.sxml URLs
                    # in otherSettings.playlist (e.g.
                    # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
                    playlist = data_playable.get('otherSettings', {}).get('playlist', {})
                    if playlist:
                        entry = None
                        for key in ('streaming', 'progressiveDownload'):
                            playlist_url = playlist.get('%sUrl' % key)
                            if not playlist_url:
                                continue
                            try:
                                info = self._extract_from_playlist_sxml(
                                    playlist_url, playlist_id, timestamp)
                                if not entry:
                                    entry = info
                                else:
                                    # Merge the second playlist into the first entry
                                    entry['title'] = info['title']
                                    entry['formats'].extend(info['formats'])
                            except Exception as e:
                                # Some playlist URL may fail with 500, at the same time
                                # the other one may work fine (e.g.
                                # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                                # Not every exception carries a `cause`; read it
                                # defensively so the original error is re-raised
                                # instead of an AttributeError.
                                cause = getattr(e, 'cause', None)
                                if isinstance(cause, compat_HTTPError) and cause.code == 500:
                                    continue
                                raise
                        if entry:
                            self._sort_formats(entry['formats'])
                            entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
    group_id = self._search_regex(
        r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
        webpage, 'group id', default=None)
    # Delegate only when a group id was actually found; otherwise fall
    # through to the remaining strategies below.
    if group_id:
        return self.url_result(
            'https://www.bbc.co.uk/programmes/%s' % group_id,
            ie=BBCCoUkIE.ie_key())

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        self._sort_formats(formats)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # A non-fatal parse may yield nothing, hence the `or {}` fallback.
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = digital_data.get('page', {}).get('pageInfo', {})
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # Several setPayload calls may be present but the video
    # seems to be always related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    preload_state = self._parse_json(self._search_regex(
        r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
        'preload state', default='{}'), playlist_id, fatal=False)
    if preload_state:
        current_programme = preload_state.get('programmes', {}).get('current') or {}
        programme_id = current_programme.get('id')
        if current_programme and programme_id and current_programme.get('type') == 'playable_item':
            title = current_programme.get('titles', {}).get('tertiary') or playlist_title
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            synopses = current_programme.get('synopses') or {}
            network = current_programme.get('network') or {}
            duration = int_or_none(
                current_programme.get('duration', {}).get('value'))
            thumbnail = None
            image_url = current_programme.get('image_url')
            if image_url:
                # Fill the size placeholder of the image URL template
                thumbnail = image_url.replace('{recipe}', '1920x1920')
            return {
                'id': programme_id,
                'title': title,
                'description': dict_get(synopses, ('long', 'medium', 'short')),
                'thumbnail': thumbnail,
                'duration': duration,
                'uploader': network.get('short_title'),
                'uploader_id': network.get('id'),
                'formats': formats,
                'subtitles': subtitles,
            }

    bbc3_config = self._parse_json(
        self._search_regex(
            r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
            'bbcthree config', default='{}'),
        playlist_id, transform_source=js_to_json, fatal=False)
    if bbc3_config:
        bbc3_playlist = try_get(
            bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'],
            dict)
        if bbc3_playlist:
            playlist_title = bbc3_playlist.get('title') or playlist_title
            thumbnail = bbc3_playlist.get('holdingImageURL')
            entries = []
            for bbc3_item in bbc3_playlist['items']:
                programme_id = bbc3_item.get('versionID')
                if not programme_id:
                    continue
                formats, subtitles = self._download_media_selector(programme_id)
                self._sort_formats(formats)
                entries.append({
                    'id': programme_id,
                    'title': playlist_title,
                    'thumbnail': thumbnail,
                    'timestamp': timestamp,
                    'formats': formats,
                    'subtitles': subtitles,
                })
            return self.playlist_result(
                entries, playlist_id, playlist_title, playlist_description)

    def extract_all(pattern):
        # Parse every match of `pattern` as JSON, dropping those that
        # fail to parse or parse to something falsy
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # A non-fatal parse may yield nothing, hence the `or {}` fallback
            media_asset_page = self._parse_json(
                media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in media_asset_page.get('videos', {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats:
            continue
        self._sort_formats(formats)

        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1202
1203
class BBCCoUkArticleIE(InfoExtractor):
    """Extractor for bbc.co.uk programme article pages.

    The article is returned as a playlist with one URL entry per clip
    resource found in the page markup.
    """
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Download the article and return its clips as a playlist."""
        article_id = self._match_id(url)
        page = self._download_webpage(url, article_id)

        article_title = self._og_search_title(page)
        article_description = self._og_search_description(page).strip()

        # Each clip is announced by a <div typeof="Clip" resource="URL">
        clip_urls = re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', page)
        clips = []
        for clip_url in clip_urls:
            clips.append(self.url_result(clip_url))

        return self.playlist_result(
            clips, article_id, article_title, article_description)
ded7511a
S
1232
1233
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared logic for the paginated bbc.co.uk playlist extractors.

    The code below reads ``_VIDEO_ID_TEMPLATE`` and ``_URL_TEMPLATE`` and
    calls ``_extract_title_and_description(webpage)``; none of them is
    defined on this class.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield one URL result per video id, following pagination.

        When *url* carries an explicit ``page`` query parameter only that
        page is processed; otherwise "next" links are followed until none
        is left.
        """
        query_string = compat_urlparse.urlparse(url).query
        single_page = 'page' in compat_urlparse.parse_qs(query_string)
        # The first page is already downloaded, so numbering of the
        # follow-up downloads starts at 2
        for page_num in itertools.count(2):
            id_pattern = self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX
            for video_id in re.findall(id_pattern, webpage):
                video_url = self._URL_TEMPLATE % video_id
                yield self.url_result(video_url, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                return
            next_page_url = compat_urlparse.urljoin(url, next_page)
            webpage = self._download_webpage(
                next_page_url, playlist_id,
                'Downloading page %d' % page_num, page_num)

    def _real_extract(self, url):
        """Download the first page and return a lazily evaluated playlist."""
        playlist_id = self._match_id(url)
        first_page = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(first_page)

        playlist_entries = self._entries(first_page, url, playlist_id)
        return self.playlist_result(
            playlist_entries, playlist_id, title, description)
ded7511a
S
1264
1265
class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for iPlayer ``episodes`` and ``group`` pages."""
    IE_NAME = 'bbc.co.uk:iplayer:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
    # Templates read by the base class to find video ids and build their URLs
    _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
    _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 6,
        'skip': 'This programme is not currently available on BBC iPlayer',
    }, {
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` read from the page markup.

        The title is taken from the ``<h1>`` element and the description
        from the ``<p class="subtitle">`` element; both lookups are
        non-fatal.
        """
        page_title = self._search_regex(
            r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
        page_description = self._search_regex(
            r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
            webpage, 'description', fatal=False, group='value')
        return page_title, page_description
1297
1298
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for programme episodes, broadcasts and clips pages."""
    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Templates read by the base class to find video ids and build their URLs
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` from the page's Open Graph data.

        The title lookup is requested as non-fatal.
        """
        og_title = self._og_search_title(webpage, fatal=False)
        og_description = self._og_search_description(webpage)
        return og_title, og_description