]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/bbc.py
[markiza] Expect 500 status code
[yt-dlp.git] / youtube_dl / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
9afa1770 4import re
254e64a2 5import itertools
082c6c86 6
f13b1e7d 7from .common import InfoExtractor
8683b4d8 8from ..utils import (
97067db2 9 clean_html,
9fb64c04 10 dict_get,
8683b4d8 11 ExtractorError,
9afa1770 12 float_or_none,
97067db2 13 get_element_by_class,
8683b4d8 14 int_or_none,
6d155707 15 js_to_json,
9afa1770
S
16 parse_duration,
17 parse_iso8601,
9fb64c04 18 try_get,
dab062fb 19 unescapeHTML,
97067db2
S
20 urlencode_postdata,
21 urljoin,
8683b4d8 22)
36e6f62c
JMF
23from ..compat import (
24 compat_etree_fromstring,
25 compat_HTTPError,
254e64a2 26 compat_urlparse,
36e6f62c 27)
082c6c86 28
d12a1a47 29
f13b1e7d 30class BBCCoUkIE(InfoExtractor):
082c6c86 31 IE_NAME = 'bbc.co.uk'
2e3fd9ec 32 IE_DESC = 'BBC iPlayer'
53647dfd 33 _ID_REGEX = r'[pbw][\da-z]{7}'
f20a11ed
S
34 _VALID_URL = r'''(?x)
35 https?://
36 (?:www\.)?bbc\.co\.uk/
37 (?:
38 programmes/(?!articles/)|
39 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
72d256c4 40 music/(?:clips|audiovideo/popular)[/#]|
d3d45e0a
S
41 radio/player/|
42 events/[^/]+/play/[^/]+/
f20a11ed 43 )
ded7511a 44 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
f20a11ed 45 ''' % _ID_REGEX
082c6c86 46
97067db2
S
47 _LOGIN_URL = 'https://account.bbc.com/signin'
48 _NETRC_MACHINE = 'bbc'
49
d12a1a47 50 _MEDIASELECTOR_URLS = [
26ccc68b
S
51 # Provides HQ HLS streams with even better quality than pc mediaset but fails
52 # with geolocation in some cases when it's even not geo restricted at all (e.g.
d781e293 53 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
d1c694ea 54 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
55 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
56 ]
a8b081a0 57
e6174ee9
S
58 _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
59 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
60
61 _NAMESPACES = (
62 _MEDIASELECTION_NS,
63 _EMP_PLAYLIST_NS,
64 )
65
2e3fd9ec
S
66 _TESTS = [
67 {
f2d0fc68 68 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 69 'info_dict': {
f2d0fc68 70 'id': 'b039d07m',
b1ea6802 71 'ext': 'flv',
679bacf0 72 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4',
c4914185 73 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
74 },
75 'params': {
b1ea6802 76 # rtmp download
2e3fd9ec
S
77 'skip_download': True,
78 }
082c6c86 79 },
2e3fd9ec
S
80 {
81 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
82 'info_dict': {
83 'id': 'b00yng1d',
84 'ext': 'flv',
85 'title': 'The Man in Black: Series 3: The Printed Name',
86 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
87 'duration': 1800,
88 },
89 'params': {
90 # rtmp download
91 'skip_download': True,
c7f0177f
S
92 },
93 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
94 },
95 {
96 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
97 'info_dict': {
98 'id': 'b00yng1d',
99 'ext': 'flv',
17968e44 100 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 101 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 102 'duration': 5100,
2e3fd9ec
S
103 },
104 'params': {
105 # rtmp download
106 'skip_download': True,
107 },
b1ea6802 108 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
109 },
110 {
111 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
112 'info_dict': {
113 'id': 'b03k3pb7',
114 'ext': 'flv',
115 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
116 'description': '2. Invasion',
117 'duration': 3600,
118 },
119 'params': {
120 # rtmp download
121 'skip_download': True,
122 },
b1ea6802 123 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
124 }, {
125 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
126 'info_dict': {
127 'id': 'b04v209v',
128 'ext': 'flv',
129 'title': 'Pete Tong, The Essential New Tune Special',
130 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
131 'duration': 10800,
132 },
133 'params': {
134 # rtmp download
135 'skip_download': True,
a3ef0e1c
YCH
136 },
137 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 138 }, {
5aa535c3 139 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
140 'note': 'Audio',
141 'info_dict': {
5aa535c3 142 'id': 'p022h44j',
b1ea6802 143 'ext': 'flv',
5aa535c3
S
144 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
145 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
146 'duration': 227,
c7e67594
S
147 },
148 'params': {
b1ea6802 149 # rtmp download
c7e67594
S
150 'skip_download': True,
151 }
152 }, {
153 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
154 'note': 'Video',
155 'info_dict': {
156 'id': 'p025c103',
b1ea6802 157 'ext': 'flv',
c7e67594
S
158 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
159 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
160 'duration': 226,
161 },
162 'params': {
b1ea6802 163 # rtmp download
c7e67594
S
164 'skip_download': True,
165 }
e68ae99a
S
166 }, {
167 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
168 'info_dict': {
169 'id': 'p02n76xf',
170 'ext': 'flv',
171 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
172 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
173 'duration': 3540,
174 },
175 'params': {
176 # rtmp download
177 'skip_download': True,
178 },
b1ea6802 179 'skip': 'geolocation',
25fa8d66
YCH
180 }, {
181 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
182 'info_dict': {
183 'id': 'b05zmgw1',
184 'ext': 'flv',
185 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
186 'title': 'Royal Academy Summer Exhibition',
187 'duration': 3540,
188 },
189 'params': {
190 # rtmp download
191 'skip_download': True,
192 },
b1ea6802 193 'skip': 'geolocation',
54914380
S
194 }, {
195 # iptv-all mediaset fails with geolocation however there is no geo restriction
196 # for this programme at all
5aa535c3 197 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 198 'info_dict': {
5aa535c3 199 'id': 'b06rkms3',
54914380 200 'ext': 'flv',
5aa535c3
S
201 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
202 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
203 },
204 'params': {
205 # rtmp download
206 'skip_download': True,
207 },
b1ea6802 208 'skip': 'Now it\'s really geo-restricted',
1ac6e794
S
209 }, {
210 # compact player (https://github.com/rg3/youtube-dl/issues/8147)
211 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
212 'info_dict': {
213 'id': 'p028bfkj',
b1ea6802 214 'ext': 'flv',
1ac6e794
S
215 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
216 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
217 },
218 'params': {
b1ea6802 219 # rtmp download
1ac6e794
S
220 'skip_download': True,
221 },
31763975
S
222 }, {
223 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
224 'only_matching': True,
c7e67594
S
225 }, {
226 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
227 'only_matching': True,
0692ef86
S
228 }, {
229 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
230 'only_matching': True,
f20a11ed
S
231 }, {
232 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
233 'only_matching': True,
72d256c4
S
234 }, {
235 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
236 'only_matching': True,
53647dfd
S
237 }, {
238 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
239 'only_matching': True,
72d256c4 240 }]
2e3fd9ec 241
97eb9bd2
RA
242 _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
243
97067db2
S
def _login(self):
    """Sign in through the BBC account form when credentials are available.

    Does nothing when no username is configured. Raises ExtractorError when,
    after posting the form, the final URL still contains the signin URL.
    """
    username, password = self._get_login_info()
    if username is None:
        return

    signin_page = self._download_webpage(
        self._LOGIN_URL, None, 'Downloading signin page')

    # Start from the hidden inputs of the form and add the credentials.
    form_data = self._hidden_inputs(signin_page)
    form_data.update({
        'username': username,
        'password': password,
    })

    # The form action may be relative; fall back to the signin URL itself.
    form_action = self._search_regex(
        r'<form[^>]+action=(["\'])(?P<url>.+?)\1', signin_page,
        'post url', default=self._LOGIN_URL, group='url')
    post_url = urljoin(self._LOGIN_URL, form_action)

    response, urlh = self._download_webpage_handle(
        post_url, None, 'Logging in', data=urlencode_postdata(form_data),
        headers={'Referer': self._LOGIN_URL})

    if self._LOGIN_URL not in urlh.geturl():
        return

    # Still on the signin page: report the form message when there is one.
    error = clean_html(get_element_by_class('form-message', response))
    if error:
        raise ExtractorError(
            'Unable to login: %s' % error, expected=True)
    raise ExtractorError('Unable to log in')
273
def _real_initialize(self):
    """Initialization hook: attempt to log in (see _login) before extracting."""
    self._login()
276
d12a1a47
S
class MediaSelectionError(Exception):
    """Error reported inside a media selection document.

    ``id`` is the value of the ``id`` attribute of the ``error`` element
    that triggered the exception.
    """

    def __init__(self, id):
        # Pass the identifier on to Exception so that str(), repr() and
        # tracebacks show it instead of an empty message.
        Exception.__init__(self, id)
        self.id = id
280
2e3fd9ec
S
def _extract_asx_playlist(self, connection, programme_id):
    """Download the ASX playlist referenced by ``connection`` and return
    the ``href`` of each ``./Entry/ref`` element as a list."""
    playlist_url = connection.get('href')
    asx = self._download_xml(playlist_url, programme_id, 'Downloading ASX playlist')
    hrefs = []
    for ref in asx.findall('./Entry/ref'):
        hrefs.append(ref.get('href'))
    return hrefs
284
2e3fd9ec 285 def _extract_items(self, playlist):
e6174ee9
S
286 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
287
288 def _findall_ns(self, element, xpath):
289 elements = []
290 for ns in self._NAMESPACES:
291 elements.extend(element.findall(xpath % ns))
292 return elements
2e3fd9ec
S
293
def _extract_medias(self, media_selection):
    """Return the ``media`` children of a media selection element.

    Raises BBCCoUkIE.MediaSelectionError, carrying the ``id`` attribute of
    the ``error`` element, when the document contains an ``error`` child in
    either the media selection or the EMP playlist namespace.
    """
    error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
    if error is None:
        # The result of this fallback lookup used to be discarded, so errors
        # in the EMP playlist namespace were silently ignored.
        error = media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
    if error is not None:
        raise BBCCoUkIE.MediaSelectionError(error.get('id'))
    return self._findall_ns(media_selection, './{%s}media')
2e3fd9ec
S
301
def _extract_connections(self, media):
    """Return the ``connection`` children of ``media`` in any known namespace."""
    connection_xpath = './{%s}connection'
    return self._findall_ns(media, connection_xpath)
2e3fd9ec 304
def _get_subtitles(self, media, programme_id):
    """Build the subtitles dict for a captions ``media`` element.

    Each connection's captions document is downloaded only to read its
    ``xml:lang`` attribute (default ``'en'``); the entry itself points at
    the connection's href with the ``ttml`` extension. A later connection
    with the same language replaces the earlier entry.
    """
    subtitles = {}
    for connection in self._extract_connections(media):
        captions_url = connection.get('href')
        captions = self._download_xml(captions_url, programme_id, 'Downloading captions')
        lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
        subtitles[lang] = [{
            'url': captions_url,
            'ext': 'ttml',
        }]
    return subtitles
082c6c86 317
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Raise an expected ExtractorError naming this extractor and the
    ``id`` of the given media selection error."""
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
322
def _download_media_selector(self, programme_id):
    """Try each media selector URL in turn and return the first result.

    Errors with id ``notukerror``, ``geolocation`` or ``selectionunavailable``
    make the next URL be tried; any other media selection error is reported
    immediately. When every URL fails, the last such error is reported.
    """
    retryable_ids = ('notukerror', 'geolocation', 'selectionunavailable')
    last_error = None
    for url_template in self._MEDIASELECTOR_URLS:
        selector_url = url_template % programme_id
        try:
            return self._download_media_selector_url(selector_url, programme_id)
        except BBCCoUkIE.MediaSelectionError as err:
            if err.id in retryable_ids:
                last_error = err
            else:
                self._raise_extractor_error(err)
    self._raise_extractor_error(last_error)
9afa1770
S
335
def _download_media_selector_url(self, url, programme_id=None):
    """Download one media selection document and process it.

    When the download fails with HTTP 403 or 404, the body of the error
    response is parsed as the media selection XML instead; any other
    failure is re-raised.
    """
    try:
        media_selection = self._download_xml(
            url, programme_id, 'Downloading media selection XML')
    except ExtractorError as ee:
        cause = ee.cause
        if not (isinstance(cause, compat_HTTPError) and cause.code in (403, 404)):
            raise
        media_selection = compat_etree_fromstring(cause.read().decode('utf-8'))
    return self._process_media_selector(media_selection, programme_id)
082c6c86 346
def _process_media_selector(self, media_selection, programme_id):
    """Turn a media selection element into ``(formats, subtitles)``.

    ``video`` and ``audio`` media elements contribute formats, one or more
    per connection; a ``captions`` media element supplies the subtitles.
    ``subtitles`` stays None when no captions media is present. Errors
    embedded in the document are raised by ``_extract_medias``.
    """
    formats = []
    subtitles = None
    # hrefs already handled, so a connection repeating one is skipped
    urls = []

    for media in self._extract_medias(media_selection):
        kind = media.get('kind')
        if kind in ('video', 'audio'):
            # Attributes shared by every connection of this media element
            bitrate = int_or_none(media.get('bitrate'))
            encoding = media.get('encoding')
            service = media.get('service')
            width = int_or_none(media.get('width'))
            height = int_or_none(media.get('height'))
            file_size = int_or_none(media.get('media_file_size'))
            for connection in self._extract_connections(media):
                href = connection.get('href')
                if href in urls:
                    continue
                if href:
                    urls.append(href)
                conn_kind = connection.get('kind')
                protocol = connection.get('protocol')
                supplier = connection.get('supplier')
                transfer_format = connection.get('transferFormat')
                # First non-empty of supplier / connection kind / protocol,
                # prefixed with the service name when there is one
                format_id = supplier or conn_kind or protocol
                if service:
                    format_id = '%s_%s' % (service, format_id)
                # ASX playlist
                if supplier == 'asx':
                    for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
                        formats.append({
                            'url': ref,
                            'format_id': 'ref%s_%s' % (i, format_id),
                        })
                elif transfer_format == 'dash':
                    formats.extend(self._extract_mpd_formats(
                        href, programme_id, mpd_id=format_id, fatal=False))
                elif transfer_format == 'hls':
                    formats.extend(self._extract_m3u8_formats(
                        href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                        m3u8_id=format_id, fatal=False))
                    # When the URL matches _USP_RE, also try the rewritten
                    # '/<name>.ism/<name>.m3u8' manifest, dropping variants
                    # whose height is above 720
                    if re.search(self._USP_RE, href):
                        usp_formats = self._extract_m3u8_formats(
                            re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href),
                            programme_id, ext='mp4', entry_protocol='m3u8_native',
                            m3u8_id=format_id, fatal=False)
                        for f in usp_formats:
                            if f.get('height') and f['height'] > 720:
                                continue
                            formats.append(f)
                elif transfer_format == 'hds':
                    formats.extend(self._extract_f4m_formats(
                        href, programme_id, f4m_id=format_id, fatal=False))
                else:
                    # No recognised transfer format: build a single format
                    # from the media attributes and the connection protocol
                    if not service and not supplier and bitrate:
                        format_id += '-%d' % bitrate
                    fmt = {
                        'format_id': format_id,
                        'filesize': file_size,
                    }
                    if kind == 'video':
                        fmt.update({
                            'width': width,
                            'height': height,
                            'tbr': bitrate,
                            'vcodec': encoding,
                        })
                    else:
                        fmt.update({
                            'abr': bitrate,
                            'acodec': encoding,
                            'vcodec': 'none',
                        })
                    if protocol in ('http', 'https'):
                        # Direct link
                        fmt.update({
                            'url': href,
                        })
                    elif protocol == 'rtmp':
                        application = connection.get('application', 'ondemand')
                        auth_string = connection.get('authString')
                        identifier = connection.get('identifier')
                        server = connection.get('server')
                        fmt.update({
                            'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                            'play_path': identifier,
                            'app': '%s?%s' % (application, auth_string),
                            'page_url': 'http://www.bbc.co.uk',
                            'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                            'rtmp_live': False,
                            'ext': 'flv',
                        })
                    else:
                        # Any other protocol yields no format
                        continue
                    formats.append(fmt)
        elif kind == 'captions':
            subtitles = self.extract_subtitles(media, programme_id)
    return formats, subtitles
2e3fd9ec 445
ae6986fb
S
def _download_playlist(self, playlist_id):
    """Resolve a playlist id to
    ``(programme_id, title, description, duration, formats, subtitles)``.

    The JSON playlist is tried first and the first ``programme`` or
    ``radioProgramme`` item of its default available version is used.
    When the JSON has no usable item, or an HTTP 404 occurs, the legacy
    XML playlist is used instead.
    """
    try:
        playlist = self._download_json(
            'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
            playlist_id, 'Downloading playlist JSON')

        default_version = playlist.get('defaultAvailableVersion')
        if default_version:
            smp_config = default_version['smpConfig']
            title = smp_config['title']
            description = smp_config['summary']
            for item in smp_config['items']:
                if item['kind'] in ('programme', 'radioProgramme'):
                    programme_id = item.get('vpid')
                    duration = int_or_none(item.get('duration'))
                    formats, subtitles = self._download_media_selector(programme_id)
                    return programme_id, title, description, duration, formats, subtitles
    except ExtractorError as ee:
        not_found = isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404
        if not not_found:
            raise

    # fallback to legacy playlist
    return self._process_legacy_playlist(playlist_id)
471
def _process_legacy_playlist_url(self, url, display_id):
    """Download the legacy XML playlist at ``url`` and extract its programme data."""
    return self._extract_from_legacy_playlist(
        self._download_legacy_playlist_url(url, display_id), display_id)
475
def _process_legacy_playlist(self, playlist_id):
    """Process the legacy iPlayer playlist addressed by ``playlist_id``."""
    playlist_url = 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id
    return self._process_legacy_playlist_url(playlist_url, playlist_id)
479
def _download_legacy_playlist_url(self, url, playlist_id=None):
    """Download and return the legacy playlist XML located at ``url``."""
    note = 'Downloading legacy playlist XML'
    return self._download_xml(url, playlist_id, note)
ae6986fb 483
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme data from a legacy (EMP) playlist element.

    Returns ``(programme_id, title, description, duration, formats,
    subtitles)`` for the last ``programme``/``radioProgramme`` item.

    Raises ExtractorError (expected) when the playlist has a ``noItems``
    element, and ExtractorError when it has no programme item at all.
    """
    no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First of the 'identifier' / 'group' attributes that looks like
        # a programme id, or None.
        for p in ('identifier', 'group'):
            value = element.get(p)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value
        return None

    def get_programme_id(item):
        # The item's own attributes take precedence. This lookup used to
        # have its result discarded, so only the mediator was ever used.
        programme_id = get_from_attributes(item)
        if programme_id:
            return programme_id
        mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
        if mediator is not None:
            return get_from_attributes(mediator)
        return None

    result = None
    for item in self._extract_items(playlist):
        kind = item.get('kind')
        if kind not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
        description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No usable id: treat the item itself as the media selection
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

        result = (programme_id, title, description, duration, formats, subtitles)

    if result is None:
        # Without this the return below failed on unbound local variables.
        raise ExtractorError(
            'No programme items found in playlist %s' % playlist_id)

    return result
527
c056efa2
S
def _real_extract(self, url):
    """Extract a single programme from a bbc.co.uk page.

    The programme id (vpid) is looked up in the page's player JSON, then in
    a ``"vpid"`` assignment anywhere in the page. When neither is found,
    the playlist for the id taken from the URL is used instead.
    """
    group_id = self._match_id(url)

    webpage = self._download_webpage(url, group_id, 'Downloading video page')

    # An on-page player message is reported as an expected error.
    error = self._search_regex(
        r'<div\b[^>]+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<',
        webpage, 'error', default=None)
    if error:
        raise ExtractorError(error, expected=True)

    programme_id = duration = None

    tviplayer = self._search_regex(
        r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
        webpage, 'player', default=None)

    if tviplayer:
        player = self._parse_json(tviplayer, group_id).get('player', {})
        duration = int_or_none(player.get('duration'))
        programme_id = player.get('vpid')

    if not programme_id:
        programme_id = self._search_regex(
            r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)

    if not programme_id:
        programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
    else:
        formats, subtitles = self._download_media_selector(programme_id)
        title = self._og_search_title(webpage, default=None)
        if not title:
            title = self._html_search_regex(
                (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
                 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
        description = self._search_regex(
            (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
             r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
            webpage, 'description', default=None)
        if not description:
            description = self._html_search_meta('description', webpage)

    self._sort_formats(formats)

    thumbnail = self._og_search_thumbnail(webpage, default=None)
    return {
        'id': programme_id,
        'title': title,
        'description': description,
        'thumbnail': thumbnail,
        'duration': duration,
        'formats': formats,
        'subtitles': subtitles,
    }
10273d6e 580
581
9afa1770
S
582class BBCIE(BBCCoUkIE):
583 IE_NAME = 'bbc'
584 IE_DESC = 'BBC'
585 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 586
d12a1a47 587 _MEDIASELECTOR_URLS = [
55ebae26
S
588 # Provides HQ HLS streams but fails with geolocation in some cases when it's
589 # even not geo restricted at all
590 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
591 # Provides more formats, namely direct mp4 links, but fails on some videos with
592 # notukerror for non UK (?) users (e.g.
593 # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
594 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
595 # Provides fewer formats, but works everywhere for everybody (hopefully)
596 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
597 ]
10273d6e 598
599 _TESTS = [{
6a747190 600 # article with multiple videos embedded with data-playable containing vpids
10273d6e 601 'url': 'http://www.bbc.com/news/world-europe-32668511',
602 'info_dict': {
603 'id': 'world-europe-32668511',
604 'title': 'Russia stages massive WW2 parade despite Western boycott',
9afa1770 605 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 606 },
607 'playlist_count': 2,
a3bfddfa 608 }, {
6a747190 609 # article with multiple videos embedded with data-playable (more videos)
10273d6e 610 'url': 'http://www.bbc.com/news/business-28299555',
611 'info_dict': {
612 'id': 'business-28299555',
613 'title': 'Farnborough Airshow: Video highlights',
9afa1770 614 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 615 },
616 'playlist_count': 9,
9afa1770 617 'skip': 'Save time',
88ed52ae
S
618 }, {
619 # article with multiple videos embedded with `new SMP()`
6a747190 620 # broken
88ed52ae
S
621 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
622 'info_dict': {
623 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 624 'title': 'BUGGER',
88ed52ae
S
625 },
626 'playlist_count': 18,
a3bfddfa 627 }, {
6a747190 628 # single video embedded with data-playable containing vpid
10273d6e 629 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 630 'info_dict': {
631 'id': 'p02mprgb',
55ebae26 632 'ext': 'mp4',
10273d6e 633 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 634 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 635 'duration': 47,
9afa1770 636 'timestamp': 1427219242,
da92eeae 637 'upload_date': '20150324',
10273d6e 638 },
639 'params': {
9afa1770 640 # rtmp download
10273d6e 641 'skip_download': True,
642 }
a3bfddfa 643 }, {
6a747190
S
644 # article with single video embedded with data-playable containing XML playlist
645 # with direct video links as progressiveDownloadUrl (for now these are extracted)
646 # and playlist with f4m and m3u8 as streamingUrl
de939d89 647 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 648 'info_dict': {
9afa1770 649 'id': '150615_telabyad_kentin_cogu',
de939d89 650 'ext': 'mp4',
ad152e2d 651 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 652 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 653 'timestamp': 1434397334,
da92eeae 654 'upload_date': '20150615',
de939d89 655 },
656 'params': {
657 'skip_download': True,
658 }
c936d8cc 659 }, {
6a747190 660 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 661 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 662 'info_dict': {
9afa1770 663 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 664 'ext': 'mp4',
9afa1770 665 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 666 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 667 'timestamp': 1434713142,
da92eeae 668 'upload_date': '20150619',
de939d89 669 },
670 'params': {
671 'skip_download': True,
672 }
a346b1ff
S
673 }, {
674 # single video from video playlist embedded with vxp-playlist-data JSON
675 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
676 'info_dict': {
677 'id': 'p02w6qjc',
55ebae26 678 'ext': 'mp4',
a346b1ff
S
679 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
680 'duration': 56,
0bc4ee60 681 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
682 },
683 'params': {
684 'skip_download': True,
685 }
9afa1770
S
686 }, {
687 # single video story with digitalData
688 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
689 'info_dict': {
690 'id': 'p02q6gc4',
691 'ext': 'flv',
692 'title': 'Sri Lanka’s spicy secret',
693 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
694 'timestamp': 1437674293,
695 'upload_date': '20150723',
696 },
697 'params': {
698 # rtmp download
699 'skip_download': True,
700 }
701 }, {
702 # single video story without digitalData
703 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
704 'info_dict': {
705 'id': 'p018zqqg',
55ebae26 706 'ext': 'mp4',
9afa1770
S
707 'title': 'Hyundai Santa Fe Sport: Rock star',
708 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
709 'timestamp': 1415867444,
710 'upload_date': '20141113',
9afa1770
S
711 },
712 'params': {
713 # rtmp download
714 'skip_download': True,
715 }
9fb64c04
S
716 }, {
717 # single video embedded with Morph
718 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
719 'info_dict': {
720 'id': 'p041vhd0',
721 'ext': 'mp4',
722 'title': "Nigeria v Japan - Men's First Round",
723 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
724 'duration': 7980,
725 'uploader': 'BBC Sport',
726 'uploader_id': 'bbc_sport',
727 },
728 'params': {
729 # m3u8 download
730 'skip_download': True,
9fb64c04
S
731 },
732 'skip': 'Georestricted to UK',
9afa1770 733 }, {
6a747190 734 # single video with playlist.sxml URL in playlist param
9afa1770
S
735 'url': 'http://www.bbc.com/sport/0/football/33653409',
736 'info_dict': {
737 'id': 'p02xycnp',
55ebae26 738 'ext': 'mp4',
9afa1770 739 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 740 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
741 'duration': 140,
742 },
743 'params': {
744 # rtmp download
745 'skip_download': True,
746 }
b5d48cb1 747 }, {
6a747190 748 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
749 'url': 'http://www.bbc.com/sport/0/football/34475836',
750 'info_dict': {
751 'id': '34475836',
450b233c 752 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 753 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
754 },
755 'playlist_count': 3,
450b233c
S
756 }, {
757 # school report article with single video
758 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
759 'info_dict': {
760 'id': '35744779',
761 'title': 'School which breaks down barriers in Jerusalem',
762 },
763 'playlist_count': 1,
9afa1770
S
764 }, {
765 # single video with playlist URL from weather section
766 'url': 'http://www.bbc.com/weather/features/33601775',
767 'only_matching': True,
768 }, {
769 # custom redirection to www.bbc.com
770 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
771 'only_matching': True,
a1cf3e38
S
772 }, {
773 # single video article embedded with data-media-vpid
774 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
775 'only_matching': True,
6d155707
S
776 }, {
777 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
778 'info_dict': {
779 'id': 'p06556y7',
780 'ext': 'mp4',
781 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
782 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd',
783 },
784 'params': {
785 'skip_download': True,
786 }
10273d6e 787 }]
788
9afa1770
S
@classmethod
def suitable(cls, url):
    """Return False for URLs that one of the more specific BBC extractors
    accepts; otherwise defer to the inherited check."""
    EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
    for ie in EXCLUDE_IE:
        if ie.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
794
def _extract_from_media_meta(self, media_meta, video_id):
    """Return ``(formats, subtitles)`` for one media metadata dict.

    Sources are tried in order: direct ``sourceFiles`` links, the media
    selector for ``externalId``, then the legacy playlist at ``href``.
    Returns two empty lists when none of them is present.
    """
    # Direct links to media in media metadata (e.g.
    # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
    # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
    source_files = media_meta.get('sourceFiles')
    if source_files:
        formats = []
        for format_id, f in source_files.items():
            if not f.get('url'):
                continue
            formats.append({
                'url': f['url'],
                'format_id': format_id,
                'ext': f.get('encoding'),
                'tbr': float_or_none(f.get('bitrate'), 1000),
                'filesize': int_or_none(f.get('filesize')),
            })
        return formats, []

    programme_id = media_meta.get('externalId')
    if programme_id:
        return self._download_media_selector(programme_id)

    # Process playlist.sxml as legacy playlist
    href = media_meta.get('href')
    if not href:
        return [], []

    playlist = self._download_legacy_playlist_url(href)
    _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
    return formats, subtitles
821
baf39a1a
S
def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
    """Build an info dict from the legacy playlist at ``url``, stamping it
    with the given ``timestamp``."""
    legacy = self._process_legacy_playlist_url(url, playlist_id)
    programme_id, title, description, duration, formats, subtitles = legacy
    self._sort_formats(formats)
    info = {
        'id': programme_id,
        'title': title,
        'description': description,
        'duration': duration,
        'timestamp': timestamp,
    }
    info['formats'] = formats
    info['subtitles'] = subtitles
    return info
835
def _real_extract(self, url):
    """Extract the video(s) embedded in the page at *url*.

    The page is probed for several embedding schemes in a fixed order and
    the first scheme that yields media decides the result:

    1. ``playlist.sxml`` references and ``data-playable`` attributes
       (returned as a playlist);
    2. a single vpid found in the markup (returned as one video);
    3. a ``Morph.setPayload`` call with lead media (one video);
    4. a ``bbcthreeConfig`` playlist (playlist);
    5. ``new SMP(...)`` / ``setPlaylist(...)`` embed URLs (playlist of
       URL results delegated to ``BBCCoUk``);
    6. ``data-media-meta``, ``mediaAssetPage.init`` or
       ``vxp-playlist-data`` metadata (playlist).

    Returns either a single info dict or a playlist result.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title')
    if not playlist_title:
        playlist_title = self._og_search_title(
            webpage, default=None) or self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
        if playlist_title:
            # Drop the trailing " - BBC ..." site suffix from the page title
            playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    for _, data_playable_json in re.findall(
            r'data-playable=(["\'])({.+?})\1', webpage):
        data_playable = self._parse_json(
            unescapeHTML(data_playable_json), playlist_id, fatal=False)
        if not data_playable:
            continue
        settings = data_playable.get('settings', {})
        if not settings:
            continue
        # data-playable with video vpid in settings.playlistObject.items (e.g.
        # http://www.bbc.com/news/world-us-canada-34473351)
        playlist_object = settings.get('playlistObject', {})
        if playlist_object:
            items = playlist_object.get('items')
            if items and isinstance(items, list):
                title = playlist_object['title']
                description = playlist_object.get('summary')
                # Only the first item of the playlist object is used
                duration = int_or_none(items[0].get('duration'))
                programme_id = items[0].get('vpid')
                formats, subtitles = self._download_media_selector(programme_id)
                self._sort_formats(formats)
                entries.append({
                    'id': programme_id,
                    'title': title,
                    'description': description,
                    'timestamp': timestamp,
                    'duration': duration,
                    'formats': formats,
                    'subtitles': subtitles,
                })
        else:
            # data-playable without vpid but with a playlist.sxml URLs
            # in otherSettings.playlist (e.g.
            # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
            playlist = data_playable.get('otherSettings', {}).get('playlist', {})
            if playlist:
                entry = None
                for key in ('streaming', 'progressiveDownload'):
                    playlist_url = playlist.get('%sUrl' % key)
                    if not playlist_url:
                        continue
                    try:
                        info = self._extract_from_playlist_sxml(
                            playlist_url, playlist_id, timestamp)
                    except ExtractorError as e:
                        # Some playlist URL may fail with 500, at the same time
                        # the other one may work fine (e.g.
                        # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                        # Only ExtractorError is caught: other exceptions have
                        # no ``cause`` attribute and must propagate untouched.
                        if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
                            continue
                        raise
                    if not entry:
                        entry = info
                    else:
                        # Merge the second playlist into the first entry
                        entry['title'] = info['title']
                        entry['formats'].extend(info['formats'])
                if entry:
                    self._sort_formats(entry['formats'])
                    entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        self._sort_formats(formats)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # The parse is non-fatal, so fall back to an empty dict rather than
        # dereferencing a failed (None) result.
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = digital_data.get('page', {}).get('pageInfo', {})
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # There are several setPayload calls may be present but the video
    # seems to be always related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            # The first component with playable lead media is the result
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    bbc3_config = self._parse_json(
        self._search_regex(
            r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
            'bbcthree config', default='{}'),
        playlist_id, transform_source=js_to_json, fatal=False)
    if bbc3_config:
        bbc3_playlist = try_get(
            bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'],
            dict)
        if bbc3_playlist:
            playlist_title = bbc3_playlist.get('title') or playlist_title
            thumbnail = bbc3_playlist.get('holdingImageURL')
            entries = []
            for bbc3_item in bbc3_playlist['items']:
                programme_id = bbc3_item.get('versionID')
                if not programme_id:
                    continue
                formats, subtitles = self._download_media_selector(programme_id)
                self._sort_formats(formats)
                entries.append({
                    'id': programme_id,
                    'title': playlist_title,
                    'thumbnail': thumbnail,
                    'timestamp': timestamp,
                    'formats': formats,
                    'subtitles': subtitles,
                })
            return self.playlist_result(
                entries, playlist_id, playlist_title, playlist_description)

    def extract_all(pattern):
        # Parse every match of *pattern* as JSON, dropping failed parses
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # Non-fatal parse: treat a failed parse as "no videos"
            media_asset_page = self._parse_json(
                media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in media_asset_page.get('videos', {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats:
            continue
        self._sort_formats(formats)

        # Without an external id, derive one from the playlist id; the
        # ordinal is appended only when there are several media.
        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1135
1136
class BBCCoUkArticleIE(InfoExtractor):
    """Extractor for ``bbc.co.uk/programmes/articles/...`` pages.

    The article is returned as a playlist whose entries are the clip URLs
    referenced by the page.
    """
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist of the clips embedded in the article at *url*."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # The description may be absent from the page; strip it only when
        # one was found instead of calling .strip() on a missing value.
        description = self._og_search_description(webpage)
        if description:
            description = description.strip()

        # Each clip is marked up as <div typeof="Clip" resource="<url>">
        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1165
1166
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared logic for paginated bbc.co.uk playlist pages.

    This class reads ``_VIDEO_ID_TEMPLATE`` and ``_URL_TEMPLATE`` and calls
    ``_extract_title_and_description(webpage)``; none of them is defined
    here, so concrete extractors must supply them.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a URL result for every video id found on the playlist pages.

        Starts with the already downloaded *webpage* and follows the "next"
        pagination link until there is none. When *url* carries an explicit
        ``page`` query parameter only that single page is processed.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        video_id_re = self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX
        # The first page is already at hand, so numbering of the pages that
        # still have to be downloaded starts at 2.
        for page_num in itertools.count(2):
            for video_id in re.findall(video_id_re, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # Pass a real error message: the original handed the bare page
            # number to the error-note parameter.
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                note='Downloading page %d' % page_num,
                errnote='Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Return a playlist result covering the playlist at *url*."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
ded7511a
S
1197
1198
class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for iPlayer ``episodes`` and ``group`` pages."""
    IE_NAME = 'bbc.co.uk:iplayer:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
    # Templates consumed by the base class when it builds the entries
    _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
    _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 6,
        'skip': 'This programme is not currently available on BBC iPlayer',
    }, {
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` scraped from the page markup.

        The title is the text of the first ``<h1>`` and the description the
        text of the ``<p class="subtitle">`` element; both lookups are
        non-fatal.
        """
        heading = self._search_regex(
            r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
        subtitle = self._search_regex(
            r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
            webpage, 'description', fatal=False, group='value')
        return heading, subtitle
1230
1231
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for programme ``episodes``/``broadcasts``/``clips`` pages."""
    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Templates consumed by the base class when it builds the entries
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` taken from the page's Open Graph data.

        The title lookup is explicitly non-fatal.
        """
        og_title = self._og_search_title(webpage, fatal=False)
        og_description = self._og_search_description(webpage)
        return og_title, og_description