]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/bbc.py
[bbccouk] Capture and output error message (closes #13518)
[yt-dlp.git] / youtube_dl / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
9afa1770 4import re
254e64a2 5import itertools
082c6c86 6
f13b1e7d 7from .common import InfoExtractor
8683b4d8 8from ..utils import (
97067db2 9 clean_html,
9fb64c04 10 dict_get,
8683b4d8 11 ExtractorError,
9afa1770 12 float_or_none,
97067db2 13 get_element_by_class,
8683b4d8 14 int_or_none,
9afa1770
S
15 parse_duration,
16 parse_iso8601,
9fb64c04 17 try_get,
dab062fb 18 unescapeHTML,
97067db2
S
19 urlencode_postdata,
20 urljoin,
8683b4d8 21)
36e6f62c
JMF
22from ..compat import (
23 compat_etree_fromstring,
24 compat_HTTPError,
254e64a2 25 compat_urlparse,
36e6f62c 26)
082c6c86 27
d12a1a47 28
f13b1e7d 29class BBCCoUkIE(InfoExtractor):
082c6c86 30 IE_NAME = 'bbc.co.uk'
2e3fd9ec 31 IE_DESC = 'BBC iPlayer'
22d7368d 32 _ID_REGEX = r'[pb][\da-z]{7}'
f20a11ed
S
33 _VALID_URL = r'''(?x)
34 https?://
35 (?:www\.)?bbc\.co\.uk/
36 (?:
37 programmes/(?!articles/)|
38 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
39 music/clips[/#]|
40 radio/player/
41 )
ded7511a 42 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
f20a11ed 43 ''' % _ID_REGEX
082c6c86 44
97067db2
S
45 _LOGIN_URL = 'https://account.bbc.com/signin'
46 _NETRC_MACHINE = 'bbc'
47
d12a1a47 48 _MEDIASELECTOR_URLS = [
26ccc68b
S
49 # Provides HQ HLS streams with even better quality than pc mediaset but fails
50 # with geolocation in some cases when it's not even geo restricted at all (e.g.
d781e293 51 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
d1c694ea 52 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
53 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
54 ]
a8b081a0 55
e6174ee9
S
56 _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
57 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
58
59 _NAMESPACES = (
60 _MEDIASELECTION_NS,
61 _EMP_PLAYLIST_NS,
62 )
63
2e3fd9ec
S
64 _TESTS = [
65 {
f2d0fc68 66 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 67 'info_dict': {
f2d0fc68 68 'id': 'b039d07m',
b1ea6802 69 'ext': 'flv',
679bacf0 70 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4',
c4914185 71 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
72 },
73 'params': {
b1ea6802 74 # rtmp download
2e3fd9ec
S
75 'skip_download': True,
76 }
082c6c86 77 },
2e3fd9ec
S
78 {
79 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
80 'info_dict': {
81 'id': 'b00yng1d',
82 'ext': 'flv',
83 'title': 'The Man in Black: Series 3: The Printed Name',
84 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
85 'duration': 1800,
86 },
87 'params': {
88 # rtmp download
89 'skip_download': True,
c7f0177f
S
90 },
91 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
92 },
93 {
94 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
95 'info_dict': {
96 'id': 'b00yng1d',
97 'ext': 'flv',
17968e44 98 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 99 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 100 'duration': 5100,
2e3fd9ec
S
101 },
102 'params': {
103 # rtmp download
104 'skip_download': True,
105 },
b1ea6802 106 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
107 },
108 {
109 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
110 'info_dict': {
111 'id': 'b03k3pb7',
112 'ext': 'flv',
113 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
114 'description': '2. Invasion',
115 'duration': 3600,
116 },
117 'params': {
118 # rtmp download
119 'skip_download': True,
120 },
b1ea6802 121 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
122 }, {
123 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
124 'info_dict': {
125 'id': 'b04v209v',
126 'ext': 'flv',
127 'title': 'Pete Tong, The Essential New Tune Special',
128 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
129 'duration': 10800,
130 },
131 'params': {
132 # rtmp download
133 'skip_download': True,
a3ef0e1c
YCH
134 },
135 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 136 }, {
5aa535c3 137 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
138 'note': 'Audio',
139 'info_dict': {
5aa535c3 140 'id': 'p022h44j',
b1ea6802 141 'ext': 'flv',
5aa535c3
S
142 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
143 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
144 'duration': 227,
c7e67594
S
145 },
146 'params': {
b1ea6802 147 # rtmp download
c7e67594
S
148 'skip_download': True,
149 }
150 }, {
151 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
152 'note': 'Video',
153 'info_dict': {
154 'id': 'p025c103',
b1ea6802 155 'ext': 'flv',
c7e67594
S
156 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
157 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
158 'duration': 226,
159 },
160 'params': {
b1ea6802 161 # rtmp download
c7e67594
S
162 'skip_download': True,
163 }
e68ae99a
S
164 }, {
165 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
166 'info_dict': {
167 'id': 'p02n76xf',
168 'ext': 'flv',
169 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
170 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
171 'duration': 3540,
172 },
173 'params': {
174 # rtmp download
175 'skip_download': True,
176 },
b1ea6802 177 'skip': 'geolocation',
25fa8d66
YCH
178 }, {
179 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
180 'info_dict': {
181 'id': 'b05zmgw1',
182 'ext': 'flv',
183 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
184 'title': 'Royal Academy Summer Exhibition',
185 'duration': 3540,
186 },
187 'params': {
188 # rtmp download
189 'skip_download': True,
190 },
b1ea6802 191 'skip': 'geolocation',
54914380
S
192 }, {
193 # iptv-all mediaset fails with geolocation however there is no geo restriction
194 # for this programme at all
5aa535c3 195 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 196 'info_dict': {
5aa535c3 197 'id': 'b06rkms3',
54914380 198 'ext': 'flv',
5aa535c3
S
199 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
200 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
201 },
202 'params': {
203 # rtmp download
204 'skip_download': True,
205 },
b1ea6802 206 'skip': 'Now it\'s really geo-restricted',
1ac6e794
S
207 }, {
208 # compact player (https://github.com/rg3/youtube-dl/issues/8147)
209 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
210 'info_dict': {
211 'id': 'p028bfkj',
b1ea6802 212 'ext': 'flv',
1ac6e794
S
213 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
214 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
215 },
216 'params': {
b1ea6802 217 # rtmp download
1ac6e794
S
218 'skip_download': True,
219 },
31763975
S
220 }, {
221 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
222 'only_matching': True,
c7e67594
S
223 }, {
224 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
225 'only_matching': True,
0692ef86
S
226 }, {
227 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
228 'only_matching': True,
f20a11ed
S
229 }, {
230 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
231 'only_matching': True,
ae6986fb 232 }
2e3fd9ec
S
233 ]
234
97eb9bd2
RA
235 _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
236
97067db2
S
def _login(self):
    """Sign in to the BBC account page when credentials are available.

    Does nothing when ``_get_login_info()`` yields no username.  Otherwise
    the signin page is fetched, its hidden inputs are combined with the
    credentials and posted to the form's action URL (``_LOGIN_URL`` when no
    action is found).

    Raises ExtractorError when the final URL after posting still contains
    ``_LOGIN_URL``; the text of the ``form-message`` element is included in
    the message when one is present.
    """
    username, password = self._get_login_info()
    if username is None:
        return

    signin_page = self._download_webpage(
        self._LOGIN_URL, None, 'Downloading signin page')

    form_fields = self._hidden_inputs(signin_page)
    form_fields.update({
        'username': username,
        'password': password,
    })

    form_action = self._search_regex(
        r'<form[^>]+action=(["\'])(?P<url>.+?)\1', signin_page,
        'post url', default=self._LOGIN_URL, group='url')
    post_url = urljoin(self._LOGIN_URL, form_action)

    response, urlh = self._download_webpage_handle(
        post_url, None, 'Logging in', data=urlencode_postdata(form_fields),
        headers={'Referer': self._LOGIN_URL})

    # Ending up anywhere other than the signin URL is treated as success.
    if self._LOGIN_URL not in urlh.geturl():
        return

    error = clean_html(get_element_by_class('form-message', response))
    if error:
        raise ExtractorError(
            'Unable to login: %s' % error, expected=True)
    raise ExtractorError('Unable to log in')
266
267 def _real_initialize(self):
268 self._login()
269
d12a1a47
S
class MediaSelectionError(Exception):
    """Error reported by a media selector response.

    ``id`` holds the error identifier taken from the response (values such
    as 'notukerror' or 'geolocation' are compared against it elsewhere in
    this file).
    """

    def __init__(self, id):
        # Initialise the base class too, so that ``args`` and ``str()`` carry
        # the identifier instead of being empty.
        Exception.__init__(self, id)
        self.id = id
273
2e3fd9ec
S
274 def _extract_asx_playlist(self, connection, programme_id):
275 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
276 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
277
2e3fd9ec 278 def _extract_items(self, playlist):
e6174ee9
S
279 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
280
281 def _findall_ns(self, element, xpath):
282 elements = []
283 for ns in self._NAMESPACES:
284 elements.extend(element.findall(xpath % ns))
285 return elements
2e3fd9ec
S
286
287 def _extract_medias(self, media_selection):
e6174ee9
S
288 error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
289 if error is None:
290 media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
c056efa2 291 if error is not None:
d12a1a47 292 raise BBCCoUkIE.MediaSelectionError(error.get('id'))
e6174ee9 293 return self._findall_ns(media_selection, './{%s}media')
2e3fd9ec
S
294
295 def _extract_connections(self, media):
e6174ee9 296 return self._findall_ns(media, './{%s}connection')
2e3fd9ec 297
f13b1e7d 298 def _get_subtitles(self, media, programme_id):
2e3fd9ec
S
299 subtitles = {}
300 for connection in self._extract_connections(media):
301 captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
302 lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
f13b1e7d
JMF
303 subtitles[lang] = [
304 {
305 'url': connection.get('href'),
306 'ext': 'ttml',
307 },
f13b1e7d 308 ]
2e3fd9ec 309 return subtitles
082c6c86 310
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Raise an expected ExtractorError naming this extractor and the error id."""
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
315
c056efa2 316 def _download_media_selector(self, programme_id):
d12a1a47
S
317 last_exception = None
318 for mediaselector_url in self._MEDIASELECTOR_URLS:
319 try:
320 return self._download_media_selector_url(
321 mediaselector_url % programme_id, programme_id)
322 except BBCCoUkIE.MediaSelectionError as e:
d781e293 323 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
d12a1a47
S
324 last_exception = e
325 continue
326 self._raise_extractor_error(e)
327 self._raise_extractor_error(last_exception)
9afa1770
S
328
329 def _download_media_selector_url(self, url, programme_id=None):
c056efa2
S
330 try:
331 media_selection = self._download_xml(
9afa1770 332 url, programme_id, 'Downloading media selection XML')
c056efa2 333 except ExtractorError as ee:
d781e293 334 if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404):
36e6f62c 335 media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))
2e3fd9ec 336 else:
c056efa2 337 raise
9afa1770 338 return self._process_media_selector(media_selection, programme_id)
082c6c86 339
9afa1770 340 def _process_media_selector(self, media_selection, programme_id):
082c6c86 341 formats = []
2e3fd9ec 342 subtitles = None
b0af1215 343 urls = []
2e3fd9ec 344
c056efa2
S
345 for media in self._extract_medias(media_selection):
346 kind = media.get('kind')
a7e5f274
RA
347 if kind in ('video', 'audio'):
348 bitrate = int_or_none(media.get('bitrate'))
349 encoding = media.get('encoding')
350 service = media.get('service')
351 width = int_or_none(media.get('width'))
352 height = int_or_none(media.get('height'))
353 file_size = int_or_none(media.get('media_file_size'))
354 for connection in self._extract_connections(media):
b0af1215
RA
355 href = connection.get('href')
356 if href in urls:
357 continue
358 if href:
359 urls.append(href)
a7e5f274
RA
360 conn_kind = connection.get('kind')
361 protocol = connection.get('protocol')
362 supplier = connection.get('supplier')
a7e5f274
RA
363 transfer_format = connection.get('transferFormat')
364 format_id = supplier or conn_kind or protocol
365 if service:
366 format_id = '%s_%s' % (service, format_id)
367 # ASX playlist
368 if supplier == 'asx':
369 for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
370 formats.append({
371 'url': ref,
372 'format_id': 'ref%s_%s' % (i, format_id),
373 })
374 elif transfer_format == 'dash':
375 formats.extend(self._extract_mpd_formats(
376 href, programme_id, mpd_id=format_id, fatal=False))
377 elif transfer_format == 'hls':
378 formats.extend(self._extract_m3u8_formats(
379 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
380 m3u8_id=format_id, fatal=False))
97eb9bd2
RA
381 if re.search(self._USP_RE, href):
382 usp_formats = self._extract_m3u8_formats(
383 re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href),
384 programme_id, ext='mp4', entry_protocol='m3u8_native',
385 m3u8_id=format_id, fatal=False)
386 for f in usp_formats:
387 if f.get('height') and f['height'] > 720:
388 continue
389 formats.append(f)
a7e5f274
RA
390 elif transfer_format == 'hds':
391 formats.extend(self._extract_f4m_formats(
392 href, programme_id, f4m_id=format_id, fatal=False))
393 else:
f9622868 394 if not service and not supplier and bitrate:
aaa42cf0 395 format_id += '-%d' % bitrate
a7e5f274
RA
396 fmt = {
397 'format_id': format_id,
398 'filesize': file_size,
399 }
400 if kind == 'video':
401 fmt.update({
402 'width': width,
403 'height': height,
6240925b 404 'tbr': bitrate,
a7e5f274
RA
405 'vcodec': encoding,
406 })
407 else:
408 fmt.update({
409 'abr': bitrate,
410 'acodec': encoding,
411 'vcodec': 'none',
412 })
1af959ef 413 if protocol in ('http', 'https'):
a7e5f274
RA
414 # Direct link
415 fmt.update({
416 'url': href,
417 })
418 elif protocol == 'rtmp':
419 application = connection.get('application', 'ondemand')
420 auth_string = connection.get('authString')
421 identifier = connection.get('identifier')
422 server = connection.get('server')
423 fmt.update({
424 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
425 'play_path': identifier,
426 'app': '%s?%s' % (application, auth_string),
427 'page_url': 'http://www.bbc.co.uk',
428 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
429 'rtmp_live': False,
430 'ext': 'flv',
431 })
964744af
S
432 else:
433 continue
a7e5f274 434 formats.append(fmt)
c056efa2 435 elif kind == 'captions':
f13b1e7d 436 subtitles = self.extract_subtitles(media, programme_id)
c056efa2 437 return formats, subtitles
2e3fd9ec 438
ae6986fb
S
439 def _download_playlist(self, playlist_id):
440 try:
441 playlist = self._download_json(
442 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
443 playlist_id, 'Downloading playlist JSON')
444
445 version = playlist.get('defaultAvailableVersion')
446 if version:
447 smp_config = version['smpConfig']
448 title = smp_config['title']
449 description = smp_config['summary']
450 for item in smp_config['items']:
451 kind = item['kind']
40fcba5e 452 if kind not in ('programme', 'radioProgramme'):
ae6986fb
S
453 continue
454 programme_id = item.get('vpid')
d97f5cd7 455 duration = int_or_none(item.get('duration'))
ae6986fb
S
456 formats, subtitles = self._download_media_selector(programme_id)
457 return programme_id, title, description, duration, formats, subtitles
458 except ExtractorError as ee:
f813928e 459 if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
ae6986fb
S
460 raise
461
462 # fallback to legacy playlist
9afa1770
S
463 return self._process_legacy_playlist(playlist_id)
464
465 def _process_legacy_playlist_url(self, url, display_id):
466 playlist = self._download_legacy_playlist_url(url, display_id)
467 return self._extract_from_legacy_playlist(playlist, display_id)
468
469 def _process_legacy_playlist(self, playlist_id):
470 return self._process_legacy_playlist_url(
471 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
472
473 def _download_legacy_playlist_url(self, url, playlist_id=None):
474 return self._download_xml(
475 url, playlist_id, 'Downloading legacy playlist XML')
ae6986fb 476
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme data from a legacy (EMP) playlist element.

    Returns a tuple ``(programme_id, title, description, duration, formats,
    subtitles)`` for the 'programme'/'radioProgramme' items of the playlist;
    when several items qualify the last one is returned.  When an item's
    mediator supplies a programme id, the media selector is downloaded for
    it; otherwise the item itself is processed as a media selection and
    *playlist_id* is used as the id.

    Raises ExtractorError (expected) when the playlist has a ``noItems``
    element, and ExtractorError when no programme item is present.  The
    latter used to fail with an UnboundLocalError because the returned
    names were never assigned.
    """
    ns = self._EMP_PLAYLIST_NS

    no_items = playlist.find('./{%s}noItems' % ns)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First of 'identifier'/'group' that looks like a programme id.
        for p in ('identifier', 'group'):
            value = element.get(p)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value
        return None

    def get_programme_id(item):
        # NOTE(review): the previous code also called
        # get_from_attributes(item) here but discarded the result, so only
        # the mediator has ever been consulted.  That behaviour is kept;
        # confirm whether the item's own attributes were meant to be used.
        mediator = item.find('./{%s}mediator' % ns)
        if mediator is not None:
            return get_from_attributes(mediator)
        return None

    result = None
    for item in self._extract_items(playlist):
        if item.get('kind') not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % ns).text
        description_el = playlist.find('./{%s}summary' % ns)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

        result = (programme_id, title, description, duration, formats, subtitles)

    if result is None:
        raise ExtractorError(
            'No programme found in playlist %s' % playlist_id)
    return result
520
c056efa2
S
521 def _real_extract(self, url):
522 group_id = self._match_id(url)
523
524 webpage = self._download_webpage(url, group_id, 'Downloading video page')
525
b2ed954f
S
526 error = self._search_regex(
527 r'<div\b[^>]+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<',
528 webpage, 'error', default=None)
529 if error:
530 raise ExtractorError(error, expected=True)
531
8683b4d8 532 programme_id = None
679bacf0 533 duration = None
8683b4d8
S
534
535 tviplayer = self._search_regex(
536 r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
537 webpage, 'player', default=None)
538
539 if tviplayer:
540 player = self._parse_json(tviplayer, group_id).get('player', {})
541 duration = int_or_none(player.get('duration'))
542 programme_id = player.get('vpid')
543
544 if not programme_id:
545 programme_id = self._search_regex(
22d7368d 546 r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
8683b4d8 547
c056efa2 548 if programme_id:
c056efa2 549 formats, subtitles = self._download_media_selector(programme_id)
88fb59d9 550 title = self._og_search_title(webpage, default=None) or self._html_search_regex(
50e989e2
S
551 (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
552 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
8683b4d8 553 description = self._search_regex(
a8534274
S
554 (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
555 r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
679bacf0
YCH
556 webpage, 'description', default=None)
557 if not description:
558 description = self._html_search_meta('description', webpage)
c056efa2 559 else:
ae6986fb 560 programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
2e3fd9ec 561
082c6c86
S
562 self._sort_formats(formats)
563
564 return {
2e3fd9ec 565 'id': programme_id,
082c6c86
S
566 'title': title,
567 'description': description,
650cfd0c 568 'thumbnail': self._og_search_thumbnail(webpage, default=None),
082c6c86
S
569 'duration': duration,
570 'formats': formats,
2e3fd9ec 571 'subtitles': subtitles,
5f6a1245 572 }
10273d6e 573
574
9afa1770
S
575class BBCIE(BBCCoUkIE):
576 IE_NAME = 'bbc'
577 IE_DESC = 'BBC'
578 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 579
d12a1a47 580 _MEDIASELECTOR_URLS = [
55ebae26
S
581 # Provides HQ HLS streams but fails with geolocation in some cases when it's
582 # not even geo restricted at all
583 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
584 # Provides more formats, namely direct mp4 links, but fails on some videos with
585 # notukerror for non UK (?) users (e.g.
586 # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
587 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
588 # Provides fewer formats, but works everywhere for everybody (hopefully)
589 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
590 ]
10273d6e 591
592 _TESTS = [{
6a747190 593 # article with multiple videos embedded with data-playable containing vpids
10273d6e 594 'url': 'http://www.bbc.com/news/world-europe-32668511',
595 'info_dict': {
596 'id': 'world-europe-32668511',
597 'title': 'Russia stages massive WW2 parade despite Western boycott',
9afa1770 598 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 599 },
600 'playlist_count': 2,
a3bfddfa 601 }, {
6a747190 602 # article with multiple videos embedded with data-playable (more videos)
10273d6e 603 'url': 'http://www.bbc.com/news/business-28299555',
604 'info_dict': {
605 'id': 'business-28299555',
606 'title': 'Farnborough Airshow: Video highlights',
9afa1770 607 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 608 },
609 'playlist_count': 9,
9afa1770 610 'skip': 'Save time',
88ed52ae
S
611 }, {
612 # article with multiple videos embedded with `new SMP()`
6a747190 613 # broken
88ed52ae
S
614 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
615 'info_dict': {
616 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 617 'title': 'BUGGER',
88ed52ae
S
618 },
619 'playlist_count': 18,
a3bfddfa 620 }, {
6a747190 621 # single video embedded with data-playable containing vpid
10273d6e 622 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 623 'info_dict': {
624 'id': 'p02mprgb',
55ebae26 625 'ext': 'mp4',
10273d6e 626 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 627 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 628 'duration': 47,
9afa1770 629 'timestamp': 1427219242,
da92eeae 630 'upload_date': '20150324',
10273d6e 631 },
632 'params': {
9afa1770 633 # rtmp download
10273d6e 634 'skip_download': True,
635 }
a3bfddfa 636 }, {
6a747190
S
637 # article with single video embedded with data-playable containing XML playlist
638 # with direct video links as progressiveDownloadUrl (for now these are extracted)
639 # and playlist with f4m and m3u8 as streamingUrl
de939d89 640 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 641 'info_dict': {
9afa1770 642 'id': '150615_telabyad_kentin_cogu',
de939d89 643 'ext': 'mp4',
ad152e2d 644 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 645 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 646 'timestamp': 1434397334,
da92eeae 647 'upload_date': '20150615',
de939d89 648 },
649 'params': {
650 'skip_download': True,
651 }
c936d8cc 652 }, {
6a747190 653 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 654 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 655 'info_dict': {
9afa1770 656 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 657 'ext': 'mp4',
9afa1770 658 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 659 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 660 'timestamp': 1434713142,
da92eeae 661 'upload_date': '20150619',
de939d89 662 },
663 'params': {
664 'skip_download': True,
665 }
a346b1ff
S
666 }, {
667 # single video from video playlist embedded with vxp-playlist-data JSON
668 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
669 'info_dict': {
670 'id': 'p02w6qjc',
55ebae26 671 'ext': 'mp4',
a346b1ff
S
672 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
673 'duration': 56,
0bc4ee60 674 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
675 },
676 'params': {
677 'skip_download': True,
678 }
9afa1770
S
679 }, {
680 # single video story with digitalData
681 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
682 'info_dict': {
683 'id': 'p02q6gc4',
684 'ext': 'flv',
685 'title': 'Sri Lanka’s spicy secret',
686 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
687 'timestamp': 1437674293,
688 'upload_date': '20150723',
689 },
690 'params': {
691 # rtmp download
692 'skip_download': True,
693 }
694 }, {
695 # single video story without digitalData
696 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
697 'info_dict': {
698 'id': 'p018zqqg',
55ebae26 699 'ext': 'mp4',
9afa1770
S
700 'title': 'Hyundai Santa Fe Sport: Rock star',
701 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
702 'timestamp': 1415867444,
703 'upload_date': '20141113',
9afa1770
S
704 },
705 'params': {
706 # rtmp download
707 'skip_download': True,
708 }
9fb64c04
S
709 }, {
710 # single video embedded with Morph
711 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
712 'info_dict': {
713 'id': 'p041vhd0',
714 'ext': 'mp4',
715 'title': "Nigeria v Japan - Men's First Round",
716 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
717 'duration': 7980,
718 'uploader': 'BBC Sport',
719 'uploader_id': 'bbc_sport',
720 },
721 'params': {
722 # m3u8 download
723 'skip_download': True,
9fb64c04
S
724 },
725 'skip': 'Georestricted to UK',
9afa1770 726 }, {
6a747190 727 # single video with playlist.sxml URL in playlist param
9afa1770
S
728 'url': 'http://www.bbc.com/sport/0/football/33653409',
729 'info_dict': {
730 'id': 'p02xycnp',
55ebae26 731 'ext': 'mp4',
9afa1770 732 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 733 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
734 'duration': 140,
735 },
736 'params': {
737 # rtmp download
738 'skip_download': True,
739 }
b5d48cb1 740 }, {
6a747190 741 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
742 'url': 'http://www.bbc.com/sport/0/football/34475836',
743 'info_dict': {
744 'id': '34475836',
450b233c 745 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 746 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
747 },
748 'playlist_count': 3,
450b233c
S
749 }, {
750 # school report article with single video
751 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
752 'info_dict': {
753 'id': '35744779',
754 'title': 'School which breaks down barriers in Jerusalem',
755 },
756 'playlist_count': 1,
9afa1770
S
757 }, {
758 # single video with playlist URL from weather section
759 'url': 'http://www.bbc.com/weather/features/33601775',
760 'only_matching': True,
761 }, {
762 # custom redirection to www.bbc.com
763 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
764 'only_matching': True,
a1cf3e38
S
765 }, {
766 # single video article embedded with data-media-vpid
767 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
768 'only_matching': True,
10273d6e 769 }]
770
9afa1770
S
@classmethod
def suitable(cls, url):
    """Return False for URLs claimed by a more specific BBC extractor.

    Any URL that BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE or
    BBCCoUkPlaylistIE reports as suitable is rejected; everything else is
    decided by the parent class.
    """
    excluded_extractors = (
        BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
    for extractor in excluded_extractors:
        if extractor.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
776
777 def _extract_from_media_meta(self, media_meta, video_id):
778 # Direct links to media in media metadata (e.g.
779 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
780 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
781 source_files = media_meta.get('sourceFiles')
782 if source_files:
783 return [{
784 'url': f['url'],
785 'format_id': format_id,
786 'ext': f.get('encoding'),
787 'tbr': float_or_none(f.get('bitrate'), 1000),
788 'filesize': int_or_none(f.get('filesize')),
789 } for format_id, f in source_files.items() if f.get('url')], []
790
791 programme_id = media_meta.get('externalId')
792 if programme_id:
793 return self._download_media_selector(programme_id)
794
795 # Process playlist.sxml as legacy playlist
796 href = media_meta.get('href')
797 if href:
798 playlist = self._download_legacy_playlist_url(href)
799 _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
800 return formats, subtitles
801
802 return [], []
803
baf39a1a
S
804 def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
805 programme_id, title, description, duration, formats, subtitles = \
806 self._process_legacy_playlist_url(url, playlist_id)
807 self._sort_formats(formats)
808 return {
809 'id': programme_id,
810 'title': title,
811 'description': description,
812 'duration': duration,
813 'timestamp': timestamp,
814 'formats': formats,
815 'subtitles': subtitles,
816 }
817
def _real_extract(self, url):
    """Extract one video or a playlist of videos from a BBC article page.

    The page is downloaded once and then probed with a series of
    strategies, tried in this order; the first one that yields a result
    wins:

    1. ``playlist.sxml`` embeds and ``data-playable`` attributes
       (returned together as a playlist),
    2. a single vpid referenced directly in the markup,
    3. a Morph ``setPayload`` embed,
    4. ``new SMP(...)`` / ``setPlaylist(...)`` embeds pointing at
       bbc.co.uk programme pages (delegated to the ``BBCCoUk`` extractor),
    5. ``data-media-meta`` attributes, ``mediaAssetPage.init`` data or the
       ``vxp-playlist-data`` script block.

    Returns an info dict for a single video or a playlist result.
    Raises whatever the final ``_search_regex`` raises when no strategy
    finds any media (that last lookup has no ``default``).
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title')
    if not playlist_title:
        playlist_title = self._og_search_title(
            webpage, default=None) or self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
        if playlist_title:
            # Drop the trailing " - BBC ..." site suffix
            playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
    for _, data_playable_json in data_playables:
        data_playable = self._parse_json(
            unescapeHTML(data_playable_json), playlist_id, fatal=False)
        if not data_playable:
            continue
        settings = data_playable.get('settings', {})
        if settings:
            # data-playable with video vpid in settings.playlistObject.items (e.g.
            # http://www.bbc.com/news/world-us-canada-34473351)
            playlist_object = settings.get('playlistObject', {})
            if playlist_object:
                items = playlist_object.get('items')
                if items and isinstance(items, list):
                    title = playlist_object['title']
                    description = playlist_object.get('summary')
                    # Only the first item of the playlist object is used
                    duration = int_or_none(items[0].get('duration'))
                    programme_id = items[0].get('vpid')
                    formats, subtitles = self._download_media_selector(programme_id)
                    self._sort_formats(formats)
                    entries.append({
                        'id': programme_id,
                        'title': title,
                        'description': description,
                        'timestamp': timestamp,
                        'duration': duration,
                        'formats': formats,
                        'subtitles': subtitles,
                    })
        else:
            # data-playable without vpid but with playlist.sxml URLs
            # in otherSettings.playlist (e.g.
            # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
            playlist = data_playable.get('otherSettings', {}).get('playlist', {})
            if playlist:
                entry = None
                for key in ('streaming', 'progressiveDownload'):
                    playlist_url = playlist.get('%sUrl' % key)
                    if not playlist_url:
                        continue
                    try:
                        info = self._extract_from_playlist_sxml(
                            playlist_url, playlist_id, timestamp)
                    except Exception as e:
                        # Some playlist URL may fail with 500, at the same time
                        # the other one may work fine (e.g.
                        # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                        # Not every exception has a ``cause`` attribute, so read
                        # it defensively: anything that is not an HTTP 500 is
                        # re-raised untouched instead of being masked by an
                        # AttributeError raised from this handler.
                        cause = getattr(e, 'cause', None)
                        if isinstance(cause, compat_HTTPError) and cause.code == 500:
                            continue
                        raise
                    if not entry:
                        entry = info
                    else:
                        # Merge the second variant into the first entry
                        entry['title'] = info['title']
                        entry['formats'].extend(info['formats'])
                if entry:
                    self._sort_formats(entry['formats'])
                    entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        self._sort_formats(formats)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # and the non-fatal parse may yield nothing for malformed data, hence
        # the fallback to an empty dict.
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = digital_data.get('page', {}).get('pageInfo', {})
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # Several setPayload calls may be present, but the video seems to
    # always be related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            # The first component carrying a playable id is returned
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    def extract_all(pattern):
        # Parse every regex match as JSON, silently dropping unparsable ones
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # Non-fatal parse: fall back to an empty dict so that malformed
            # data simply yields no medias instead of an AttributeError.
            media_asset_page = self._parse_json(
                media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in media_asset_page.get('videos', {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats:
            continue
        self._sort_formats(formats)

        # Without an external id, derive one from the playlist id (suffixed
        # with the position when there are several medias)
        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1087
1088
1089class BBCCoUkArticleIE(InfoExtractor):
92519402 1090 _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
a65402ef
YCH
1091 IE_NAME = 'bbc.co.uk:article'
1092 IE_DESC = 'BBC articles'
1093
1094 _TEST = {
1095 'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
1096 'info_dict': {
1097 'id': '3jNQLTMrPlYGTBn0WV6M2MS',
1098 'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
1099 'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
1100 },
1101 'playlist_count': 4,
1102 'add_ie': ['BBCCoUk'],
1103 }
1104
1105 def _real_extract(self, url):
1106 playlist_id = self._match_id(url)
1107
1108 webpage = self._download_webpage(url, playlist_id)
1109
1110 title = self._og_search_title(webpage)
1111 description = self._og_search_description(webpage).strip()
1112
1113 entries = [self.url_result(programme_url) for programme_url in re.findall(
1114 r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]
1115
1116 return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1117
1118
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared logic for paginated bbc.co.uk playlist pages.

    This base class reads the following members, which it does not define
    itself and which concrete extractors are therefore expected to provide:

    * ``_VIDEO_ID_TEMPLATE`` - regex template with one ``%s`` slot that is
      filled with ``BBCCoUkIE._ID_REGEX`` and captures a video id,
    * ``_URL_TEMPLATE`` - ``%s`` template turning a video id into a URL,
    * ``_extract_title_and_description(webpage)`` - returns a
      ``(title, description)`` tuple.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a ``url_result`` for every video id found on each page.

        ``webpage`` is the already downloaded first page. When ``url``
        carries an explicit ``page`` query parameter only that page is
        processed; otherwise the "next" pagination link is followed until
        there is none left.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # Loop invariant, so build the id pattern only once
        video_id_re = self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX
        # page_num is the number of the *next* page to download, hence 2
        for page_num in itertools.count(2):
            for video_id in re.findall(video_id_re, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # Pass the notes by keyword: the bare page number used to land in
            # the positional ``errnote`` slot and produced a meaningless
            # error message prefix on download failure.
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                note='Downloading page %d' % page_num,
                errnote='Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Return a playlist whose entries are collected by ``_entries``."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
ded7511a
S
1149
1150
class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for iPlayer ``episodes`` and ``group`` pages."""

    IE_NAME = 'bbc.co.uk:iplayer:playlist'

    # Templates consumed by the base class when collecting the entries
    _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
    _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'

    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    _TESTS = [
        {
            'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
            'info_dict': {
                'id': 'b05rcz9v',
                'title': 'The Disappearance',
                'description': 'French thriller serial about a missing teenager.',
            },
            'playlist_mincount': 6,
            'skip': 'This programme is not currently available on BBC iPlayer',
        },
        {
            # Available for over a year unlike 30 days for most other programmes
            'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
            'info_dict': {
                'id': 'p02tcc32',
                'title': 'Bohemian Icons',
                'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
            },
            'playlist_mincount': 10,
        },
    ]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` scraped from the page markup.

        The title is the text of the first ``<h1>``, the description the
        text of the ``<p class="subtitle">`` element. Both lookups are
        non-fatal.
        """
        return (
            self._search_regex(
                r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False),
            self._search_regex(
                r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
                webpage, 'description', fatal=False, group='value'),
        )
1182
1183
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for programme episodes/broadcasts/clips listings."""

    IE_NAME = 'bbc.co.uk:playlist'

    # Templates consumed by the base class when collecting the entries
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'

    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX

    _TESTS = [
        {
            'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
            'info_dict': {
                'id': 'b05rcz9v',
                'title': 'The Disappearance - Clips - BBC Four',
                'description': 'French thriller serial about a missing teenager.',
            },
            'playlist_mincount': 7,
        },
        {
            # multipage playlist, explicit page
            'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
            'info_dict': {
                'id': 'b00mfl7n',
                'title': 'Frozen Planet - Clips - BBC One',
                'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
            },
            'playlist_mincount': 24,
        },
        {
            # multipage playlist, all pages
            'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
            'info_dict': {
                'id': 'b00mfl7n',
                'title': 'Frozen Planet - Clips - BBC One',
                'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
            },
            'playlist_mincount': 142,
        },
        {
            'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
            'only_matching': True,
        },
        {
            'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
            'only_matching': True,
        },
        {
            'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
            'only_matching': True,
        },
    ]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` taken from the Open Graph tags.

        The title lookup is explicitly non-fatal; the description lookup
        is called with its defaults.
        """
        return (
            self._og_search_title(webpage, fatal=False),
            self._og_search_description(webpage),
        )