]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/bbc.py
[nexx] Improve JS embed extraction
[yt-dlp.git] / youtube_dl / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
9afa1770 4import re
254e64a2 5import itertools
082c6c86 6
f13b1e7d 7from .common import InfoExtractor
8683b4d8 8from ..utils import (
97067db2 9 clean_html,
9fb64c04 10 dict_get,
8683b4d8 11 ExtractorError,
9afa1770 12 float_or_none,
97067db2 13 get_element_by_class,
8683b4d8 14 int_or_none,
9afa1770
S
15 parse_duration,
16 parse_iso8601,
9fb64c04 17 try_get,
dab062fb 18 unescapeHTML,
97067db2
S
19 urlencode_postdata,
20 urljoin,
8683b4d8 21)
36e6f62c
JMF
22from ..compat import (
23 compat_etree_fromstring,
24 compat_HTTPError,
254e64a2 25 compat_urlparse,
36e6f62c 26)
082c6c86 27
d12a1a47 28
f13b1e7d 29class BBCCoUkIE(InfoExtractor):
082c6c86 30 IE_NAME = 'bbc.co.uk'
2e3fd9ec 31 IE_DESC = 'BBC iPlayer'
22d7368d 32 _ID_REGEX = r'[pb][\da-z]{7}'
f20a11ed
S
33 _VALID_URL = r'''(?x)
34 https?://
35 (?:www\.)?bbc\.co\.uk/
36 (?:
37 programmes/(?!articles/)|
38 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
72d256c4 39 music/(?:clips|audiovideo/popular)[/#]|
f20a11ed
S
40 radio/player/
41 )
ded7511a 42 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
f20a11ed 43 ''' % _ID_REGEX
082c6c86 44
97067db2
S
45 _LOGIN_URL = 'https://account.bbc.com/signin'
46 _NETRC_MACHINE = 'bbc'
47
d12a1a47 48 _MEDIASELECTOR_URLS = [
26ccc68b
S
49 # Provides HQ HLS streams with even better quality than pc mediaset but fails
50 # with geolocation in some cases even when it's not geo restricted at all (e.g.
d781e293 51 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
d1c694ea 52 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
53 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
54 ]
a8b081a0 55
e6174ee9
S
56 _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
57 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
58
59 _NAMESPACES = (
60 _MEDIASELECTION_NS,
61 _EMP_PLAYLIST_NS,
62 )
63
2e3fd9ec
S
64 _TESTS = [
65 {
f2d0fc68 66 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 67 'info_dict': {
f2d0fc68 68 'id': 'b039d07m',
b1ea6802 69 'ext': 'flv',
679bacf0 70 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4',
c4914185 71 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
72 },
73 'params': {
b1ea6802 74 # rtmp download
2e3fd9ec
S
75 'skip_download': True,
76 }
082c6c86 77 },
2e3fd9ec
S
78 {
79 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
80 'info_dict': {
81 'id': 'b00yng1d',
82 'ext': 'flv',
83 'title': 'The Man in Black: Series 3: The Printed Name',
84 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
85 'duration': 1800,
86 },
87 'params': {
88 # rtmp download
89 'skip_download': True,
c7f0177f
S
90 },
91 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
92 },
93 {
94 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
95 'info_dict': {
96 'id': 'b00yng1d',
97 'ext': 'flv',
17968e44 98 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 99 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 100 'duration': 5100,
2e3fd9ec
S
101 },
102 'params': {
103 # rtmp download
104 'skip_download': True,
105 },
b1ea6802 106 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
107 },
108 {
109 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
110 'info_dict': {
111 'id': 'b03k3pb7',
112 'ext': 'flv',
113 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
114 'description': '2. Invasion',
115 'duration': 3600,
116 },
117 'params': {
118 # rtmp download
119 'skip_download': True,
120 },
b1ea6802 121 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
122 }, {
123 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
124 'info_dict': {
125 'id': 'b04v209v',
126 'ext': 'flv',
127 'title': 'Pete Tong, The Essential New Tune Special',
128 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
129 'duration': 10800,
130 },
131 'params': {
132 # rtmp download
133 'skip_download': True,
a3ef0e1c
YCH
134 },
135 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 136 }, {
5aa535c3 137 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
138 'note': 'Audio',
139 'info_dict': {
5aa535c3 140 'id': 'p022h44j',
b1ea6802 141 'ext': 'flv',
5aa535c3
S
142 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
143 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
144 'duration': 227,
c7e67594
S
145 },
146 'params': {
b1ea6802 147 # rtmp download
c7e67594
S
148 'skip_download': True,
149 }
150 }, {
151 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
152 'note': 'Video',
153 'info_dict': {
154 'id': 'p025c103',
b1ea6802 155 'ext': 'flv',
c7e67594
S
156 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
157 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
158 'duration': 226,
159 },
160 'params': {
b1ea6802 161 # rtmp download
c7e67594
S
162 'skip_download': True,
163 }
e68ae99a
S
164 }, {
165 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
166 'info_dict': {
167 'id': 'p02n76xf',
168 'ext': 'flv',
169 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
170 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
171 'duration': 3540,
172 },
173 'params': {
174 # rtmp download
175 'skip_download': True,
176 },
b1ea6802 177 'skip': 'geolocation',
25fa8d66
YCH
178 }, {
179 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
180 'info_dict': {
181 'id': 'b05zmgw1',
182 'ext': 'flv',
183 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
184 'title': 'Royal Academy Summer Exhibition',
185 'duration': 3540,
186 },
187 'params': {
188 # rtmp download
189 'skip_download': True,
190 },
b1ea6802 191 'skip': 'geolocation',
54914380
S
192 }, {
193 # iptv-all mediaset fails with geolocation however there is no geo restriction
194 # for this programme at all
5aa535c3 195 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 196 'info_dict': {
5aa535c3 197 'id': 'b06rkms3',
54914380 198 'ext': 'flv',
5aa535c3
S
199 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
200 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
201 },
202 'params': {
203 # rtmp download
204 'skip_download': True,
205 },
b1ea6802 206 'skip': 'Now it\'s really geo-restricted',
1ac6e794
S
207 }, {
208 # compact player (https://github.com/rg3/youtube-dl/issues/8147)
209 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
210 'info_dict': {
211 'id': 'p028bfkj',
b1ea6802 212 'ext': 'flv',
1ac6e794
S
213 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
214 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
215 },
216 'params': {
b1ea6802 217 # rtmp download
1ac6e794
S
218 'skip_download': True,
219 },
31763975
S
220 }, {
221 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
222 'only_matching': True,
c7e67594
S
223 }, {
224 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
225 'only_matching': True,
0692ef86
S
226 }, {
227 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
228 'only_matching': True,
f20a11ed
S
229 }, {
230 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
231 'only_matching': True,
72d256c4
S
232 }, {
233 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
234 'only_matching': True,
235 }]
2e3fd9ec 236
97eb9bd2
RA
237 _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
238
97067db2
S
239 def _login(self):
240 username, password = self._get_login_info()
241 if username is None:
242 return
243
244 login_page = self._download_webpage(
245 self._LOGIN_URL, None, 'Downloading signin page')
246
247 login_form = self._hidden_inputs(login_page)
248
249 login_form.update({
250 'username': username,
251 'password': password,
252 })
253
254 post_url = urljoin(self._LOGIN_URL, self._search_regex(
255 r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
256 'post url', default=self._LOGIN_URL, group='url'))
257
258 response, urlh = self._download_webpage_handle(
259 post_url, None, 'Logging in', data=urlencode_postdata(login_form),
260 headers={'Referer': self._LOGIN_URL})
261
262 if self._LOGIN_URL in urlh.geturl():
263 error = clean_html(get_element_by_class('form-message', response))
264 if error:
265 raise ExtractorError(
266 'Unable to login: %s' % error, expected=True)
267 raise ExtractorError('Unable to log in')
268
269 def _real_initialize(self):
270 self._login()
271
d12a1a47
S
class MediaSelectionError(Exception):
    """Signals an error reported by a media selection document.

    The identifier given to the constructor is kept on the ``id``
    attribute, which callers inspect to decide how to proceed.
    """

    def __init__(self, id):
        self.id = id
275
2e3fd9ec
S
276 def _extract_asx_playlist(self, connection, programme_id):
277 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
278 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
279
2e3fd9ec 280 def _extract_items(self, playlist):
e6174ee9
S
281 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
282
283 def _findall_ns(self, element, xpath):
284 elements = []
285 for ns in self._NAMESPACES:
286 elements.extend(element.findall(xpath % ns))
287 return elements
2e3fd9ec
S
288
289 def _extract_medias(self, media_selection):
e6174ee9
S
290 error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
291 if error is None:
292 media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
c056efa2 293 if error is not None:
d12a1a47 294 raise BBCCoUkIE.MediaSelectionError(error.get('id'))
e6174ee9 295 return self._findall_ns(media_selection, './{%s}media')
2e3fd9ec
S
296
297 def _extract_connections(self, media):
e6174ee9 298 return self._findall_ns(media, './{%s}connection')
2e3fd9ec 299
f13b1e7d 300 def _get_subtitles(self, media, programme_id):
2e3fd9ec
S
301 subtitles = {}
302 for connection in self._extract_connections(media):
303 captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
304 lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
f13b1e7d
JMF
305 subtitles[lang] = [
306 {
307 'url': connection.get('href'),
308 'ext': 'ttml',
309 },
f13b1e7d 310 ]
2e3fd9ec 311 return subtitles
082c6c86 312
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Raise an expected ``ExtractorError`` naming this extractor and the error id."""
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
317
c056efa2 318 def _download_media_selector(self, programme_id):
d12a1a47
S
319 last_exception = None
320 for mediaselector_url in self._MEDIASELECTOR_URLS:
321 try:
322 return self._download_media_selector_url(
323 mediaselector_url % programme_id, programme_id)
324 except BBCCoUkIE.MediaSelectionError as e:
d781e293 325 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
d12a1a47
S
326 last_exception = e
327 continue
328 self._raise_extractor_error(e)
329 self._raise_extractor_error(last_exception)
9afa1770
S
330
331 def _download_media_selector_url(self, url, programme_id=None):
c056efa2
S
332 try:
333 media_selection = self._download_xml(
9afa1770 334 url, programme_id, 'Downloading media selection XML')
c056efa2 335 except ExtractorError as ee:
d781e293 336 if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404):
36e6f62c 337 media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))
2e3fd9ec 338 else:
c056efa2 339 raise
9afa1770 340 return self._process_media_selector(media_selection, programme_id)
082c6c86 341
9afa1770 342 def _process_media_selector(self, media_selection, programme_id):
082c6c86 343 formats = []
2e3fd9ec 344 subtitles = None
b0af1215 345 urls = []
2e3fd9ec 346
c056efa2
S
347 for media in self._extract_medias(media_selection):
348 kind = media.get('kind')
a7e5f274
RA
349 if kind in ('video', 'audio'):
350 bitrate = int_or_none(media.get('bitrate'))
351 encoding = media.get('encoding')
352 service = media.get('service')
353 width = int_or_none(media.get('width'))
354 height = int_or_none(media.get('height'))
355 file_size = int_or_none(media.get('media_file_size'))
356 for connection in self._extract_connections(media):
b0af1215
RA
357 href = connection.get('href')
358 if href in urls:
359 continue
360 if href:
361 urls.append(href)
a7e5f274
RA
362 conn_kind = connection.get('kind')
363 protocol = connection.get('protocol')
364 supplier = connection.get('supplier')
a7e5f274
RA
365 transfer_format = connection.get('transferFormat')
366 format_id = supplier or conn_kind or protocol
367 if service:
368 format_id = '%s_%s' % (service, format_id)
369 # ASX playlist
370 if supplier == 'asx':
371 for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
372 formats.append({
373 'url': ref,
374 'format_id': 'ref%s_%s' % (i, format_id),
375 })
376 elif transfer_format == 'dash':
377 formats.extend(self._extract_mpd_formats(
378 href, programme_id, mpd_id=format_id, fatal=False))
379 elif transfer_format == 'hls':
380 formats.extend(self._extract_m3u8_formats(
381 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
382 m3u8_id=format_id, fatal=False))
97eb9bd2
RA
383 if re.search(self._USP_RE, href):
384 usp_formats = self._extract_m3u8_formats(
385 re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href),
386 programme_id, ext='mp4', entry_protocol='m3u8_native',
387 m3u8_id=format_id, fatal=False)
388 for f in usp_formats:
389 if f.get('height') and f['height'] > 720:
390 continue
391 formats.append(f)
a7e5f274
RA
392 elif transfer_format == 'hds':
393 formats.extend(self._extract_f4m_formats(
394 href, programme_id, f4m_id=format_id, fatal=False))
395 else:
f9622868 396 if not service and not supplier and bitrate:
aaa42cf0 397 format_id += '-%d' % bitrate
a7e5f274
RA
398 fmt = {
399 'format_id': format_id,
400 'filesize': file_size,
401 }
402 if kind == 'video':
403 fmt.update({
404 'width': width,
405 'height': height,
6240925b 406 'tbr': bitrate,
a7e5f274
RA
407 'vcodec': encoding,
408 })
409 else:
410 fmt.update({
411 'abr': bitrate,
412 'acodec': encoding,
413 'vcodec': 'none',
414 })
1af959ef 415 if protocol in ('http', 'https'):
a7e5f274
RA
416 # Direct link
417 fmt.update({
418 'url': href,
419 })
420 elif protocol == 'rtmp':
421 application = connection.get('application', 'ondemand')
422 auth_string = connection.get('authString')
423 identifier = connection.get('identifier')
424 server = connection.get('server')
425 fmt.update({
426 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
427 'play_path': identifier,
428 'app': '%s?%s' % (application, auth_string),
429 'page_url': 'http://www.bbc.co.uk',
430 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
431 'rtmp_live': False,
432 'ext': 'flv',
433 })
964744af
S
434 else:
435 continue
a7e5f274 436 formats.append(fmt)
c056efa2 437 elif kind == 'captions':
f13b1e7d 438 subtitles = self.extract_subtitles(media, programme_id)
c056efa2 439 return formats, subtitles
2e3fd9ec 440
ae6986fb
S
441 def _download_playlist(self, playlist_id):
442 try:
443 playlist = self._download_json(
444 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
445 playlist_id, 'Downloading playlist JSON')
446
447 version = playlist.get('defaultAvailableVersion')
448 if version:
449 smp_config = version['smpConfig']
450 title = smp_config['title']
451 description = smp_config['summary']
452 for item in smp_config['items']:
453 kind = item['kind']
40fcba5e 454 if kind not in ('programme', 'radioProgramme'):
ae6986fb
S
455 continue
456 programme_id = item.get('vpid')
d97f5cd7 457 duration = int_or_none(item.get('duration'))
ae6986fb
S
458 formats, subtitles = self._download_media_selector(programme_id)
459 return programme_id, title, description, duration, formats, subtitles
460 except ExtractorError as ee:
f813928e 461 if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
ae6986fb
S
462 raise
463
464 # fallback to legacy playlist
9afa1770
S
465 return self._process_legacy_playlist(playlist_id)
466
467 def _process_legacy_playlist_url(self, url, display_id):
468 playlist = self._download_legacy_playlist_url(url, display_id)
469 return self._extract_from_legacy_playlist(playlist, display_id)
470
471 def _process_legacy_playlist(self, playlist_id):
472 return self._process_legacy_playlist_url(
473 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
474
475 def _download_legacy_playlist_url(self, url, playlist_id=None):
476 return self._download_xml(
477 url, playlist_id, 'Downloading legacy playlist XML')
ae6986fb 478
9afa1770 479 def _extract_from_legacy_playlist(self, playlist, playlist_id):
e6174ee9 480 no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
ae6986fb
S
481 if no_items is not None:
482 reason = no_items.get('reason')
483 if reason == 'preAvailability':
484 msg = 'Episode %s is not yet available' % playlist_id
485 elif reason == 'postAvailability':
486 msg = 'Episode %s is no longer available' % playlist_id
487 elif reason == 'noMedia':
488 msg = 'Episode %s is not currently available' % playlist_id
489 else:
490 msg = 'Episode %s is not available: %s' % (playlist_id, reason)
491 raise ExtractorError(msg, expected=True)
492
493 for item in self._extract_items(playlist):
494 kind = item.get('kind')
40fcba5e 495 if kind not in ('programme', 'radioProgramme'):
ae6986fb 496 continue
e6174ee9
S
497 title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
498 description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
8daeeedc 499 description = description_el.text if description_el is not None else None
9afa1770
S
500
501 def get_programme_id(item):
502 def get_from_attributes(item):
503 for p in('identifier', 'group'):
504 value = item.get(p)
505 if value and re.match(r'^[pb][\da-z]{7}$', value):
506 return value
507 get_from_attributes(item)
e6174ee9 508 mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
9afa1770
S
509 if mediator is not None:
510 return get_from_attributes(mediator)
511
512 programme_id = get_programme_id(item)
d97f5cd7 513 duration = int_or_none(item.get('duration'))
e6174ee9
S
514
515 if programme_id:
516 formats, subtitles = self._download_media_selector(programme_id)
517 else:
518 formats, subtitles = self._process_media_selector(item, playlist_id)
519 programme_id = playlist_id
ae6986fb
S
520
521 return programme_id, title, description, duration, formats, subtitles
522
c056efa2
S
523 def _real_extract(self, url):
524 group_id = self._match_id(url)
525
526 webpage = self._download_webpage(url, group_id, 'Downloading video page')
527
b2ed954f
S
528 error = self._search_regex(
529 r'<div\b[^>]+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<',
530 webpage, 'error', default=None)
531 if error:
532 raise ExtractorError(error, expected=True)
533
8683b4d8 534 programme_id = None
679bacf0 535 duration = None
8683b4d8
S
536
537 tviplayer = self._search_regex(
538 r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
539 webpage, 'player', default=None)
540
541 if tviplayer:
542 player = self._parse_json(tviplayer, group_id).get('player', {})
543 duration = int_or_none(player.get('duration'))
544 programme_id = player.get('vpid')
545
546 if not programme_id:
547 programme_id = self._search_regex(
22d7368d 548 r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
8683b4d8 549
c056efa2 550 if programme_id:
c056efa2 551 formats, subtitles = self._download_media_selector(programme_id)
88fb59d9 552 title = self._og_search_title(webpage, default=None) or self._html_search_regex(
50e989e2
S
553 (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
554 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
8683b4d8 555 description = self._search_regex(
a8534274
S
556 (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
557 r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
679bacf0
YCH
558 webpage, 'description', default=None)
559 if not description:
560 description = self._html_search_meta('description', webpage)
c056efa2 561 else:
ae6986fb 562 programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
2e3fd9ec 563
082c6c86
S
564 self._sort_formats(formats)
565
566 return {
2e3fd9ec 567 'id': programme_id,
082c6c86
S
568 'title': title,
569 'description': description,
650cfd0c 570 'thumbnail': self._og_search_thumbnail(webpage, default=None),
082c6c86
S
571 'duration': duration,
572 'formats': formats,
2e3fd9ec 573 'subtitles': subtitles,
5f6a1245 574 }
10273d6e 575
576
9afa1770
S
577class BBCIE(BBCCoUkIE):
578 IE_NAME = 'bbc'
579 IE_DESC = 'BBC'
580 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 581
d12a1a47 582 _MEDIASELECTOR_URLS = [
55ebae26
S
583 # Provides HQ HLS streams but fails with geolocation in some cases when it's
584 # even not geo restricted at all
585 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
586 # Provides more formats, namely direct mp4 links, but fails on some videos with
587 # notukerror for non UK (?) users (e.g.
588 # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
589 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
590 # Provides fewer formats, but works everywhere for everybody (hopefully)
591 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
592 ]
10273d6e 593
594 _TESTS = [{
6a747190 595 # article with multiple videos embedded with data-playable containing vpids
10273d6e 596 'url': 'http://www.bbc.com/news/world-europe-32668511',
597 'info_dict': {
598 'id': 'world-europe-32668511',
599 'title': 'Russia stages massive WW2 parade despite Western boycott',
9afa1770 600 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 601 },
602 'playlist_count': 2,
a3bfddfa 603 }, {
6a747190 604 # article with multiple videos embedded with data-playable (more videos)
10273d6e 605 'url': 'http://www.bbc.com/news/business-28299555',
606 'info_dict': {
607 'id': 'business-28299555',
608 'title': 'Farnborough Airshow: Video highlights',
9afa1770 609 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 610 },
611 'playlist_count': 9,
9afa1770 612 'skip': 'Save time',
88ed52ae
S
613 }, {
614 # article with multiple videos embedded with `new SMP()`
6a747190 615 # broken
88ed52ae
S
616 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
617 'info_dict': {
618 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 619 'title': 'BUGGER',
88ed52ae
S
620 },
621 'playlist_count': 18,
a3bfddfa 622 }, {
6a747190 623 # single video embedded with data-playable containing vpid
10273d6e 624 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 625 'info_dict': {
626 'id': 'p02mprgb',
55ebae26 627 'ext': 'mp4',
10273d6e 628 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 629 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 630 'duration': 47,
9afa1770 631 'timestamp': 1427219242,
da92eeae 632 'upload_date': '20150324',
10273d6e 633 },
634 'params': {
9afa1770 635 # rtmp download
10273d6e 636 'skip_download': True,
637 }
a3bfddfa 638 }, {
6a747190
S
639 # article with single video embedded with data-playable containing XML playlist
640 # with direct video links as progressiveDownloadUrl (for now these are extracted)
641 # and playlist with f4m and m3u8 as streamingUrl
de939d89 642 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 643 'info_dict': {
9afa1770 644 'id': '150615_telabyad_kentin_cogu',
de939d89 645 'ext': 'mp4',
ad152e2d 646 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 647 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 648 'timestamp': 1434397334,
da92eeae 649 'upload_date': '20150615',
de939d89 650 },
651 'params': {
652 'skip_download': True,
653 }
c936d8cc 654 }, {
6a747190 655 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 656 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 657 'info_dict': {
9afa1770 658 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 659 'ext': 'mp4',
9afa1770 660 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 661 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 662 'timestamp': 1434713142,
da92eeae 663 'upload_date': '20150619',
de939d89 664 },
665 'params': {
666 'skip_download': True,
667 }
a346b1ff
S
668 }, {
669 # single video from video playlist embedded with vxp-playlist-data JSON
670 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
671 'info_dict': {
672 'id': 'p02w6qjc',
55ebae26 673 'ext': 'mp4',
a346b1ff
S
674 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
675 'duration': 56,
0bc4ee60 676 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
677 },
678 'params': {
679 'skip_download': True,
680 }
9afa1770
S
681 }, {
682 # single video story with digitalData
683 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
684 'info_dict': {
685 'id': 'p02q6gc4',
686 'ext': 'flv',
687 'title': 'Sri Lanka’s spicy secret',
688 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
689 'timestamp': 1437674293,
690 'upload_date': '20150723',
691 },
692 'params': {
693 # rtmp download
694 'skip_download': True,
695 }
696 }, {
697 # single video story without digitalData
698 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
699 'info_dict': {
700 'id': 'p018zqqg',
55ebae26 701 'ext': 'mp4',
9afa1770
S
702 'title': 'Hyundai Santa Fe Sport: Rock star',
703 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
704 'timestamp': 1415867444,
705 'upload_date': '20141113',
9afa1770
S
706 },
707 'params': {
708 # rtmp download
709 'skip_download': True,
710 }
9fb64c04
S
711 }, {
712 # single video embedded with Morph
713 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
714 'info_dict': {
715 'id': 'p041vhd0',
716 'ext': 'mp4',
717 'title': "Nigeria v Japan - Men's First Round",
718 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
719 'duration': 7980,
720 'uploader': 'BBC Sport',
721 'uploader_id': 'bbc_sport',
722 },
723 'params': {
724 # m3u8 download
725 'skip_download': True,
9fb64c04
S
726 },
727 'skip': 'Georestricted to UK',
9afa1770 728 }, {
6a747190 729 # single video with playlist.sxml URL in playlist param
9afa1770
S
730 'url': 'http://www.bbc.com/sport/0/football/33653409',
731 'info_dict': {
732 'id': 'p02xycnp',
55ebae26 733 'ext': 'mp4',
9afa1770 734 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 735 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
736 'duration': 140,
737 },
738 'params': {
739 # rtmp download
740 'skip_download': True,
741 }
b5d48cb1 742 }, {
6a747190 743 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
744 'url': 'http://www.bbc.com/sport/0/football/34475836',
745 'info_dict': {
746 'id': '34475836',
450b233c 747 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 748 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
749 },
750 'playlist_count': 3,
450b233c
S
751 }, {
752 # school report article with single video
753 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
754 'info_dict': {
755 'id': '35744779',
756 'title': 'School which breaks down barriers in Jerusalem',
757 },
758 'playlist_count': 1,
9afa1770
S
759 }, {
760 # single video with playlist URL from weather section
761 'url': 'http://www.bbc.com/weather/features/33601775',
762 'only_matching': True,
763 }, {
764 # custom redirection to www.bbc.com
765 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
766 'only_matching': True,
a1cf3e38
S
767 }, {
768 # single video article embedded with data-media-vpid
769 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
770 'only_matching': True,
10273d6e 771 }]
772
9afa1770
S
@classmethod
def suitable(cls, url):
    """Accept *url* only when none of the more specific BBC extractors does.

    Returns ``False`` as soon as one of the excluded extractors reports
    the URL as suitable; otherwise defers to the inherited check.
    """
    excluded_extractors = (
        BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
    for extractor in excluded_extractors:
        if extractor.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
778
779 def _extract_from_media_meta(self, media_meta, video_id):
780 # Direct links to media in media metadata (e.g.
781 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
782 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
783 source_files = media_meta.get('sourceFiles')
784 if source_files:
785 return [{
786 'url': f['url'],
787 'format_id': format_id,
788 'ext': f.get('encoding'),
789 'tbr': float_or_none(f.get('bitrate'), 1000),
790 'filesize': int_or_none(f.get('filesize')),
791 } for format_id, f in source_files.items() if f.get('url')], []
792
793 programme_id = media_meta.get('externalId')
794 if programme_id:
795 return self._download_media_selector(programme_id)
796
797 # Process playlist.sxml as legacy playlist
798 href = media_meta.get('href')
799 if href:
800 playlist = self._download_legacy_playlist_url(href)
801 _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
802 return formats, subtitles
803
804 return [], []
805
baf39a1a
S
806 def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
807 programme_id, title, description, duration, formats, subtitles = \
808 self._process_legacy_playlist_url(url, playlist_id)
809 self._sort_formats(formats)
810 return {
811 'id': programme_id,
812 'title': title,
813 'description': description,
814 'duration': duration,
815 'timestamp': timestamp,
816 'formats': formats,
817 'subtitles': subtitles,
818 }
819
def _real_extract(self, url):
    """Extract one video or a playlist of videos from a BBC article page.

    The page is downloaded once and the following embed styles are tried
    in order, returning as soon as one of them yields a result:

    1. playlist.sxml embeds and ``data-playable`` attributes (playlist);
    2. a single vpid found in the markup (single video);
    3. a Morph ``setPayload`` lead media component (single video);
    4. SMP / ``setPlaylist`` embed URLs delegated to BBCCoUk (playlist);
    5. ``data-media-meta``, ``mediaAssetPage.init`` or ``vxp-playlist-data``
       media descriptions (playlist).
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title')
    if not playlist_title:
        playlist_title = self._og_search_title(
            webpage, default=None) or self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
        if playlist_title:
            # Drop the trailing " - BBC ..." site suffix
            playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
    for _, data_playable_json in data_playables:
        data_playable = self._parse_json(
            unescapeHTML(data_playable_json), playlist_id, fatal=False)
        if not data_playable:
            continue
        settings = data_playable.get('settings', {})
        if settings:
            # data-playable with video vpid in settings.playlistObject.items (e.g.
            # http://www.bbc.com/news/world-us-canada-34473351)
            playlist_object = settings.get('playlistObject', {})
            if playlist_object:
                items = playlist_object.get('items')
                if items and isinstance(items, list):
                    title = playlist_object['title']
                    description = playlist_object.get('summary')
                    duration = int_or_none(items[0].get('duration'))
                    programme_id = items[0].get('vpid')
                    formats, subtitles = self._download_media_selector(programme_id)
                    self._sort_formats(formats)
                    entries.append({
                        'id': programme_id,
                        'title': title,
                        'description': description,
                        'timestamp': timestamp,
                        'duration': duration,
                        'formats': formats,
                        'subtitles': subtitles,
                    })
        else:
            # data-playable without vpid but with a playlist.sxml URLs
            # in otherSettings.playlist (e.g.
            # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
            playlist = data_playable.get('otherSettings', {}).get('playlist', {})
            if playlist:
                entry = None
                for key in ('streaming', 'progressiveDownload'):
                    playlist_url = playlist.get('%sUrl' % key)
                    if not playlist_url:
                        continue
                    try:
                        info = self._extract_from_playlist_sxml(
                            playlist_url, playlist_id, timestamp)
                    except ExtractorError as e:
                        # Some playlist URL may fail with 500, at the same time
                        # the other one may work fine (e.g.
                        # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu).
                        # Only ExtractorError is caught so that unrelated
                        # exceptions are not masked by a failing `.cause` lookup.
                        if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
                            continue
                        raise
                    if not entry:
                        entry = info
                    else:
                        # Merge the second playlist into the first entry
                        entry['title'] = info['title']
                        entry['formats'].extend(info['formats'])
                if entry:
                    self._sort_formats(entry['formats'])
                    entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        self._sort_formats(formats)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # or may not be valid JSON; parsing is non-fatal, so fall back to
        # an empty dict instead of calling .get on a falsy result.
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = digital_data.get('page', {}).get('pageInfo', {})
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # There are several setPayload calls may be present but the video
    # seems to be always related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    def extract_all(pattern):
        # Parse every regex match as JSON, dropping the ones that fail to parse
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # Non-fatal parse: treat unparsable JSON as "no videos"
            media_asset_page = self._parse_json(
                media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in media_asset_page.get('videos', {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats:
            continue
        self._sort_formats(formats)

        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1089
1090
class BBCCoUkArticleIE(InfoExtractor):
    """Extractor for bbc.co.uk/programmes/articles/ pages.

    Every ``<div typeof="Clip" resource="...">`` found on the page becomes
    a playlist entry that is resolved by another extractor.
    """

    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist of the clips referenced by the article page."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # The description lookup may yield None (no og:description on the
        # page), so only strip when there is something to strip.
        description = self._og_search_description(webpage)
        if description:
            description = description.strip()

        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1119
1120
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared implementation for paginated bbc.co.uk playlist pages.

    Subclasses supply ``_URL_TEMPLATE``, ``_VIDEO_ID_TEMPLATE`` and
    ``_extract_title_and_description``.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a url_result for every video id found, following pagination.

        When the URL carries an explicit ``page`` query parameter only that
        page is processed; otherwise "next" links are followed until none
        is left.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # The first page is already downloaded, so numbering of the pages
        # fetched here starts at 2
        for page_num in itertools.count(2):
            for video_id in re.findall(
                    self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # The fourth positional argument is the error note, so pass a
            # descriptive message rather than the bare page number
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                'Downloading page %d' % page_num,
                'Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Return a playlist whose entries are produced lazily by ``_entries``."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
ded7511a
S
1151
1152
class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for bbc.co.uk/iplayer ``episodes`` and ``group`` pages."""

    IE_NAME = 'bbc.co.uk:iplayer:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
    _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
    _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 6,
        'skip': 'This programme is not currently available on BBC iPlayer',
    }, {
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` scraped from the listing page.

        The title is taken from the ``<h1>`` element and the description
        from the ``<p class="subtitle">`` element; both lookups are
        non-fatal.
        """
        heading = self._search_regex(
            r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
        summary = self._search_regex(
            r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
            webpage, 'description', fatal=False, group='value')
        return heading, summary
1184
1185
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for bbc.co.uk/programmes episodes, broadcasts and clips pages."""

    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` read from the page's Open Graph metadata."""
        return (
            self._og_search_title(webpage, fatal=False),
            self._og_search_description(webpage),
        )