]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/bbc.py
[cbsnews] Actualize test
[yt-dlp.git] / youtube_dl / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
9afa1770 4import re
254e64a2 5import itertools
082c6c86 6
f13b1e7d 7from .common import InfoExtractor
8683b4d8 8from ..utils import (
97067db2 9 clean_html,
9fb64c04 10 dict_get,
8683b4d8 11 ExtractorError,
9afa1770 12 float_or_none,
97067db2 13 get_element_by_class,
8683b4d8 14 int_or_none,
9afa1770
S
15 parse_duration,
16 parse_iso8601,
9fb64c04 17 try_get,
dab062fb 18 unescapeHTML,
97067db2
S
19 urlencode_postdata,
20 urljoin,
8683b4d8 21)
36e6f62c
JMF
22from ..compat import (
23 compat_etree_fromstring,
24 compat_HTTPError,
254e64a2 25 compat_urlparse,
36e6f62c 26)
082c6c86 27
d12a1a47 28
f13b1e7d 29class BBCCoUkIE(InfoExtractor):
082c6c86 30 IE_NAME = 'bbc.co.uk'
2e3fd9ec 31 IE_DESC = 'BBC iPlayer'
22d7368d 32 _ID_REGEX = r'[pb][\da-z]{7}'
f20a11ed
S
33 _VALID_URL = r'''(?x)
34 https?://
35 (?:www\.)?bbc\.co\.uk/
36 (?:
37 programmes/(?!articles/)|
38 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
39 music/clips[/#]|
40 radio/player/
41 )
ded7511a 42 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
f20a11ed 43 ''' % _ID_REGEX
082c6c86 44
97067db2
S
45 _LOGIN_URL = 'https://account.bbc.com/signin'
46 _NETRC_MACHINE = 'bbc'
47
d12a1a47 48 _MEDIASELECTOR_URLS = [
26ccc68b
S
49 # Provides HQ HLS streams with even better quality than pc mediaset but fails
50 # with geolocation in some cases when it's even not geo restricted at all (e.g.
d781e293 51 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
d1c694ea 52 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
53 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
54 ]
a8b081a0 55
e6174ee9
S
56 _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
57 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
58
59 _NAMESPACES = (
60 _MEDIASELECTION_NS,
61 _EMP_PLAYLIST_NS,
62 )
63
2e3fd9ec
S
64 _TESTS = [
65 {
f2d0fc68 66 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 67 'info_dict': {
f2d0fc68 68 'id': 'b039d07m',
b1ea6802 69 'ext': 'flv',
679bacf0 70 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4',
c4914185 71 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
72 },
73 'params': {
b1ea6802 74 # rtmp download
2e3fd9ec
S
75 'skip_download': True,
76 }
082c6c86 77 },
2e3fd9ec
S
78 {
79 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
80 'info_dict': {
81 'id': 'b00yng1d',
82 'ext': 'flv',
83 'title': 'The Man in Black: Series 3: The Printed Name',
84 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
85 'duration': 1800,
86 },
87 'params': {
88 # rtmp download
89 'skip_download': True,
c7f0177f
S
90 },
91 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
92 },
93 {
94 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
95 'info_dict': {
96 'id': 'b00yng1d',
97 'ext': 'flv',
17968e44 98 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 99 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 100 'duration': 5100,
2e3fd9ec
S
101 },
102 'params': {
103 # rtmp download
104 'skip_download': True,
105 },
b1ea6802 106 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
107 },
108 {
109 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
110 'info_dict': {
111 'id': 'b03k3pb7',
112 'ext': 'flv',
113 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
114 'description': '2. Invasion',
115 'duration': 3600,
116 },
117 'params': {
118 # rtmp download
119 'skip_download': True,
120 },
b1ea6802 121 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
122 }, {
123 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
124 'info_dict': {
125 'id': 'b04v209v',
126 'ext': 'flv',
127 'title': 'Pete Tong, The Essential New Tune Special',
128 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
129 'duration': 10800,
130 },
131 'params': {
132 # rtmp download
133 'skip_download': True,
a3ef0e1c
YCH
134 },
135 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 136 }, {
5aa535c3 137 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
138 'note': 'Audio',
139 'info_dict': {
5aa535c3 140 'id': 'p022h44j',
b1ea6802 141 'ext': 'flv',
5aa535c3
S
142 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
143 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
144 'duration': 227,
c7e67594
S
145 },
146 'params': {
b1ea6802 147 # rtmp download
c7e67594
S
148 'skip_download': True,
149 }
150 }, {
151 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
152 'note': 'Video',
153 'info_dict': {
154 'id': 'p025c103',
b1ea6802 155 'ext': 'flv',
c7e67594
S
156 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
157 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
158 'duration': 226,
159 },
160 'params': {
b1ea6802 161 # rtmp download
c7e67594
S
162 'skip_download': True,
163 }
e68ae99a
S
164 }, {
165 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
166 'info_dict': {
167 'id': 'p02n76xf',
168 'ext': 'flv',
169 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
170 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
171 'duration': 3540,
172 },
173 'params': {
174 # rtmp download
175 'skip_download': True,
176 },
b1ea6802 177 'skip': 'geolocation',
25fa8d66
YCH
178 }, {
179 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
180 'info_dict': {
181 'id': 'b05zmgw1',
182 'ext': 'flv',
183 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
184 'title': 'Royal Academy Summer Exhibition',
185 'duration': 3540,
186 },
187 'params': {
188 # rtmp download
189 'skip_download': True,
190 },
b1ea6802 191 'skip': 'geolocation',
54914380
S
192 }, {
193 # iptv-all mediaset fails with geolocation however there is no geo restriction
194 # for this programme at all
5aa535c3 195 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 196 'info_dict': {
5aa535c3 197 'id': 'b06rkms3',
54914380 198 'ext': 'flv',
5aa535c3
S
199 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
200 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
201 },
202 'params': {
203 # rtmp download
204 'skip_download': True,
205 },
b1ea6802 206 'skip': 'Now it\'s really geo-restricted',
1ac6e794
S
207 }, {
208 # compact player (https://github.com/rg3/youtube-dl/issues/8147)
209 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
210 'info_dict': {
211 'id': 'p028bfkj',
b1ea6802 212 'ext': 'flv',
1ac6e794
S
213 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
214 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
215 },
216 'params': {
b1ea6802 217 # rtmp download
1ac6e794
S
218 'skip_download': True,
219 },
31763975
S
220 }, {
221 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
222 'only_matching': True,
c7e67594
S
223 }, {
224 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
225 'only_matching': True,
0692ef86
S
226 }, {
227 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
228 'only_matching': True,
f20a11ed
S
229 }, {
230 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
231 'only_matching': True,
ae6986fb 232 }
2e3fd9ec
S
233 ]
234
97eb9bd2
RA
235 _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
236
97067db2
S
237 def _login(self):
238 username, password = self._get_login_info()
239 if username is None:
240 return
241
242 login_page = self._download_webpage(
243 self._LOGIN_URL, None, 'Downloading signin page')
244
245 login_form = self._hidden_inputs(login_page)
246
247 login_form.update({
248 'username': username,
249 'password': password,
250 })
251
252 post_url = urljoin(self._LOGIN_URL, self._search_regex(
253 r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
254 'post url', default=self._LOGIN_URL, group='url'))
255
256 response, urlh = self._download_webpage_handle(
257 post_url, None, 'Logging in', data=urlencode_postdata(login_form),
258 headers={'Referer': self._LOGIN_URL})
259
260 if self._LOGIN_URL in urlh.geturl():
261 error = clean_html(get_element_by_class('form-message', response))
262 if error:
263 raise ExtractorError(
264 'Unable to login: %s' % error, expected=True)
265 raise ExtractorError('Unable to log in')
266
267 def _real_initialize(self):
268 self._login()
269
d12a1a47
S
class MediaSelectionError(Exception):
    """Raised when a media selection document contains an error element.

    The error identifier passed to the constructor is kept on ``id``.
    """

    def __init__(self, id):
        self.id = id
273
2e3fd9ec
S
274 def _extract_asx_playlist(self, connection, programme_id):
275 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
276 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
277
2e3fd9ec 278 def _extract_items(self, playlist):
e6174ee9
S
279 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
280
281 def _findall_ns(self, element, xpath):
282 elements = []
283 for ns in self._NAMESPACES:
284 elements.extend(element.findall(xpath % ns))
285 return elements
2e3fd9ec
S
286
287 def _extract_medias(self, media_selection):
e6174ee9
S
288 error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
289 if error is None:
290 media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
c056efa2 291 if error is not None:
d12a1a47 292 raise BBCCoUkIE.MediaSelectionError(error.get('id'))
e6174ee9 293 return self._findall_ns(media_selection, './{%s}media')
2e3fd9ec
S
294
295 def _extract_connections(self, media):
e6174ee9 296 return self._findall_ns(media, './{%s}connection')
2e3fd9ec 297
f13b1e7d 298 def _get_subtitles(self, media, programme_id):
2e3fd9ec
S
299 subtitles = {}
300 for connection in self._extract_connections(media):
301 captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
302 lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
f13b1e7d
JMF
303 subtitles[lang] = [
304 {
305 'url': connection.get('href'),
306 'ext': 'ttml',
307 },
f13b1e7d 308 ]
2e3fd9ec 309 return subtitles
082c6c86 310
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Raise an expected ExtractorError naming the media selection error id."""
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
315
c056efa2 316 def _download_media_selector(self, programme_id):
d12a1a47
S
317 last_exception = None
318 for mediaselector_url in self._MEDIASELECTOR_URLS:
319 try:
320 return self._download_media_selector_url(
321 mediaselector_url % programme_id, programme_id)
322 except BBCCoUkIE.MediaSelectionError as e:
d781e293 323 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
d12a1a47
S
324 last_exception = e
325 continue
326 self._raise_extractor_error(e)
327 self._raise_extractor_error(last_exception)
9afa1770
S
328
329 def _download_media_selector_url(self, url, programme_id=None):
c056efa2
S
330 try:
331 media_selection = self._download_xml(
9afa1770 332 url, programme_id, 'Downloading media selection XML')
c056efa2 333 except ExtractorError as ee:
d781e293 334 if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404):
36e6f62c 335 media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))
2e3fd9ec 336 else:
c056efa2 337 raise
9afa1770 338 return self._process_media_selector(media_selection, programme_id)
082c6c86 339
9afa1770 340 def _process_media_selector(self, media_selection, programme_id):
082c6c86 341 formats = []
2e3fd9ec 342 subtitles = None
b0af1215 343 urls = []
2e3fd9ec 344
c056efa2
S
345 for media in self._extract_medias(media_selection):
346 kind = media.get('kind')
a7e5f274
RA
347 if kind in ('video', 'audio'):
348 bitrate = int_or_none(media.get('bitrate'))
349 encoding = media.get('encoding')
350 service = media.get('service')
351 width = int_or_none(media.get('width'))
352 height = int_or_none(media.get('height'))
353 file_size = int_or_none(media.get('media_file_size'))
354 for connection in self._extract_connections(media):
b0af1215
RA
355 href = connection.get('href')
356 if href in urls:
357 continue
358 if href:
359 urls.append(href)
a7e5f274
RA
360 conn_kind = connection.get('kind')
361 protocol = connection.get('protocol')
362 supplier = connection.get('supplier')
a7e5f274
RA
363 transfer_format = connection.get('transferFormat')
364 format_id = supplier or conn_kind or protocol
365 if service:
366 format_id = '%s_%s' % (service, format_id)
367 # ASX playlist
368 if supplier == 'asx':
369 for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
370 formats.append({
371 'url': ref,
372 'format_id': 'ref%s_%s' % (i, format_id),
373 })
374 elif transfer_format == 'dash':
375 formats.extend(self._extract_mpd_formats(
376 href, programme_id, mpd_id=format_id, fatal=False))
377 elif transfer_format == 'hls':
378 formats.extend(self._extract_m3u8_formats(
379 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
380 m3u8_id=format_id, fatal=False))
97eb9bd2
RA
381 if re.search(self._USP_RE, href):
382 usp_formats = self._extract_m3u8_formats(
383 re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href),
384 programme_id, ext='mp4', entry_protocol='m3u8_native',
385 m3u8_id=format_id, fatal=False)
386 for f in usp_formats:
387 if f.get('height') and f['height'] > 720:
388 continue
389 formats.append(f)
a7e5f274
RA
390 elif transfer_format == 'hds':
391 formats.extend(self._extract_f4m_formats(
392 href, programme_id, f4m_id=format_id, fatal=False))
393 else:
f9622868 394 if not service and not supplier and bitrate:
aaa42cf0 395 format_id += '-%d' % bitrate
a7e5f274
RA
396 fmt = {
397 'format_id': format_id,
398 'filesize': file_size,
399 }
400 if kind == 'video':
401 fmt.update({
402 'width': width,
403 'height': height,
6240925b 404 'tbr': bitrate,
a7e5f274
RA
405 'vcodec': encoding,
406 })
407 else:
408 fmt.update({
409 'abr': bitrate,
410 'acodec': encoding,
411 'vcodec': 'none',
412 })
1af959ef 413 if protocol in ('http', 'https'):
a7e5f274
RA
414 # Direct link
415 fmt.update({
416 'url': href,
417 })
418 elif protocol == 'rtmp':
419 application = connection.get('application', 'ondemand')
420 auth_string = connection.get('authString')
421 identifier = connection.get('identifier')
422 server = connection.get('server')
423 fmt.update({
424 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
425 'play_path': identifier,
426 'app': '%s?%s' % (application, auth_string),
427 'page_url': 'http://www.bbc.co.uk',
428 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
429 'rtmp_live': False,
430 'ext': 'flv',
431 })
964744af
S
432 else:
433 continue
a7e5f274 434 formats.append(fmt)
c056efa2 435 elif kind == 'captions':
f13b1e7d 436 subtitles = self.extract_subtitles(media, programme_id)
c056efa2 437 return formats, subtitles
2e3fd9ec 438
ae6986fb
S
439 def _download_playlist(self, playlist_id):
440 try:
441 playlist = self._download_json(
442 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
443 playlist_id, 'Downloading playlist JSON')
444
445 version = playlist.get('defaultAvailableVersion')
446 if version:
447 smp_config = version['smpConfig']
448 title = smp_config['title']
449 description = smp_config['summary']
450 for item in smp_config['items']:
451 kind = item['kind']
40fcba5e 452 if kind not in ('programme', 'radioProgramme'):
ae6986fb
S
453 continue
454 programme_id = item.get('vpid')
d97f5cd7 455 duration = int_or_none(item.get('duration'))
ae6986fb
S
456 formats, subtitles = self._download_media_selector(programme_id)
457 return programme_id, title, description, duration, formats, subtitles
458 except ExtractorError as ee:
f813928e 459 if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
ae6986fb
S
460 raise
461
462 # fallback to legacy playlist
9afa1770
S
463 return self._process_legacy_playlist(playlist_id)
464
465 def _process_legacy_playlist_url(self, url, display_id):
466 playlist = self._download_legacy_playlist_url(url, display_id)
467 return self._extract_from_legacy_playlist(playlist, display_id)
468
469 def _process_legacy_playlist(self, playlist_id):
470 return self._process_legacy_playlist_url(
471 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
472
473 def _download_legacy_playlist_url(self, url, playlist_id=None):
474 return self._download_xml(
475 url, playlist_id, 'Downloading legacy playlist XML')
ae6986fb 476
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme data from a legacy (EMP) playlist document.

    Returns ``(programme_id, title, description, duration, formats,
    subtitles)`` built from the ``programme`` / ``radioProgramme`` items.
    Raises an expected ExtractorError when the playlist carries a
    ``noItems`` element.
    """
    emp_ns = self._EMP_PLAYLIST_NS

    no_items = playlist.find('./{%s}noItems' % emp_ns)
    if no_items is not None:
        reason = no_items.get('reason')
        known_reasons = {
            'preAvailability': 'Episode %s is not yet available',
            'postAvailability': 'Episode %s is no longer available',
            'noMedia': 'Episode %s is not currently available',
        }
        if reason in known_reasons:
            msg = known_reasons[reason] % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def id_from_attributes(element):
        # First of ``identifier`` / ``group`` that looks like a programme id.
        for attr in ('identifier', 'group'):
            value = element.get(attr)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value
        return None

    def get_programme_id(item):
        # NOTE(review): the previous revision also ran the attribute lookup
        # on the item itself but discarded the result, so only the
        # ``mediator`` child has ever been consulted.  That behaviour is
        # kept here; confirm whether the item's own attributes were meant
        # to take precedence.
        mediator = item.find('./{%s}mediator' % emp_ns)
        if mediator is not None:
            return id_from_attributes(mediator)
        return None

    for item in self._extract_items(playlist):
        if item.get('kind') not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % emp_ns).text
        summary_el = playlist.find('./{%s}summary' % emp_ns)
        description = summary_el.text if summary_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No usable id: treat the item itself as the media selection.
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

    # NOTE(review): these names are only bound inside the loop above, so a
    # playlist without any programme item fails here - same as before.
    return programme_id, title, description, duration, formats, subtitles
520
c056efa2
S
521 def _real_extract(self, url):
522 group_id = self._match_id(url)
523
524 webpage = self._download_webpage(url, group_id, 'Downloading video page')
525
8683b4d8 526 programme_id = None
679bacf0 527 duration = None
8683b4d8
S
528
529 tviplayer = self._search_regex(
530 r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
531 webpage, 'player', default=None)
532
533 if tviplayer:
534 player = self._parse_json(tviplayer, group_id).get('player', {})
535 duration = int_or_none(player.get('duration'))
536 programme_id = player.get('vpid')
537
538 if not programme_id:
539 programme_id = self._search_regex(
22d7368d 540 r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
8683b4d8 541
c056efa2 542 if programme_id:
c056efa2 543 formats, subtitles = self._download_media_selector(programme_id)
88fb59d9 544 title = self._og_search_title(webpage, default=None) or self._html_search_regex(
50e989e2
S
545 (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
546 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
8683b4d8 547 description = self._search_regex(
a8534274
S
548 (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
549 r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
679bacf0
YCH
550 webpage, 'description', default=None)
551 if not description:
552 description = self._html_search_meta('description', webpage)
c056efa2 553 else:
ae6986fb 554 programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
2e3fd9ec 555
082c6c86
S
556 self._sort_formats(formats)
557
558 return {
2e3fd9ec 559 'id': programme_id,
082c6c86
S
560 'title': title,
561 'description': description,
650cfd0c 562 'thumbnail': self._og_search_thumbnail(webpage, default=None),
082c6c86
S
563 'duration': duration,
564 'formats': formats,
2e3fd9ec 565 'subtitles': subtitles,
5f6a1245 566 }
10273d6e 567
568
9afa1770
S
569class BBCIE(BBCCoUkIE):
570 IE_NAME = 'bbc'
571 IE_DESC = 'BBC'
572 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 573
d12a1a47 574 _MEDIASELECTOR_URLS = [
55ebae26
S
575 # Provides HQ HLS streams but fails with geolocation in some cases when it's
576 # even not geo restricted at all
577 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
578 # Provides more formats, namely direct mp4 links, but fails on some videos with
579 # notukerror for non UK (?) users (e.g.
580 # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
581 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
582 # Provides fewer formats, but works everywhere for everybody (hopefully)
583 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
584 ]
10273d6e 585
586 _TESTS = [{
6a747190 587 # article with multiple videos embedded with data-playable containing vpids
10273d6e 588 'url': 'http://www.bbc.com/news/world-europe-32668511',
589 'info_dict': {
590 'id': 'world-europe-32668511',
591 'title': 'Russia stages massive WW2 parade despite Western boycott',
9afa1770 592 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 593 },
594 'playlist_count': 2,
a3bfddfa 595 }, {
6a747190 596 # article with multiple videos embedded with data-playable (more videos)
10273d6e 597 'url': 'http://www.bbc.com/news/business-28299555',
598 'info_dict': {
599 'id': 'business-28299555',
600 'title': 'Farnborough Airshow: Video highlights',
9afa1770 601 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 602 },
603 'playlist_count': 9,
9afa1770 604 'skip': 'Save time',
88ed52ae
S
605 }, {
606 # article with multiple videos embedded with `new SMP()`
6a747190 607 # broken
88ed52ae
S
608 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
609 'info_dict': {
610 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 611 'title': 'BUGGER',
88ed52ae
S
612 },
613 'playlist_count': 18,
a3bfddfa 614 }, {
6a747190 615 # single video embedded with data-playable containing vpid
10273d6e 616 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 617 'info_dict': {
618 'id': 'p02mprgb',
55ebae26 619 'ext': 'mp4',
10273d6e 620 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 621 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 622 'duration': 47,
9afa1770 623 'timestamp': 1427219242,
da92eeae 624 'upload_date': '20150324',
10273d6e 625 },
626 'params': {
9afa1770 627 # rtmp download
10273d6e 628 'skip_download': True,
629 }
a3bfddfa 630 }, {
6a747190
S
631 # article with single video embedded with data-playable containing XML playlist
632 # with direct video links as progressiveDownloadUrl (for now these are extracted)
633 # and playlist with f4m and m3u8 as streamingUrl
de939d89 634 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 635 'info_dict': {
9afa1770 636 'id': '150615_telabyad_kentin_cogu',
de939d89 637 'ext': 'mp4',
ad152e2d 638 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 639 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 640 'timestamp': 1434397334,
da92eeae 641 'upload_date': '20150615',
de939d89 642 },
643 'params': {
644 'skip_download': True,
645 }
c936d8cc 646 }, {
6a747190 647 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 648 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 649 'info_dict': {
9afa1770 650 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 651 'ext': 'mp4',
9afa1770 652 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 653 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 654 'timestamp': 1434713142,
da92eeae 655 'upload_date': '20150619',
de939d89 656 },
657 'params': {
658 'skip_download': True,
659 }
a346b1ff
S
660 }, {
661 # single video from video playlist embedded with vxp-playlist-data JSON
662 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
663 'info_dict': {
664 'id': 'p02w6qjc',
55ebae26 665 'ext': 'mp4',
a346b1ff
S
666 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
667 'duration': 56,
0bc4ee60 668 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
669 },
670 'params': {
671 'skip_download': True,
672 }
9afa1770
S
673 }, {
674 # single video story with digitalData
675 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
676 'info_dict': {
677 'id': 'p02q6gc4',
678 'ext': 'flv',
679 'title': 'Sri Lanka’s spicy secret',
680 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
681 'timestamp': 1437674293,
682 'upload_date': '20150723',
683 },
684 'params': {
685 # rtmp download
686 'skip_download': True,
687 }
688 }, {
689 # single video story without digitalData
690 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
691 'info_dict': {
692 'id': 'p018zqqg',
55ebae26 693 'ext': 'mp4',
9afa1770
S
694 'title': 'Hyundai Santa Fe Sport: Rock star',
695 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
696 'timestamp': 1415867444,
697 'upload_date': '20141113',
9afa1770
S
698 },
699 'params': {
700 # rtmp download
701 'skip_download': True,
702 }
9fb64c04
S
703 }, {
704 # single video embedded with Morph
705 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
706 'info_dict': {
707 'id': 'p041vhd0',
708 'ext': 'mp4',
709 'title': "Nigeria v Japan - Men's First Round",
710 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
711 'duration': 7980,
712 'uploader': 'BBC Sport',
713 'uploader_id': 'bbc_sport',
714 },
715 'params': {
716 # m3u8 download
717 'skip_download': True,
9fb64c04
S
718 },
719 'skip': 'Georestricted to UK',
9afa1770 720 }, {
6a747190 721 # single video with playlist.sxml URL in playlist param
9afa1770
S
722 'url': 'http://www.bbc.com/sport/0/football/33653409',
723 'info_dict': {
724 'id': 'p02xycnp',
55ebae26 725 'ext': 'mp4',
9afa1770 726 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 727 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
728 'duration': 140,
729 },
730 'params': {
731 # rtmp download
732 'skip_download': True,
733 }
b5d48cb1 734 }, {
6a747190 735 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
736 'url': 'http://www.bbc.com/sport/0/football/34475836',
737 'info_dict': {
738 'id': '34475836',
450b233c 739 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 740 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
741 },
742 'playlist_count': 3,
450b233c
S
743 }, {
744 # school report article with single video
745 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
746 'info_dict': {
747 'id': '35744779',
748 'title': 'School which breaks down barriers in Jerusalem',
749 },
750 'playlist_count': 1,
9afa1770
S
751 }, {
752 # single video with playlist URL from weather section
753 'url': 'http://www.bbc.com/weather/features/33601775',
754 'only_matching': True,
755 }, {
756 # custom redirection to www.bbc.com
757 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
758 'only_matching': True,
a1cf3e38
S
759 }, {
760 # single video article embedded with data-media-vpid
761 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
762 'only_matching': True,
10273d6e 763 }]
764
9afa1770
S
@classmethod
def suitable(cls, url):
    """Accept *url* only when none of the more specific BBC extractors does."""
    more_specific = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
    for ie in more_specific:
        if ie.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
770
771 def _extract_from_media_meta(self, media_meta, video_id):
772 # Direct links to media in media metadata (e.g.
773 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
774 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
775 source_files = media_meta.get('sourceFiles')
776 if source_files:
777 return [{
778 'url': f['url'],
779 'format_id': format_id,
780 'ext': f.get('encoding'),
781 'tbr': float_or_none(f.get('bitrate'), 1000),
782 'filesize': int_or_none(f.get('filesize')),
783 } for format_id, f in source_files.items() if f.get('url')], []
784
785 programme_id = media_meta.get('externalId')
786 if programme_id:
787 return self._download_media_selector(programme_id)
788
789 # Process playlist.sxml as legacy playlist
790 href = media_meta.get('href')
791 if href:
792 playlist = self._download_legacy_playlist_url(href)
793 _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
794 return formats, subtitles
795
796 return [], []
797
baf39a1a
S
798 def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
799 programme_id, title, description, duration, formats, subtitles = \
800 self._process_legacy_playlist_url(url, playlist_id)
801 self._sort_formats(formats)
802 return {
803 'id': programme_id,
804 'title': title,
805 'description': description,
806 'duration': duration,
807 'timestamp': timestamp,
808 'formats': formats,
809 'subtitles': subtitles,
810 }
811
def _real_extract(self, url):
    """Extract a single video or a playlist of videos from a BBC page.

    The downloaded page is probed with several embed-detection strategies
    in a fixed order; the first one that yields media determines the result:

    1. ``playlist.sxml`` references and ``data-playable`` attributes
       (collected together into one playlist);
    2. a vpid found directly in the markup (single video);
    3. the lead media of a ``Morph.setPayload`` call (single video);
    4. ``new SMP(...)`` / ``setPlaylist(...)`` embed URLs (playlist of URL
       results handed to the ``BBCCoUk`` extractor);
    5. ``data-media-meta``, ``mediaAssetPage.init`` or
       ``vxp-playlist-data`` JSON (playlist).

    :param url: URL of the page to extract from.
    :returns: an info dict for a single video, or a playlist result.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title')
    if not playlist_title:
        playlist_title = self._og_search_title(
            webpage, default=None) or self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
        if playlist_title:
            # Drop the trailing "- BBC ..." site suffix from the page title
            playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    for _, data_playable_json in re.findall(
            r'data-playable=(["\'])({.+?})\1', webpage):
        data_playable = self._parse_json(
            unescapeHTML(data_playable_json), playlist_id, fatal=False)
        if not data_playable:
            continue
        settings = data_playable.get('settings', {})
        if not settings:
            continue
        # data-playable with video vpid in settings.playlistObject.items (e.g.
        # http://www.bbc.com/news/world-us-canada-34473351)
        playlist_object = settings.get('playlistObject', {})
        if playlist_object:
            items = playlist_object.get('items')
            if items and isinstance(items, list):
                title = playlist_object['title']
                description = playlist_object.get('summary')
                # Only the first item of the playlist object is used
                duration = int_or_none(items[0].get('duration'))
                programme_id = items[0].get('vpid')
                formats, subtitles = self._download_media_selector(programme_id)
                self._sort_formats(formats)
                entries.append({
                    'id': programme_id,
                    'title': title,
                    'description': description,
                    'timestamp': timestamp,
                    'duration': duration,
                    'formats': formats,
                    'subtitles': subtitles,
                })
            continue

        # data-playable without vpid but with playlist.sxml URLs
        # in otherSettings.playlist (e.g.
        # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
        playlist = data_playable.get('otherSettings', {}).get('playlist', {})
        if not playlist:
            continue
        entry = None
        for key in ('streaming', 'progressiveDownload'):
            playlist_url = playlist.get('%sUrl' % key)
            if not playlist_url:
                continue
            try:
                info = self._extract_from_playlist_sxml(
                    playlist_url, playlist_id, timestamp)
            except ExtractorError as e:
                # Some playlist URLs may fail with 500 while the other one
                # works fine (e.g.
                # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu).
                # Only ExtractorError is caught: other exception types carry
                # no `cause` attribute and must propagate untouched.
                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
                    continue
                raise
            if not entry:
                entry = info
            else:
                # Merge the second variant into the first entry
                entry['title'] = info['title']
                entry['formats'].extend(info['formats'])
        if entry:
            self._sort_formats(entry['formats'])
            entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        self._sort_formats(formats)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # or unparseable; the non-fatal parse then yields a falsy value, so
        # fall back to an empty dict instead of dereferencing None.
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = digital_data.get('page', {}).get('pageInfo', {})
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # Several setPayload calls may be present, but the video
    # seems to always be related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            # The first component with playable lead media wins
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    def extract_all(pattern):
        # Parse every regex match as JSON, silently dropping the ones that
        # fail to parse or parse to a falsy value
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # Non-fatal parse may yield a falsy value; treat it as
            # "no videos" so the next strategy gets a chance
            media_asset_page = self._parse_json(
                media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in media_asset_page.get('videos', {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats:
            continue
        self._sort_formats(formats)

        # Synthesize an id / title from the playlist when the media has none;
        # the running number is only appended for multi-video pages
        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1081
1082
class BBCCoUkArticleIE(InfoExtractor):
    """Extractor for ``bbc.co.uk/programmes/articles/<id>`` pages.

    Every clip embedded in the article is returned as a URL result inside
    a single playlist named after the article.
    """
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist of the clips referenced by the article at *url*."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # The description lookup can come back empty (None) when the page
        # lacks an og:description tag, so only strip an actual string.
        description = self._og_search_description(webpage)
        if description:
            description = description.strip()

        # Each clip is marked up as <div typeof="Clip" resource="<clip url>">
        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1111
1112
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Common logic for paginated bbc.co.uk playlist pages.

    Concrete subclasses supply:

    * ``_VIDEO_ID_TEMPLATE`` - regex template with one ``%s`` slot that is
      filled with ``BBCCoUkIE._ID_REGEX`` to locate video ids on a page;
    * ``_URL_TEMPLATE`` - URL template with one ``%s`` slot for a video id;
    * ``_extract_title_and_description(webpage)`` - returns a
      ``(title, description)`` pair.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a URL result for every video id found on the playlist pages.

        When *url* carries an explicit ``page`` query parameter only that
        page is processed; otherwise the "next" pagination links are
        followed until none is left.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # Same pattern for every page, so build it once
        video_id_re = self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX
        # The first page is already downloaded, hence numbering starts at 2
        for page_num in itertools.count(2):
            for video_id in re.findall(video_id_re, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # Third/fourth positional arguments are the progress note and the
            # error note; both are human-readable messages.
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                'Downloading page %d' % page_num,
                'Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Return a playlist result whose entries are produced lazily."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
ded7511a
S
1143
1144
class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for iPlayer ``episodes`` and ``group`` pages."""

    IE_NAME = 'bbc.co.uk:iplayer:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
    _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
    _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
    _TESTS = [
        {
            'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
            'info_dict': {
                'id': 'b05rcz9v',
                'title': 'The Disappearance',
                'description': 'French thriller serial about a missing teenager.',
            },
            'playlist_mincount': 6,
            'skip': 'This programme is not currently available on BBC iPlayer',
        },
        {
            # Available for over a year unlike 30 days for most other programmes
            'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
            'info_dict': {
                'id': 'p02tcc32',
                'title': 'Bohemian Icons',
                'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
            },
            'playlist_mincount': 10,
        },
    ]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` scraped from the page markup.

        The title is the text of the first ``<h1>``; the description is the
        text of the ``<p class="subtitle">`` element. Both lookups are
        non-fatal.
        """
        heading = self._search_regex(
            r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
        subtitle = self._search_regex(
            r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
            webpage, 'description', group='value', fatal=False)
        return heading, subtitle
1176
1177
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for programme ``episodes``/``broadcasts``/``clips`` pages."""

    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [
        {
            'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
            'info_dict': {
                'id': 'b05rcz9v',
                'title': 'The Disappearance - Clips - BBC Four',
                'description': 'French thriller serial about a missing teenager.',
            },
            'playlist_mincount': 7,
        },
        {
            # multipage playlist, explicit page
            'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
            'info_dict': {
                'id': 'b00mfl7n',
                'title': 'Frozen Planet - Clips - BBC One',
                'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
            },
            'playlist_mincount': 24,
        },
        {
            # multipage playlist, all pages
            'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
            'info_dict': {
                'id': 'b00mfl7n',
                'title': 'Frozen Planet - Clips - BBC One',
                'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
            },
            'playlist_mincount': 142,
        },
        {
            'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
            'only_matching': True,
        },
        {
            'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
            'only_matching': True,
        },
        {
            'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
            'only_matching': True,
        },
    ]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` taken from the Open Graph tags.

        The title lookup is explicitly non-fatal; the description lookup
        uses the helper's defaults.
        """
        return (
            self._og_search_title(webpage, fatal=False),
            self._og_search_description(webpage),
        )