]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/bbc.py
Merge pull request #14048 from ryandesign/patch-1
[yt-dlp.git] / youtube_dl / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
9afa1770 4import re
254e64a2 5import itertools
082c6c86 6
f13b1e7d 7from .common import InfoExtractor
8683b4d8 8from ..utils import (
97067db2 9 clean_html,
9fb64c04 10 dict_get,
8683b4d8 11 ExtractorError,
9afa1770 12 float_or_none,
97067db2 13 get_element_by_class,
8683b4d8 14 int_or_none,
9afa1770
S
15 parse_duration,
16 parse_iso8601,
9fb64c04 17 try_get,
dab062fb 18 unescapeHTML,
97067db2
S
19 urlencode_postdata,
20 urljoin,
8683b4d8 21)
36e6f62c
JMF
22from ..compat import (
23 compat_etree_fromstring,
24 compat_HTTPError,
254e64a2 25 compat_urlparse,
36e6f62c 26)
082c6c86 27
d12a1a47 28
f13b1e7d 29class BBCCoUkIE(InfoExtractor):
082c6c86 30 IE_NAME = 'bbc.co.uk'
2e3fd9ec 31 IE_DESC = 'BBC iPlayer'
22d7368d 32 _ID_REGEX = r'[pb][\da-z]{7}'
f20a11ed
S
33 _VALID_URL = r'''(?x)
34 https?://
35 (?:www\.)?bbc\.co\.uk/
36 (?:
37 programmes/(?!articles/)|
38 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
72d256c4 39 music/(?:clips|audiovideo/popular)[/#]|
d3d45e0a
S
40 radio/player/|
41 events/[^/]+/play/[^/]+/
f20a11ed 42 )
ded7511a 43 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
f20a11ed 44 ''' % _ID_REGEX
082c6c86 45
97067db2
S
46 _LOGIN_URL = 'https://account.bbc.com/signin'
47 _NETRC_MACHINE = 'bbc'
48
d12a1a47 49 _MEDIASELECTOR_URLS = [
26ccc68b
S
50 # Provides HQ HLS streams with even better quality than pc mediaset but fails
51 # with geolocation in some cases even when it's not geo restricted at all (e.g.
d781e293 52 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
d1c694ea 53 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
54 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
55 ]
a8b081a0 56
e6174ee9
S
57 _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
58 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
59
60 _NAMESPACES = (
61 _MEDIASELECTION_NS,
62 _EMP_PLAYLIST_NS,
63 )
64
2e3fd9ec
S
65 _TESTS = [
66 {
f2d0fc68 67 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 68 'info_dict': {
f2d0fc68 69 'id': 'b039d07m',
b1ea6802 70 'ext': 'flv',
679bacf0 71 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4',
c4914185 72 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
73 },
74 'params': {
b1ea6802 75 # rtmp download
2e3fd9ec
S
76 'skip_download': True,
77 }
082c6c86 78 },
2e3fd9ec
S
79 {
80 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
81 'info_dict': {
82 'id': 'b00yng1d',
83 'ext': 'flv',
84 'title': 'The Man in Black: Series 3: The Printed Name',
85 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
86 'duration': 1800,
87 },
88 'params': {
89 # rtmp download
90 'skip_download': True,
c7f0177f
S
91 },
92 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
93 },
94 {
95 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
96 'info_dict': {
97 'id': 'b00yng1d',
98 'ext': 'flv',
17968e44 99 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 100 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 101 'duration': 5100,
2e3fd9ec
S
102 },
103 'params': {
104 # rtmp download
105 'skip_download': True,
106 },
b1ea6802 107 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
108 },
109 {
110 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
111 'info_dict': {
112 'id': 'b03k3pb7',
113 'ext': 'flv',
114 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
115 'description': '2. Invasion',
116 'duration': 3600,
117 },
118 'params': {
119 # rtmp download
120 'skip_download': True,
121 },
b1ea6802 122 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
123 }, {
124 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
125 'info_dict': {
126 'id': 'b04v209v',
127 'ext': 'flv',
128 'title': 'Pete Tong, The Essential New Tune Special',
129 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
130 'duration': 10800,
131 },
132 'params': {
133 # rtmp download
134 'skip_download': True,
a3ef0e1c
YCH
135 },
136 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 137 }, {
5aa535c3 138 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
139 'note': 'Audio',
140 'info_dict': {
5aa535c3 141 'id': 'p022h44j',
b1ea6802 142 'ext': 'flv',
5aa535c3
S
143 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
144 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
145 'duration': 227,
c7e67594
S
146 },
147 'params': {
b1ea6802 148 # rtmp download
c7e67594
S
149 'skip_download': True,
150 }
151 }, {
152 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
153 'note': 'Video',
154 'info_dict': {
155 'id': 'p025c103',
b1ea6802 156 'ext': 'flv',
c7e67594
S
157 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
158 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
159 'duration': 226,
160 },
161 'params': {
b1ea6802 162 # rtmp download
c7e67594
S
163 'skip_download': True,
164 }
e68ae99a
S
165 }, {
166 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
167 'info_dict': {
168 'id': 'p02n76xf',
169 'ext': 'flv',
170 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
171 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
172 'duration': 3540,
173 },
174 'params': {
175 # rtmp download
176 'skip_download': True,
177 },
b1ea6802 178 'skip': 'geolocation',
25fa8d66
YCH
179 }, {
180 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
181 'info_dict': {
182 'id': 'b05zmgw1',
183 'ext': 'flv',
184 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
185 'title': 'Royal Academy Summer Exhibition',
186 'duration': 3540,
187 },
188 'params': {
189 # rtmp download
190 'skip_download': True,
191 },
b1ea6802 192 'skip': 'geolocation',
54914380
S
193 }, {
194 # iptv-all mediaset fails with geolocation however there is no geo restriction
195 # for this programme at all
5aa535c3 196 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 197 'info_dict': {
5aa535c3 198 'id': 'b06rkms3',
54914380 199 'ext': 'flv',
5aa535c3
S
200 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
201 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
202 },
203 'params': {
204 # rtmp download
205 'skip_download': True,
206 },
b1ea6802 207 'skip': 'Now it\'s really geo-restricted',
1ac6e794
S
208 }, {
209 # compact player (https://github.com/rg3/youtube-dl/issues/8147)
210 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
211 'info_dict': {
212 'id': 'p028bfkj',
b1ea6802 213 'ext': 'flv',
1ac6e794
S
214 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
215 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
216 },
217 'params': {
b1ea6802 218 # rtmp download
1ac6e794
S
219 'skip_download': True,
220 },
31763975
S
221 }, {
222 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
223 'only_matching': True,
c7e67594
S
224 }, {
225 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
226 'only_matching': True,
0692ef86
S
227 }, {
228 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
229 'only_matching': True,
f20a11ed
S
230 }, {
231 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
232 'only_matching': True,
72d256c4
S
233 }, {
234 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
235 'only_matching': True,
236 }]
2e3fd9ec 237
97eb9bd2
RA
238 _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
239
97067db2
S
240 def _login(self):
241 username, password = self._get_login_info()
242 if username is None:
243 return
244
245 login_page = self._download_webpage(
246 self._LOGIN_URL, None, 'Downloading signin page')
247
248 login_form = self._hidden_inputs(login_page)
249
250 login_form.update({
251 'username': username,
252 'password': password,
253 })
254
255 post_url = urljoin(self._LOGIN_URL, self._search_regex(
256 r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
257 'post url', default=self._LOGIN_URL, group='url'))
258
259 response, urlh = self._download_webpage_handle(
260 post_url, None, 'Logging in', data=urlencode_postdata(login_form),
261 headers={'Referer': self._LOGIN_URL})
262
263 if self._LOGIN_URL in urlh.geturl():
264 error = clean_html(get_element_by_class('form-message', response))
265 if error:
266 raise ExtractorError(
267 'Unable to login: %s' % error, expected=True)
268 raise ExtractorError('Unable to log in')
269
270 def _real_initialize(self):
271 self._login()
272
d12a1a47
S
class MediaSelectionError(Exception):
    """Error reported inside a media selection document.

    The error identifier taken from the document is kept on ``id``.
    """

    def __init__(self, id):
        # Identifier of the reported error (e.g. 'geolocation').
        self.id = id
276
2e3fd9ec
S
277 def _extract_asx_playlist(self, connection, programme_id):
278 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
279 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
280
2e3fd9ec 281 def _extract_items(self, playlist):
e6174ee9
S
282 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
283
284 def _findall_ns(self, element, xpath):
285 elements = []
286 for ns in self._NAMESPACES:
287 elements.extend(element.findall(xpath % ns))
288 return elements
2e3fd9ec
S
289
290 def _extract_medias(self, media_selection):
e6174ee9
S
291 error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
292 if error is None:
293 media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
c056efa2 294 if error is not None:
d12a1a47 295 raise BBCCoUkIE.MediaSelectionError(error.get('id'))
e6174ee9 296 return self._findall_ns(media_selection, './{%s}media')
2e3fd9ec
S
297
298 def _extract_connections(self, media):
e6174ee9 299 return self._findall_ns(media, './{%s}connection')
2e3fd9ec 300
f13b1e7d 301 def _get_subtitles(self, media, programme_id):
2e3fd9ec
S
302 subtitles = {}
303 for connection in self._extract_connections(media):
304 captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
305 lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
f13b1e7d
JMF
306 subtitles[lang] = [
307 {
308 'url': connection.get('href'),
309 'ext': 'ttml',
310 },
f13b1e7d 311 ]
2e3fd9ec 312 return subtitles
082c6c86 313
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Re-raise a media selection error as an expected ExtractorError."""
    message = '%s returned error: %s' % (
        self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
318
c056efa2 319 def _download_media_selector(self, programme_id):
d12a1a47
S
320 last_exception = None
321 for mediaselector_url in self._MEDIASELECTOR_URLS:
322 try:
323 return self._download_media_selector_url(
324 mediaselector_url % programme_id, programme_id)
325 except BBCCoUkIE.MediaSelectionError as e:
d781e293 326 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
d12a1a47
S
327 last_exception = e
328 continue
329 self._raise_extractor_error(e)
330 self._raise_extractor_error(last_exception)
9afa1770
S
331
332 def _download_media_selector_url(self, url, programme_id=None):
c056efa2
S
333 try:
334 media_selection = self._download_xml(
9afa1770 335 url, programme_id, 'Downloading media selection XML')
c056efa2 336 except ExtractorError as ee:
d781e293 337 if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404):
36e6f62c 338 media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))
2e3fd9ec 339 else:
c056efa2 340 raise
9afa1770 341 return self._process_media_selector(media_selection, programme_id)
082c6c86 342
def _process_media_selector(self, media_selection, programme_id):
    """Convert a media selection document into (formats, subtitles).

    Video and audio media contribute formats, one or more per connection:
    ASX playlists, DASH/HLS/HDS manifests, and direct http(s)/rtmp links.
    Connections whose href was already seen are skipped. A captions media
    sets the subtitles; subtitles stay None when there is none.

    Manifest connections (dash/hls/hds) without an href are now skipped:
    previously a missing href reached ``re.search`` or the manifest
    extractors as None and crashed the whole extraction.

    Raises MediaSelectionError (via _extract_medias) when the document
    reports an error.
    """
    formats = []
    subtitles = None
    urls = []  # hrefs already processed, to drop duplicate connections

    for media in self._extract_medias(media_selection):
        kind = media.get('kind')
        if kind == 'captions':
            subtitles = self.extract_subtitles(media, programme_id)
            continue
        if kind not in ('video', 'audio'):
            continue

        bitrate = int_or_none(media.get('bitrate'))
        encoding = media.get('encoding')
        service = media.get('service')
        width = int_or_none(media.get('width'))
        height = int_or_none(media.get('height'))
        file_size = int_or_none(media.get('media_file_size'))

        for connection in self._extract_connections(media):
            href = connection.get('href')
            if href in urls:
                continue
            if href:
                urls.append(href)
            conn_kind = connection.get('kind')
            protocol = connection.get('protocol')
            supplier = connection.get('supplier')
            transfer_format = connection.get('transferFormat')
            format_id = supplier or conn_kind or protocol
            if service:
                format_id = '%s_%s' % (service, format_id)

            if supplier == 'asx':
                # ASX playlist: one format per referenced URL
                asx_refs = self._extract_asx_playlist(connection, programme_id)
                for i, ref in enumerate(asx_refs):
                    formats.append({
                        'url': ref,
                        'format_id': 'ref%s_%s' % (i, format_id),
                    })
            elif transfer_format in ('dash', 'hls', 'hds'):
                if not href:
                    # Nothing to download a manifest from.
                    continue
                if transfer_format == 'dash':
                    formats.extend(self._extract_mpd_formats(
                        href, programme_id, mpd_id=format_id, fatal=False))
                elif transfer_format == 'hls':
                    formats.extend(self._extract_m3u8_formats(
                        href, programme_id, ext='mp4',
                        entry_protocol='m3u8_native',
                        m3u8_id=format_id, fatal=False))
                    if re.search(self._USP_RE, href):
                        # Also try the rewritten .ism/.m3u8 URL and keep
                        # only its variants that are not taller than 720.
                        usp_formats = self._extract_m3u8_formats(
                            re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href),
                            programme_id, ext='mp4',
                            entry_protocol='m3u8_native',
                            m3u8_id=format_id, fatal=False)
                        formats.extend(
                            f for f in usp_formats
                            if not (f.get('height') and f['height'] > 720))
                else:
                    formats.extend(self._extract_f4m_formats(
                        href, programme_id, f4m_id=format_id, fatal=False))
            else:
                if not service and not supplier and bitrate:
                    format_id += '-%d' % bitrate
                fmt = {
                    'format_id': format_id,
                    'filesize': file_size,
                }
                if kind == 'video':
                    fmt.update({
                        'width': width,
                        'height': height,
                        'tbr': bitrate,
                        'vcodec': encoding,
                    })
                else:
                    fmt.update({
                        'abr': bitrate,
                        'acodec': encoding,
                        'vcodec': 'none',
                    })
                if protocol in ('http', 'https'):
                    # Direct link
                    fmt['url'] = href
                elif protocol == 'rtmp':
                    application = connection.get('application', 'ondemand')
                    auth_string = connection.get('authString')
                    identifier = connection.get('identifier')
                    server = connection.get('server')
                    fmt.update({
                        'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                        'play_path': identifier,
                        'app': '%s?%s' % (application, auth_string),
                        'page_url': 'http://www.bbc.co.uk',
                        'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                        'rtmp_live': False,
                        'ext': 'flv',
                    })
                else:
                    # Unsupported protocol: no format for this connection.
                    continue
                formats.append(fmt)
    return formats, subtitles
2e3fd9ec 441
ae6986fb
S
442 def _download_playlist(self, playlist_id):
443 try:
444 playlist = self._download_json(
445 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
446 playlist_id, 'Downloading playlist JSON')
447
448 version = playlist.get('defaultAvailableVersion')
449 if version:
450 smp_config = version['smpConfig']
451 title = smp_config['title']
452 description = smp_config['summary']
453 for item in smp_config['items']:
454 kind = item['kind']
40fcba5e 455 if kind not in ('programme', 'radioProgramme'):
ae6986fb
S
456 continue
457 programme_id = item.get('vpid')
d97f5cd7 458 duration = int_or_none(item.get('duration'))
ae6986fb
S
459 formats, subtitles = self._download_media_selector(programme_id)
460 return programme_id, title, description, duration, formats, subtitles
461 except ExtractorError as ee:
f813928e 462 if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
ae6986fb
S
463 raise
464
465 # fallback to legacy playlist
9afa1770
S
466 return self._process_legacy_playlist(playlist_id)
467
468 def _process_legacy_playlist_url(self, url, display_id):
469 playlist = self._download_legacy_playlist_url(url, display_id)
470 return self._extract_from_legacy_playlist(playlist, display_id)
471
472 def _process_legacy_playlist(self, playlist_id):
473 return self._process_legacy_playlist_url(
474 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
475
476 def _download_legacy_playlist_url(self, url, playlist_id=None):
477 return self._download_xml(
478 url, playlist_id, 'Downloading legacy playlist XML')
ae6986fb 479
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme data from a parsed legacy (EMP) playlist.

    Returns the tuple (programme_id, title, description, duration,
    formats, subtitles) for the last programme/radioProgramme item.

    Raises an expected ExtractorError when the playlist carries a
    ``noItems`` element, and an ExtractorError when it has no programme
    item at all (this used to surface as an UnboundLocalError).

    The programme id is taken from the item's own ``identifier``/``group``
    attributes first and from its ``mediator`` child otherwise. The
    item-level lookup used to be performed but its result was discarded.
    When no id is found, the item itself is processed as a media
    selection and playlist_id is used as the programme id.
    """
    emp_ns = self._EMP_PLAYLIST_NS

    no_items = playlist.find('./{%s}noItems' % emp_ns)
    if no_items is not None:
        reason = no_items.get('reason')
        known_reasons = {
            'preAvailability': 'Episode %s is not yet available',
            'postAvailability': 'Episode %s is no longer available',
            'noMedia': 'Episode %s is not currently available',
        }
        template = known_reasons.get(reason)
        if template:
            msg = template % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First attribute whose value looks like a programme id, if any.
        for attr in ('identifier', 'group'):
            value = element.get(attr)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value

    def get_programme_id(item):
        programme_id = get_from_attributes(item)
        if programme_id:
            return programme_id
        mediator = item.find('./{%s}mediator' % emp_ns)
        if mediator is not None:
            return get_from_attributes(mediator)

    result = None
    for item in self._extract_items(playlist):
        if item.get('kind') not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % emp_ns).text
        description_el = playlist.find('./{%s}summary' % emp_ns)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No usable id: the item is processed as the media selection.
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

        result = (programme_id, title, description, duration, formats, subtitles)

    if result is None:
        raise ExtractorError(
            'No playable items found in playlist %s' % playlist_id)
    return result
523
c056efa2
S
524 def _real_extract(self, url):
525 group_id = self._match_id(url)
526
527 webpage = self._download_webpage(url, group_id, 'Downloading video page')
528
b2ed954f
S
529 error = self._search_regex(
530 r'<div\b[^>]+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<',
531 webpage, 'error', default=None)
532 if error:
533 raise ExtractorError(error, expected=True)
534
8683b4d8 535 programme_id = None
679bacf0 536 duration = None
8683b4d8
S
537
538 tviplayer = self._search_regex(
539 r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
540 webpage, 'player', default=None)
541
542 if tviplayer:
543 player = self._parse_json(tviplayer, group_id).get('player', {})
544 duration = int_or_none(player.get('duration'))
545 programme_id = player.get('vpid')
546
547 if not programme_id:
548 programme_id = self._search_regex(
22d7368d 549 r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
8683b4d8 550
c056efa2 551 if programme_id:
c056efa2 552 formats, subtitles = self._download_media_selector(programme_id)
88fb59d9 553 title = self._og_search_title(webpage, default=None) or self._html_search_regex(
50e989e2
S
554 (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
555 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
8683b4d8 556 description = self._search_regex(
a8534274
S
557 (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
558 r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
679bacf0
YCH
559 webpage, 'description', default=None)
560 if not description:
561 description = self._html_search_meta('description', webpage)
c056efa2 562 else:
ae6986fb 563 programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
2e3fd9ec 564
082c6c86
S
565 self._sort_formats(formats)
566
567 return {
2e3fd9ec 568 'id': programme_id,
082c6c86
S
569 'title': title,
570 'description': description,
650cfd0c 571 'thumbnail': self._og_search_thumbnail(webpage, default=None),
082c6c86
S
572 'duration': duration,
573 'formats': formats,
2e3fd9ec 574 'subtitles': subtitles,
5f6a1245 575 }
10273d6e 576
577
9afa1770
S
578class BBCIE(BBCCoUkIE):
579 IE_NAME = 'bbc'
580 IE_DESC = 'BBC'
581 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 582
d12a1a47 583 _MEDIASELECTOR_URLS = [
55ebae26
S
584 # Provides HQ HLS streams but fails with geolocation in some cases even when
585 # it's not geo restricted at all
586 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
587 # Provides more formats, namely direct mp4 links, but fails on some videos with
588 # notukerror for non UK (?) users (e.g.
589 # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
590 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
591 # Provides fewer formats, but works everywhere for everybody (hopefully)
592 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
593 ]
10273d6e 594
595 _TESTS = [{
6a747190 596 # article with multiple videos embedded with data-playable containing vpids
10273d6e 597 'url': 'http://www.bbc.com/news/world-europe-32668511',
598 'info_dict': {
599 'id': 'world-europe-32668511',
600 'title': 'Russia stages massive WW2 parade despite Western boycott',
9afa1770 601 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 602 },
603 'playlist_count': 2,
a3bfddfa 604 }, {
6a747190 605 # article with multiple videos embedded with data-playable (more videos)
10273d6e 606 'url': 'http://www.bbc.com/news/business-28299555',
607 'info_dict': {
608 'id': 'business-28299555',
609 'title': 'Farnborough Airshow: Video highlights',
9afa1770 610 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 611 },
612 'playlist_count': 9,
9afa1770 613 'skip': 'Save time',
88ed52ae
S
614 }, {
615 # article with multiple videos embedded with `new SMP()`
6a747190 616 # broken
88ed52ae
S
617 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
618 'info_dict': {
619 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 620 'title': 'BUGGER',
88ed52ae
S
621 },
622 'playlist_count': 18,
a3bfddfa 623 }, {
6a747190 624 # single video embedded with data-playable containing vpid
10273d6e 625 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 626 'info_dict': {
627 'id': 'p02mprgb',
55ebae26 628 'ext': 'mp4',
10273d6e 629 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 630 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 631 'duration': 47,
9afa1770 632 'timestamp': 1427219242,
da92eeae 633 'upload_date': '20150324',
10273d6e 634 },
635 'params': {
9afa1770 636 # rtmp download
10273d6e 637 'skip_download': True,
638 }
a3bfddfa 639 }, {
6a747190
S
640 # article with single video embedded with data-playable containing XML playlist
641 # with direct video links as progressiveDownloadUrl (for now these are extracted)
642 # and playlist with f4m and m3u8 as streamingUrl
de939d89 643 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 644 'info_dict': {
9afa1770 645 'id': '150615_telabyad_kentin_cogu',
de939d89 646 'ext': 'mp4',
ad152e2d 647 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 648 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 649 'timestamp': 1434397334,
da92eeae 650 'upload_date': '20150615',
de939d89 651 },
652 'params': {
653 'skip_download': True,
654 }
c936d8cc 655 }, {
6a747190 656 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 657 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 658 'info_dict': {
9afa1770 659 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 660 'ext': 'mp4',
9afa1770 661 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 662 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 663 'timestamp': 1434713142,
da92eeae 664 'upload_date': '20150619',
de939d89 665 },
666 'params': {
667 'skip_download': True,
668 }
a346b1ff
S
669 }, {
670 # single video from video playlist embedded with vxp-playlist-data JSON
671 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
672 'info_dict': {
673 'id': 'p02w6qjc',
55ebae26 674 'ext': 'mp4',
a346b1ff
S
675 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
676 'duration': 56,
0bc4ee60 677 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
678 },
679 'params': {
680 'skip_download': True,
681 }
9afa1770
S
682 }, {
683 # single video story with digitalData
684 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
685 'info_dict': {
686 'id': 'p02q6gc4',
687 'ext': 'flv',
688 'title': 'Sri Lanka’s spicy secret',
689 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
690 'timestamp': 1437674293,
691 'upload_date': '20150723',
692 },
693 'params': {
694 # rtmp download
695 'skip_download': True,
696 }
697 }, {
698 # single video story without digitalData
699 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
700 'info_dict': {
701 'id': 'p018zqqg',
55ebae26 702 'ext': 'mp4',
9afa1770
S
703 'title': 'Hyundai Santa Fe Sport: Rock star',
704 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
705 'timestamp': 1415867444,
706 'upload_date': '20141113',
9afa1770
S
707 },
708 'params': {
709 # rtmp download
710 'skip_download': True,
711 }
9fb64c04
S
712 }, {
713 # single video embedded with Morph
714 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
715 'info_dict': {
716 'id': 'p041vhd0',
717 'ext': 'mp4',
718 'title': "Nigeria v Japan - Men's First Round",
719 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
720 'duration': 7980,
721 'uploader': 'BBC Sport',
722 'uploader_id': 'bbc_sport',
723 },
724 'params': {
725 # m3u8 download
726 'skip_download': True,
9fb64c04
S
727 },
728 'skip': 'Georestricted to UK',
9afa1770 729 }, {
6a747190 730 # single video with playlist.sxml URL in playlist param
9afa1770
S
731 'url': 'http://www.bbc.com/sport/0/football/33653409',
732 'info_dict': {
733 'id': 'p02xycnp',
55ebae26 734 'ext': 'mp4',
9afa1770 735 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 736 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
737 'duration': 140,
738 },
739 'params': {
740 # rtmp download
741 'skip_download': True,
742 }
b5d48cb1 743 }, {
6a747190 744 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
745 'url': 'http://www.bbc.com/sport/0/football/34475836',
746 'info_dict': {
747 'id': '34475836',
450b233c 748 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 749 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
750 },
751 'playlist_count': 3,
450b233c
S
752 }, {
753 # school report article with single video
754 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
755 'info_dict': {
756 'id': '35744779',
757 'title': 'School which breaks down barriers in Jerusalem',
758 },
759 'playlist_count': 1,
9afa1770
S
760 }, {
761 # single video with playlist URL from weather section
762 'url': 'http://www.bbc.com/weather/features/33601775',
763 'only_matching': True,
764 }, {
765 # custom redirection to www.bbc.com
766 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
767 'only_matching': True,
a1cf3e38
S
768 }, {
769 # single video article embedded with data-media-vpid
770 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
771 'only_matching': True,
10273d6e 772 }]
773
9afa1770
S
@classmethod
def suitable(cls, url):
    """Accept url only when none of the more specific BBC extractors does."""
    more_specific = (
        BBCCoUkIE, BBCCoUkArticleIE,
        BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
    for ie in more_specific:
        if ie.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
779
780 def _extract_from_media_meta(self, media_meta, video_id):
781 # Direct links to media in media metadata (e.g.
782 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
783 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
784 source_files = media_meta.get('sourceFiles')
785 if source_files:
786 return [{
787 'url': f['url'],
788 'format_id': format_id,
789 'ext': f.get('encoding'),
790 'tbr': float_or_none(f.get('bitrate'), 1000),
791 'filesize': int_or_none(f.get('filesize')),
792 } for format_id, f in source_files.items() if f.get('url')], []
793
794 programme_id = media_meta.get('externalId')
795 if programme_id:
796 return self._download_media_selector(programme_id)
797
798 # Process playlist.sxml as legacy playlist
799 href = media_meta.get('href')
800 if href:
801 playlist = self._download_legacy_playlist_url(href)
802 _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
803 return formats, subtitles
804
805 return [], []
806
baf39a1a
S
807 def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
808 programme_id, title, description, duration, formats, subtitles = \
809 self._process_legacy_playlist_url(url, playlist_id)
810 self._sort_formats(formats)
811 return {
812 'id': programme_id,
813 'title': title,
814 'description': description,
815 'duration': duration,
816 'timestamp': timestamp,
817 'formats': formats,
818 'subtitles': subtitles,
819 }
820
def _real_extract(self, url):
    """Extract a single video or a playlist of videos from a BBC page.

    The page is probed for several known embed flavours, in this order;
    the first flavour that yields media determines the result:

    1. playlist.sxml embeds and ``data-playable`` attributes (playlist)
    2. a vpid referenced directly in the markup (single video)
    3. a ``Morph.setPayload`` lead media component (single video)
    4. ``new SMP(...)`` / ``setPlaylist(...)`` embeds (playlist of URLs
       delegated to the BBCCoUk extractor)
    5. ``data-media-meta`` attributes, ``mediaAssetPage.init`` data or
       the ``vxp-playlist-data`` JSON script (playlist)

    Returns either a single video info dict or a playlist result.  The
    ``vxp-playlist-data`` lookup is the last resort and is performed
    without a default, so a page matching none of the flavours fails
    there.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title')
    if not playlist_title:
        playlist_title = self._og_search_title(
            webpage, default=None) or self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
        if playlist_title:
            # Drop the trailing " - BBC ..." site suffix
            playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    for _, data_playable_json in re.findall(
            r'data-playable=(["\'])({.+?})\1', webpage):
        data_playable = self._parse_json(
            unescapeHTML(data_playable_json), playlist_id, fatal=False)
        if not data_playable:
            continue
        settings = data_playable.get('settings', {})
        if not settings:
            continue
        # data-playable with video vpid in settings.playlistObject.items (e.g.
        # http://www.bbc.com/news/world-us-canada-34473351)
        playlist_object = settings.get('playlistObject', {})
        if playlist_object:
            items = playlist_object.get('items')
            if not items or not isinstance(items, list):
                continue
            first_item = items[0]
            if not isinstance(first_item, dict):
                continue
            programme_id = first_item.get('vpid')
            if not programme_id:
                # Nothing to ask the media selector for
                continue
            title = playlist_object['title']
            description = playlist_object.get('summary')
            duration = int_or_none(first_item.get('duration'))
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            entries.append({
                'id': programme_id,
                'title': title,
                'description': description,
                'timestamp': timestamp,
                'duration': duration,
                'formats': formats,
                'subtitles': subtitles,
            })
        else:
            # data-playable without vpid but with a playlist.sxml URLs
            # in otherSettings.playlist (e.g.
            # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
            playlist = data_playable.get('otherSettings', {}).get('playlist', {})
            if not playlist:
                continue
            entry = None
            for key in ('streaming', 'progressiveDownload'):
                playlist_url = playlist.get('%sUrl' % key)
                if not playlist_url:
                    continue
                try:
                    info = self._extract_from_playlist_sxml(
                        playlist_url, playlist_id, timestamp)
                    if not entry:
                        entry = info
                    else:
                        # Merge the second playlist into the first entry
                        entry['title'] = info['title']
                        entry['formats'].extend(info['formats'])
                except Exception as e:
                    # Some playlist URL may fail with 500, at the same time
                    # the other one may work fine (e.g.
                    # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                    # Not every exception carries a ``cause`` attribute, so
                    # look it up defensively to avoid masking the original
                    # error with an AttributeError.
                    cause = getattr(e, 'cause', None)
                    if isinstance(cause, compat_HTTPError) and cause.code == 500:
                        continue
                    raise
            if entry:
                self._sort_formats(entry['formats'])
                entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        self._sort_formats(formats)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # or unparsable; the parse is non fatal so fall back to an empty dict
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = digital_data.get('page', {}).get('pageInfo', {})
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # There are several setPayload calls may be present but the video
    # seems to be always related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    def extract_all(pattern):
        # Parse every match of pattern as JSON, dropping unparsable ones
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    embed_urls = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            embed_urls.append(embed_url)
    embed_urls.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if embed_urls:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in embed_urls],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # The parse is non fatal, so guard against a failed parse
            media_asset_page = self._parse_json(
                media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in media_asset_page.get('videos', {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats:
            continue
        self._sort_formats(formats)

        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1090
1091
class BBCCoUkArticleIE(InfoExtractor):
    # Extractor for bbc.co.uk/programmes/articles/<id> pages: every clip
    # embedded in the article becomes one playlist entry.
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist made of the clips referenced by the article.

        Title and description come from the page's Open Graph metadata;
        each ``<div typeof="Clip" resource="...">`` contributes one URL
        entry that is resolved by whichever extractor matches it.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # The description is optional metadata: only strip it when the
        # page actually provides one instead of failing on None.
        description = self._og_search_description(webpage)
        if description:
            description = description.strip()

        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1120
1121
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    # Shared pagination logic for bbc.co.uk playlist pages.  Subclasses
    # are expected to define _VIDEO_ID_TEMPLATE (regex with one %s for the
    # id pattern), _URL_TEMPLATE (URL with one %s for the video id) and
    # _extract_title_and_description(webpage); none of them is defined here.

    def _entries(self, webpage, url, playlist_id):
        """Yield one URL result per video id found on each playlist page.

        ``webpage`` is the already downloaded first page.  When ``url``
        carries an explicit ``page`` query parameter only that page is
        processed; otherwise the "next" pagination link is followed until
        no further page is advertised.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # The first page is already at hand, so numbering of the pages
        # that still have to be fetched starts at 2
        for page_num in itertools.count(2):
            for video_id in re.findall(
                    self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # Pass a real error note: the bare page number used to land in
            # the errnote slot and produced messages such as "2: <error>"
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                note='Downloading page %d' % page_num,
                errnote='Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Return a playlist result whose entries are produced lazily."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
ded7511a
S
1152
1153
class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
    # iPlayer "episodes" and "group" listings; the entries point at
    # iPlayer episode pages.
    IE_NAME = 'bbc.co.uk:iplayer:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    # Templates consumed by the pagination code of the base class
    _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
    _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'

    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 6,
        'skip': 'This programme is not currently available on BBC iPlayer',
    }, {
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` scraped from the page markup.

        The title is the text of the first ``<h1>``, the description the
        text of the ``<p class="subtitle">`` element; both lookups are
        non fatal.
        """
        heading = self._search_regex(
            r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
        subtitle = self._search_regex(
            r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
            webpage, 'description', fatal=False, group='value')
        return heading, subtitle
1185
1186
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    # /programmes/<id>/{episodes,broadcasts,clips} listings; the entries
    # point at the corresponding /programmes/<pid> pages.
    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX

    # Templates consumed by the pagination code of the base class
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'

    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` from the Open Graph metadata.

        The title lookup is explicitly non fatal; the description lookup
        uses the helper's defaults.
        """
        return (
            self._og_search_title(webpage, fatal=False),
            self._og_search_description(webpage),
        )