]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/bbc.py
[apa] Add extractor (closes #15041, closes #15672)
[yt-dlp.git] / youtube_dl / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
9afa1770 4import re
254e64a2 5import itertools
082c6c86 6
f13b1e7d 7from .common import InfoExtractor
8683b4d8 8from ..utils import (
97067db2 9 clean_html,
9fb64c04 10 dict_get,
8683b4d8 11 ExtractorError,
9afa1770 12 float_or_none,
97067db2 13 get_element_by_class,
8683b4d8 14 int_or_none,
9afa1770
S
15 parse_duration,
16 parse_iso8601,
9fb64c04 17 try_get,
dab062fb 18 unescapeHTML,
97067db2
S
19 urlencode_postdata,
20 urljoin,
8683b4d8 21)
36e6f62c
JMF
22from ..compat import (
23 compat_etree_fromstring,
24 compat_HTTPError,
254e64a2 25 compat_urlparse,
36e6f62c 26)
082c6c86 27
d12a1a47 28
f13b1e7d 29class BBCCoUkIE(InfoExtractor):
082c6c86 30 IE_NAME = 'bbc.co.uk'
2e3fd9ec 31 IE_DESC = 'BBC iPlayer'
53647dfd 32 _ID_REGEX = r'[pbw][\da-z]{7}'
f20a11ed
S
33 _VALID_URL = r'''(?x)
34 https?://
35 (?:www\.)?bbc\.co\.uk/
36 (?:
37 programmes/(?!articles/)|
38 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
72d256c4 39 music/(?:clips|audiovideo/popular)[/#]|
d3d45e0a
S
40 radio/player/|
41 events/[^/]+/play/[^/]+/
f20a11ed 42 )
ded7511a 43 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
f20a11ed 44 ''' % _ID_REGEX
082c6c86 45
97067db2
S
46 _LOGIN_URL = 'https://account.bbc.com/signin'
47 _NETRC_MACHINE = 'bbc'
48
d12a1a47 49 _MEDIASELECTOR_URLS = [
26ccc68b
S
50 # Provides HQ HLS streams with even better quality than pc mediaset but fails
51 # with geolocation in some cases when it's not even geo restricted at all (e.g.
d781e293 52 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
d1c694ea 53 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
54 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
55 ]
a8b081a0 56
e6174ee9
S
57 _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
58 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
59
60 _NAMESPACES = (
61 _MEDIASELECTION_NS,
62 _EMP_PLAYLIST_NS,
63 )
64
2e3fd9ec
S
65 _TESTS = [
66 {
f2d0fc68 67 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 68 'info_dict': {
f2d0fc68 69 'id': 'b039d07m',
b1ea6802 70 'ext': 'flv',
679bacf0 71 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4',
c4914185 72 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
73 },
74 'params': {
b1ea6802 75 # rtmp download
2e3fd9ec
S
76 'skip_download': True,
77 }
082c6c86 78 },
2e3fd9ec
S
79 {
80 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
81 'info_dict': {
82 'id': 'b00yng1d',
83 'ext': 'flv',
84 'title': 'The Man in Black: Series 3: The Printed Name',
85 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
86 'duration': 1800,
87 },
88 'params': {
89 # rtmp download
90 'skip_download': True,
c7f0177f
S
91 },
92 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
93 },
94 {
95 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
96 'info_dict': {
97 'id': 'b00yng1d',
98 'ext': 'flv',
17968e44 99 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 100 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 101 'duration': 5100,
2e3fd9ec
S
102 },
103 'params': {
104 # rtmp download
105 'skip_download': True,
106 },
b1ea6802 107 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
108 },
109 {
110 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
111 'info_dict': {
112 'id': 'b03k3pb7',
113 'ext': 'flv',
114 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
115 'description': '2. Invasion',
116 'duration': 3600,
117 },
118 'params': {
119 # rtmp download
120 'skip_download': True,
121 },
b1ea6802 122 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
123 }, {
124 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
125 'info_dict': {
126 'id': 'b04v209v',
127 'ext': 'flv',
128 'title': 'Pete Tong, The Essential New Tune Special',
129 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
130 'duration': 10800,
131 },
132 'params': {
133 # rtmp download
134 'skip_download': True,
a3ef0e1c
YCH
135 },
136 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 137 }, {
5aa535c3 138 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
139 'note': 'Audio',
140 'info_dict': {
5aa535c3 141 'id': 'p022h44j',
b1ea6802 142 'ext': 'flv',
5aa535c3
S
143 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
144 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
145 'duration': 227,
c7e67594
S
146 },
147 'params': {
b1ea6802 148 # rtmp download
c7e67594
S
149 'skip_download': True,
150 }
151 }, {
152 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
153 'note': 'Video',
154 'info_dict': {
155 'id': 'p025c103',
b1ea6802 156 'ext': 'flv',
c7e67594
S
157 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
158 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
159 'duration': 226,
160 },
161 'params': {
b1ea6802 162 # rtmp download
c7e67594
S
163 'skip_download': True,
164 }
e68ae99a
S
165 }, {
166 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
167 'info_dict': {
168 'id': 'p02n76xf',
169 'ext': 'flv',
170 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
171 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
172 'duration': 3540,
173 },
174 'params': {
175 # rtmp download
176 'skip_download': True,
177 },
b1ea6802 178 'skip': 'geolocation',
25fa8d66
YCH
179 }, {
180 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
181 'info_dict': {
182 'id': 'b05zmgw1',
183 'ext': 'flv',
184 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
185 'title': 'Royal Academy Summer Exhibition',
186 'duration': 3540,
187 },
188 'params': {
189 # rtmp download
190 'skip_download': True,
191 },
b1ea6802 192 'skip': 'geolocation',
54914380
S
193 }, {
194 # iptv-all mediaset fails with geolocation however there is no geo restriction
195 # for this programme at all
5aa535c3 196 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 197 'info_dict': {
5aa535c3 198 'id': 'b06rkms3',
54914380 199 'ext': 'flv',
5aa535c3
S
200 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
201 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
202 },
203 'params': {
204 # rtmp download
205 'skip_download': True,
206 },
b1ea6802 207 'skip': 'Now it\'s really geo-restricted',
1ac6e794
S
208 }, {
209 # compact player (https://github.com/rg3/youtube-dl/issues/8147)
210 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
211 'info_dict': {
212 'id': 'p028bfkj',
b1ea6802 213 'ext': 'flv',
1ac6e794
S
214 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
215 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
216 },
217 'params': {
b1ea6802 218 # rtmp download
1ac6e794
S
219 'skip_download': True,
220 },
31763975
S
221 }, {
222 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
223 'only_matching': True,
c7e67594
S
224 }, {
225 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
226 'only_matching': True,
0692ef86
S
227 }, {
228 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
229 'only_matching': True,
f20a11ed
S
230 }, {
231 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
232 'only_matching': True,
72d256c4
S
233 }, {
234 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
235 'only_matching': True,
53647dfd
S
236 }, {
237 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
238 'only_matching': True,
72d256c4 239 }]
2e3fd9ec 240
97eb9bd2
RA
241 _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
242
97067db2
S
243 def _login(self):
244 username, password = self._get_login_info()
245 if username is None:
246 return
247
248 login_page = self._download_webpage(
249 self._LOGIN_URL, None, 'Downloading signin page')
250
251 login_form = self._hidden_inputs(login_page)
252
253 login_form.update({
254 'username': username,
255 'password': password,
256 })
257
258 post_url = urljoin(self._LOGIN_URL, self._search_regex(
259 r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
260 'post url', default=self._LOGIN_URL, group='url'))
261
262 response, urlh = self._download_webpage_handle(
263 post_url, None, 'Logging in', data=urlencode_postdata(login_form),
264 headers={'Referer': self._LOGIN_URL})
265
266 if self._LOGIN_URL in urlh.geturl():
267 error = clean_html(get_element_by_class('form-message', response))
268 if error:
269 raise ExtractorError(
270 'Unable to login: %s' % error, expected=True)
271 raise ExtractorError('Unable to log in')
272
273 def _real_initialize(self):
274 self._login()
275
d12a1a47
S
class MediaSelectionError(Exception):
    """Error reported by a media selector response.

    ``id`` holds the value passed to the constructor; callers compare it
    against identifiers such as 'geolocation' or 'notukerror'.
    """

    def __init__(self, id):
        self.id = id
279
2e3fd9ec
S
280 def _extract_asx_playlist(self, connection, programme_id):
281 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
282 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
283
2e3fd9ec 284 def _extract_items(self, playlist):
e6174ee9
S
285 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
286
287 def _findall_ns(self, element, xpath):
288 elements = []
289 for ns in self._NAMESPACES:
290 elements.extend(element.findall(xpath % ns))
291 return elements
2e3fd9ec
S
292
293 def _extract_medias(self, media_selection):
e6174ee9
S
294 error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
295 if error is None:
296 media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
c056efa2 297 if error is not None:
d12a1a47 298 raise BBCCoUkIE.MediaSelectionError(error.get('id'))
e6174ee9 299 return self._findall_ns(media_selection, './{%s}media')
2e3fd9ec
S
300
301 def _extract_connections(self, media):
e6174ee9 302 return self._findall_ns(media, './{%s}connection')
2e3fd9ec 303
f13b1e7d 304 def _get_subtitles(self, media, programme_id):
2e3fd9ec
S
305 subtitles = {}
306 for connection in self._extract_connections(media):
307 captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
308 lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
f13b1e7d
JMF
309 subtitles[lang] = [
310 {
311 'url': connection.get('href'),
312 'ext': 'ttml',
313 },
f13b1e7d 314 ]
2e3fd9ec 315 return subtitles
082c6c86 316
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Re-raise a media selection error as an expected ExtractorError.

    The message combines ``self.IE_NAME`` with the error's ``id``.
    """
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
321
c056efa2 322 def _download_media_selector(self, programme_id):
d12a1a47
S
323 last_exception = None
324 for mediaselector_url in self._MEDIASELECTOR_URLS:
325 try:
326 return self._download_media_selector_url(
327 mediaselector_url % programme_id, programme_id)
328 except BBCCoUkIE.MediaSelectionError as e:
d781e293 329 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
d12a1a47
S
330 last_exception = e
331 continue
332 self._raise_extractor_error(e)
333 self._raise_extractor_error(last_exception)
9afa1770
S
334
335 def _download_media_selector_url(self, url, programme_id=None):
c056efa2
S
336 try:
337 media_selection = self._download_xml(
9afa1770 338 url, programme_id, 'Downloading media selection XML')
c056efa2 339 except ExtractorError as ee:
d781e293 340 if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404):
36e6f62c 341 media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))
2e3fd9ec 342 else:
c056efa2 343 raise
9afa1770 344 return self._process_media_selector(media_selection, programme_id)
082c6c86 345
9afa1770 346 def _process_media_selector(self, media_selection, programme_id):
082c6c86 347 formats = []
2e3fd9ec 348 subtitles = None
b0af1215 349 urls = []
2e3fd9ec 350
c056efa2
S
351 for media in self._extract_medias(media_selection):
352 kind = media.get('kind')
a7e5f274
RA
353 if kind in ('video', 'audio'):
354 bitrate = int_or_none(media.get('bitrate'))
355 encoding = media.get('encoding')
356 service = media.get('service')
357 width = int_or_none(media.get('width'))
358 height = int_or_none(media.get('height'))
359 file_size = int_or_none(media.get('media_file_size'))
360 for connection in self._extract_connections(media):
b0af1215
RA
361 href = connection.get('href')
362 if href in urls:
363 continue
364 if href:
365 urls.append(href)
a7e5f274
RA
366 conn_kind = connection.get('kind')
367 protocol = connection.get('protocol')
368 supplier = connection.get('supplier')
a7e5f274
RA
369 transfer_format = connection.get('transferFormat')
370 format_id = supplier or conn_kind or protocol
371 if service:
372 format_id = '%s_%s' % (service, format_id)
373 # ASX playlist
374 if supplier == 'asx':
375 for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
376 formats.append({
377 'url': ref,
378 'format_id': 'ref%s_%s' % (i, format_id),
379 })
380 elif transfer_format == 'dash':
381 formats.extend(self._extract_mpd_formats(
382 href, programme_id, mpd_id=format_id, fatal=False))
383 elif transfer_format == 'hls':
384 formats.extend(self._extract_m3u8_formats(
385 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
386 m3u8_id=format_id, fatal=False))
97eb9bd2
RA
387 if re.search(self._USP_RE, href):
388 usp_formats = self._extract_m3u8_formats(
6b2d8c91 389 re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href),
97eb9bd2
RA
390 programme_id, ext='mp4', entry_protocol='m3u8_native',
391 m3u8_id=format_id, fatal=False)
392 for f in usp_formats:
393 if f.get('height') and f['height'] > 720:
394 continue
395 formats.append(f)
a7e5f274
RA
396 elif transfer_format == 'hds':
397 formats.extend(self._extract_f4m_formats(
398 href, programme_id, f4m_id=format_id, fatal=False))
399 else:
f9622868 400 if not service and not supplier and bitrate:
aaa42cf0 401 format_id += '-%d' % bitrate
a7e5f274
RA
402 fmt = {
403 'format_id': format_id,
404 'filesize': file_size,
405 }
406 if kind == 'video':
407 fmt.update({
408 'width': width,
409 'height': height,
6240925b 410 'tbr': bitrate,
a7e5f274
RA
411 'vcodec': encoding,
412 })
413 else:
414 fmt.update({
415 'abr': bitrate,
416 'acodec': encoding,
417 'vcodec': 'none',
418 })
1af959ef 419 if protocol in ('http', 'https'):
a7e5f274
RA
420 # Direct link
421 fmt.update({
422 'url': href,
423 })
424 elif protocol == 'rtmp':
425 application = connection.get('application', 'ondemand')
426 auth_string = connection.get('authString')
427 identifier = connection.get('identifier')
428 server = connection.get('server')
429 fmt.update({
430 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
431 'play_path': identifier,
432 'app': '%s?%s' % (application, auth_string),
433 'page_url': 'http://www.bbc.co.uk',
434 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
435 'rtmp_live': False,
436 'ext': 'flv',
437 })
964744af
S
438 else:
439 continue
a7e5f274 440 formats.append(fmt)
c056efa2 441 elif kind == 'captions':
f13b1e7d 442 subtitles = self.extract_subtitles(media, programme_id)
c056efa2 443 return formats, subtitles
2e3fd9ec 444
ae6986fb
S
445 def _download_playlist(self, playlist_id):
446 try:
447 playlist = self._download_json(
448 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
449 playlist_id, 'Downloading playlist JSON')
450
451 version = playlist.get('defaultAvailableVersion')
452 if version:
453 smp_config = version['smpConfig']
454 title = smp_config['title']
455 description = smp_config['summary']
456 for item in smp_config['items']:
457 kind = item['kind']
40fcba5e 458 if kind not in ('programme', 'radioProgramme'):
ae6986fb
S
459 continue
460 programme_id = item.get('vpid')
d97f5cd7 461 duration = int_or_none(item.get('duration'))
ae6986fb
S
462 formats, subtitles = self._download_media_selector(programme_id)
463 return programme_id, title, description, duration, formats, subtitles
464 except ExtractorError as ee:
f813928e 465 if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
ae6986fb
S
466 raise
467
468 # fallback to legacy playlist
9afa1770
S
469 return self._process_legacy_playlist(playlist_id)
470
471 def _process_legacy_playlist_url(self, url, display_id):
472 playlist = self._download_legacy_playlist_url(url, display_id)
473 return self._extract_from_legacy_playlist(playlist, display_id)
474
475 def _process_legacy_playlist(self, playlist_id):
476 return self._process_legacy_playlist_url(
477 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
478
479 def _download_legacy_playlist_url(self, url, playlist_id=None):
480 return self._download_xml(
481 url, playlist_id, 'Downloading legacy playlist XML')
ae6986fb 482
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract media info from a legacy (EMP playlist namespace) XML playlist.

    Returns ``(programme_id, title, description, duration, formats,
    subtitles)`` for the first item of kind 'programme' or
    'radioProgramme'. Returns None implicitly when no such item exists.

    Raises an expected ExtractorError when the playlist carries a
    ``noItems`` element; the message depends on its ``reason`` attribute.
    """
    emp_ns = self._EMP_PLAYLIST_NS

    no_items = playlist.find('./{%s}noItems' % emp_ns)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First 'identifier' or 'group' attribute that looks like a BBC id.
        for attr in ('identifier', 'group'):
            value = element.get(attr)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value
        return None

    def get_programme_id(item):
        # Prefer an id on the item itself. This lookup used to be made and
        # its result discarded, so only the mediator was ever consulted.
        programme_id = get_from_attributes(item)
        if programme_id:
            return programme_id
        mediator = item.find('./{%s}mediator' % emp_ns)
        if mediator is not None:
            return get_from_attributes(mediator)
        return None

    for item in self._extract_items(playlist):
        kind = item.get('kind')
        if kind not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % emp_ns).text
        description_el = playlist.find('./{%s}summary' % emp_ns)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No usable id: treat the item itself as the media selection.
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

        return programme_id, title, description, duration, formats, subtitles
526
c056efa2
S
527 def _real_extract(self, url):
528 group_id = self._match_id(url)
529
530 webpage = self._download_webpage(url, group_id, 'Downloading video page')
531
b2ed954f
S
532 error = self._search_regex(
533 r'<div\b[^>]+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<',
534 webpage, 'error', default=None)
535 if error:
536 raise ExtractorError(error, expected=True)
537
8683b4d8 538 programme_id = None
679bacf0 539 duration = None
8683b4d8
S
540
541 tviplayer = self._search_regex(
542 r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
543 webpage, 'player', default=None)
544
545 if tviplayer:
546 player = self._parse_json(tviplayer, group_id).get('player', {})
547 duration = int_or_none(player.get('duration'))
548 programme_id = player.get('vpid')
549
550 if not programme_id:
551 programme_id = self._search_regex(
22d7368d 552 r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
8683b4d8 553
c056efa2 554 if programme_id:
c056efa2 555 formats, subtitles = self._download_media_selector(programme_id)
88fb59d9 556 title = self._og_search_title(webpage, default=None) or self._html_search_regex(
50e989e2
S
557 (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
558 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
8683b4d8 559 description = self._search_regex(
a8534274
S
560 (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
561 r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
679bacf0
YCH
562 webpage, 'description', default=None)
563 if not description:
564 description = self._html_search_meta('description', webpage)
c056efa2 565 else:
ae6986fb 566 programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
2e3fd9ec 567
082c6c86
S
568 self._sort_formats(formats)
569
570 return {
2e3fd9ec 571 'id': programme_id,
082c6c86
S
572 'title': title,
573 'description': description,
650cfd0c 574 'thumbnail': self._og_search_thumbnail(webpage, default=None),
082c6c86
S
575 'duration': duration,
576 'formats': formats,
2e3fd9ec 577 'subtitles': subtitles,
5f6a1245 578 }
10273d6e 579
580
9afa1770
S
581class BBCIE(BBCCoUkIE):
582 IE_NAME = 'bbc'
583 IE_DESC = 'BBC'
584 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 585
d12a1a47 586 _MEDIASELECTOR_URLS = [
55ebae26
S
587 # Provides HQ HLS streams but fails with geolocation in some cases when it's
588 # not even geo restricted at all
589 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
590 # Provides more formats, namely direct mp4 links, but fails on some videos with
591 # notukerror for non UK (?) users (e.g.
592 # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
593 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
594 # Provides fewer formats, but works everywhere for everybody (hopefully)
595 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
596 ]
10273d6e 597
598 _TESTS = [{
6a747190 599 # article with multiple videos embedded with data-playable containing vpids
10273d6e 600 'url': 'http://www.bbc.com/news/world-europe-32668511',
601 'info_dict': {
602 'id': 'world-europe-32668511',
603 'title': 'Russia stages massive WW2 parade despite Western boycott',
9afa1770 604 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 605 },
606 'playlist_count': 2,
a3bfddfa 607 }, {
6a747190 608 # article with multiple videos embedded with data-playable (more videos)
10273d6e 609 'url': 'http://www.bbc.com/news/business-28299555',
610 'info_dict': {
611 'id': 'business-28299555',
612 'title': 'Farnborough Airshow: Video highlights',
9afa1770 613 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 614 },
615 'playlist_count': 9,
9afa1770 616 'skip': 'Save time',
88ed52ae
S
617 }, {
618 # article with multiple videos embedded with `new SMP()`
6a747190 619 # broken
88ed52ae
S
620 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
621 'info_dict': {
622 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 623 'title': 'BUGGER',
88ed52ae
S
624 },
625 'playlist_count': 18,
a3bfddfa 626 }, {
6a747190 627 # single video embedded with data-playable containing vpid
10273d6e 628 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 629 'info_dict': {
630 'id': 'p02mprgb',
55ebae26 631 'ext': 'mp4',
10273d6e 632 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 633 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 634 'duration': 47,
9afa1770 635 'timestamp': 1427219242,
da92eeae 636 'upload_date': '20150324',
10273d6e 637 },
638 'params': {
9afa1770 639 # rtmp download
10273d6e 640 'skip_download': True,
641 }
a3bfddfa 642 }, {
6a747190
S
643 # article with single video embedded with data-playable containing XML playlist
644 # with direct video links as progressiveDownloadUrl (for now these are extracted)
645 # and playlist with f4m and m3u8 as streamingUrl
de939d89 646 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 647 'info_dict': {
9afa1770 648 'id': '150615_telabyad_kentin_cogu',
de939d89 649 'ext': 'mp4',
ad152e2d 650 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 651 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 652 'timestamp': 1434397334,
da92eeae 653 'upload_date': '20150615',
de939d89 654 },
655 'params': {
656 'skip_download': True,
657 }
c936d8cc 658 }, {
6a747190 659 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 660 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 661 'info_dict': {
9afa1770 662 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 663 'ext': 'mp4',
9afa1770 664 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 665 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 666 'timestamp': 1434713142,
da92eeae 667 'upload_date': '20150619',
de939d89 668 },
669 'params': {
670 'skip_download': True,
671 }
a346b1ff
S
672 }, {
673 # single video from video playlist embedded with vxp-playlist-data JSON
674 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
675 'info_dict': {
676 'id': 'p02w6qjc',
55ebae26 677 'ext': 'mp4',
a346b1ff
S
678 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
679 'duration': 56,
0bc4ee60 680 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
681 },
682 'params': {
683 'skip_download': True,
684 }
9afa1770
S
685 }, {
686 # single video story with digitalData
687 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
688 'info_dict': {
689 'id': 'p02q6gc4',
690 'ext': 'flv',
691 'title': 'Sri Lanka’s spicy secret',
692 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
693 'timestamp': 1437674293,
694 'upload_date': '20150723',
695 },
696 'params': {
697 # rtmp download
698 'skip_download': True,
699 }
700 }, {
701 # single video story without digitalData
702 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
703 'info_dict': {
704 'id': 'p018zqqg',
55ebae26 705 'ext': 'mp4',
9afa1770
S
706 'title': 'Hyundai Santa Fe Sport: Rock star',
707 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
708 'timestamp': 1415867444,
709 'upload_date': '20141113',
9afa1770
S
710 },
711 'params': {
712 # rtmp download
713 'skip_download': True,
714 }
9fb64c04
S
715 }, {
716 # single video embedded with Morph
717 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
718 'info_dict': {
719 'id': 'p041vhd0',
720 'ext': 'mp4',
721 'title': "Nigeria v Japan - Men's First Round",
722 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
723 'duration': 7980,
724 'uploader': 'BBC Sport',
725 'uploader_id': 'bbc_sport',
726 },
727 'params': {
728 # m3u8 download
729 'skip_download': True,
9fb64c04
S
730 },
731 'skip': 'Georestricted to UK',
9afa1770 732 }, {
6a747190 733 # single video with playlist.sxml URL in playlist param
9afa1770
S
734 'url': 'http://www.bbc.com/sport/0/football/33653409',
735 'info_dict': {
736 'id': 'p02xycnp',
55ebae26 737 'ext': 'mp4',
9afa1770 738 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 739 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
740 'duration': 140,
741 },
742 'params': {
743 # rtmp download
744 'skip_download': True,
745 }
b5d48cb1 746 }, {
6a747190 747 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
748 'url': 'http://www.bbc.com/sport/0/football/34475836',
749 'info_dict': {
750 'id': '34475836',
450b233c 751 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 752 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
753 },
754 'playlist_count': 3,
450b233c
S
755 }, {
756 # school report article with single video
757 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
758 'info_dict': {
759 'id': '35744779',
760 'title': 'School which breaks down barriers in Jerusalem',
761 },
762 'playlist_count': 1,
9afa1770
S
763 }, {
764 # single video with playlist URL from weather section
765 'url': 'http://www.bbc.com/weather/features/33601775',
766 'only_matching': True,
767 }, {
768 # custom redirection to www.bbc.com
769 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
770 'only_matching': True,
a1cf3e38
S
771 }, {
772 # single video article embedded with data-media-vpid
773 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
774 'only_matching': True,
10273d6e 775 }]
776
9afa1770
S
@classmethod
def suitable(cls, url):
    """Claim *url* only when none of the more specific BBC extractors do."""
    more_specific = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
    for ie in more_specific:
        if ie.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
782
783 def _extract_from_media_meta(self, media_meta, video_id):
784 # Direct links to media in media metadata (e.g.
785 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
786 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
787 source_files = media_meta.get('sourceFiles')
788 if source_files:
789 return [{
790 'url': f['url'],
791 'format_id': format_id,
792 'ext': f.get('encoding'),
793 'tbr': float_or_none(f.get('bitrate'), 1000),
794 'filesize': int_or_none(f.get('filesize')),
795 } for format_id, f in source_files.items() if f.get('url')], []
796
797 programme_id = media_meta.get('externalId')
798 if programme_id:
799 return self._download_media_selector(programme_id)
800
801 # Process playlist.sxml as legacy playlist
802 href = media_meta.get('href')
803 if href:
804 playlist = self._download_legacy_playlist_url(href)
805 _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
806 return formats, subtitles
807
808 return [], []
809
baf39a1a
S
810 def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
811 programme_id, title, description, duration, formats, subtitles = \
812 self._process_legacy_playlist_url(url, playlist_id)
813 self._sort_formats(formats)
814 return {
815 'id': programme_id,
816 'title': title,
817 'description': description,
818 'duration': duration,
819 'timestamp': timestamp,
820 'formats': formats,
821 'subtitles': subtitles,
822 }
823
def _real_extract(self, url):
    """Extract a single video or a playlist from a BBC web page.

    The page is downloaded once and then probed for the known embed
    layouts, in this order; the first layout that yields something wins:

    1. playlist.sxml references (``<param name="playlist">`` and
       ``data-media-id``) and ``data-playable`` JSON blobs - returned
       together as a playlist.
    2. A single vpid found in the markup - returned as one video, with
       metadata taken from the optional ``digitalData`` object.
    3. A Morph payload with ``leadMedia`` - returned as one video.
    4. ``new SMP(...)`` / ``setPlaylist(...)`` embeds - returned as a
       playlist of URL results delegated to the ``BBCCoUk`` extractor.
    5. ``data-media-meta`` attributes, then ``mediaAssetPage.init``, then
       the ``vxp-playlist-data`` script - returned as a playlist.

    NOTE(review): the last lookup (vxp playlist data) is done without a
    default, so a page matching none of the layouts presumably fails
    there - confirm against ``InfoExtractor._search_regex``.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title')
    if not playlist_title:
        playlist_title = self._og_search_title(
            webpage, default=None) or self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
        if playlist_title:
            # Drop the trailing " - BBC ..." site suffix
            playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
    if data_playables:
        for _, data_playable_json in data_playables:
            data_playable = self._parse_json(
                unescapeHTML(data_playable_json), playlist_id, fatal=False)
            if not data_playable:
                continue
            settings = data_playable.get('settings', {})
            if settings:
                # data-playable with video vpid in settings.playlistObject.items (e.g.
                # http://www.bbc.com/news/world-us-canada-34473351)
                playlist_object = settings.get('playlistObject', {})
                if playlist_object:
                    items = playlist_object.get('items')
                    if items and isinstance(items, list):
                        title = playlist_object['title']
                        description = playlist_object.get('summary')
                        duration = int_or_none(items[0].get('duration'))
                        programme_id = items[0].get('vpid')
                        formats, subtitles = self._download_media_selector(programme_id)
                        self._sort_formats(formats)
                        entries.append({
                            'id': programme_id,
                            'title': title,
                            'description': description,
                            'timestamp': timestamp,
                            'duration': duration,
                            'formats': formats,
                            'subtitles': subtitles,
                        })
                else:
                    # data-playable without vpid but with playlist.sxml URLs
                    # in otherSettings.playlist (e.g.
                    # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
                    playlist = data_playable.get('otherSettings', {}).get('playlist', {})
                    if playlist:
                        entry = None
                        for key in ('streaming', 'progressiveDownload'):
                            playlist_url = playlist.get('%sUrl' % key)
                            if not playlist_url:
                                continue
                            try:
                                info = self._extract_from_playlist_sxml(
                                    playlist_url, playlist_id, timestamp)
                            except ExtractorError as e:
                                # Only ExtractorError carries a `cause`; catching
                                # it specifically keeps unrelated exceptions from
                                # being masked by an AttributeError on `e.cause`.
                                # Some playlist URL may fail with 500, at the same time
                                # the other one may work fine (e.g.
                                # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
                                    continue
                                raise
                            if not entry:
                                entry = info
                            else:
                                # Merge the second playlist into the first entry
                                entry['title'] = info['title']
                                entry['formats'].extend(info['formats'])
                        if entry:
                            self._sort_formats(entry['formats'])
                            entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        self._sort_formats(formats)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # A non-fatal parse yields a falsy result on malformed JSON (see the
        # data-playable handling above), hence the `or {}` fallback.
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = digital_data.get('page', {}).get('pageInfo', {})
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # Several setPayload calls may be present but the video
    # seems to be always related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    def extract_all(pattern):
        # Parse every match of `pattern` as JSON, dropping unparsable ones
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # Non-fatal parse: fall back to an empty dict on malformed JSON
            # so that the next layout is still tried instead of crashing.
            media_asset_page = self._parse_json(
                media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in media_asset_page.get('videos', {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats:
            continue
        self._sort_formats(formats)

        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1093
1094
class BBCCoUkArticleIE(InfoExtractor):
    """Extractor for bbc.co.uk/programmes/articles/<id> pages.

    The article is returned as a playlist whose entries are URL results,
    one per ``<div typeof="Clip" resource="...">`` element on the page.
    """

    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist of the clips referenced by the article at *url*."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # NOTE(review): the og:description lookup is understood to be
        # non-fatal in InfoExtractor and to return None when the tag is
        # absent; guard the strip() so such pages do not raise
        # AttributeError.
        description = self._og_search_description(webpage)
        if description is not None:
            description = description.strip()

        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1123
1124
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared paging logic for bbc.co.uk playlist extractors.

    Subclasses are expected to provide ``_VIDEO_ID_TEMPLATE`` (a regex
    template filled with ``BBCCoUkIE._ID_REGEX``), ``_URL_TEMPLATE`` (a URL
    template filled with each matched video id) and
    ``_extract_title_and_description(webpage)``.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a URL result for every video id found, following pagination.

        *webpage* is the already downloaded first page. When *url* carries
        an explicit ``page`` query parameter only that page is processed;
        otherwise the "next" pagination link is followed until none is left.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # The first page is supplied by the caller, so numbering of the
        # pages downloaded here starts at 2.
        for page_num in itertools.count(2):
            for video_id in re.findall(
                    self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # Pass the notes by keyword: the bare page number used to be
            # given positionally and ended up as the error note.
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                note='Downloading page %d' % page_num,
                errnote='Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Return a playlist built from the (possibly paginated) page at *url*."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
ded7511a
S
1155
1156
class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for bbc.co.uk/iplayer/episodes/ and /iplayer/group/ pages."""

    IE_NAME = 'bbc.co.uk:iplayer:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
    # Each id captured by _VIDEO_ID_TEMPLATE is substituted into _URL_TEMPLATE
    _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
    _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 6,
        'skip': 'This programme is not currently available on BBC iPlayer',
    }, {
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` scraped from the page markup.

        The title is the text of the first ``<h1>`` and the description the
        text of the ``<p class="subtitle">`` element; both lookups are
        non-fatal.
        """
        heading = self._search_regex(
            r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
        subtitle = self._search_regex(
            r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
            webpage, 'description', fatal=False, group='value')
        return heading, subtitle
1188
1189
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for the episodes, broadcasts and clips listings
    under bbc.co.uk/programmes/<id>/."""

    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Each id captured by _VIDEO_ID_TEMPLATE is substituted into _URL_TEMPLATE
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` taken from the page's Open Graph
        metadata; the title lookup is explicitly non-fatal."""
        return (
            self._og_search_title(webpage, fatal=False),
            self._og_search_description(webpage),
        )