]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/bbc.py
Completely change project name to yt-dlp (#85)
[yt-dlp.git] / yt_dlp / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
254e64a2 4import itertools
f0228f56 5import re
082c6c86 6
f13b1e7d 7from .common import InfoExtractor
8683b4d8 8from ..utils import (
97067db2 9 clean_html,
9fb64c04 10 dict_get,
8683b4d8 11 ExtractorError,
9afa1770 12 float_or_none,
97067db2 13 get_element_by_class,
8683b4d8 14 int_or_none,
6d155707 15 js_to_json,
9afa1770
S
16 parse_duration,
17 parse_iso8601,
9fb64c04 18 try_get,
dab062fb 19 unescapeHTML,
f0228f56 20 url_or_none,
97067db2
S
21 urlencode_postdata,
22 urljoin,
8683b4d8 23)
36e6f62c 24from ..compat import (
ee0ba927 25 compat_etree_Element,
36e6f62c 26 compat_HTTPError,
254e64a2 27 compat_urlparse,
36e6f62c 28)
082c6c86 29
d12a1a47 30
f13b1e7d 31class BBCCoUkIE(InfoExtractor):
082c6c86 32 IE_NAME = 'bbc.co.uk'
2e3fd9ec 33 IE_DESC = 'BBC iPlayer'
6f356cbb 34 _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})'
f20a11ed
S
35 _VALID_URL = r'''(?x)
36 https?://
37 (?:www\.)?bbc\.co\.uk/
38 (?:
39 programmes/(?!articles/)|
40 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
72d256c4 41 music/(?:clips|audiovideo/popular)[/#]|
d3d45e0a 42 radio/player/|
b72305f0 43 sounds/play/|
d3d45e0a 44 events/[^/]+/play/[^/]+/
f20a11ed 45 )
ded7511a 46 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
f20a11ed 47 ''' % _ID_REGEX
082c6c86 48
97067db2
S
49 _LOGIN_URL = 'https://account.bbc.com/signin'
50 _NETRC_MACHINE = 'bbc'
51
29f7c58a 52 _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
53 _MEDIA_SETS = [
26ccc68b
S
54 # Provides HQ HLS streams with even better quality than pc mediaset but fails
55 # with geolocation in some cases even when it's not geo restricted at all (e.g.
d781e293 56 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
29f7c58a 57 'iptv-all',
58 'pc',
d12a1a47 59 ]
a8b081a0 60
e6174ee9
S
61 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
62
2e3fd9ec
S
63 _TESTS = [
64 {
f2d0fc68 65 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 66 'info_dict': {
f2d0fc68 67 'id': 'b039d07m',
b1ea6802 68 'ext': 'flv',
acc86c9a 69 'title': 'Kaleidoscope, Leonard Cohen',
c4914185 70 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
71 },
72 'params': {
b1ea6802 73 # rtmp download
2e3fd9ec
S
74 'skip_download': True,
75 }
082c6c86 76 },
2e3fd9ec
S
77 {
78 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
79 'info_dict': {
80 'id': 'b00yng1d',
81 'ext': 'flv',
82 'title': 'The Man in Black: Series 3: The Printed Name',
83 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
84 'duration': 1800,
85 },
86 'params': {
87 # rtmp download
88 'skip_download': True,
c7f0177f
S
89 },
90 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
91 },
92 {
93 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
94 'info_dict': {
95 'id': 'b00yng1d',
96 'ext': 'flv',
17968e44 97 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 98 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 99 'duration': 5100,
2e3fd9ec
S
100 },
101 'params': {
102 # rtmp download
103 'skip_download': True,
104 },
b1ea6802 105 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
106 },
107 {
108 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
109 'info_dict': {
110 'id': 'b03k3pb7',
111 'ext': 'flv',
112 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
113 'description': '2. Invasion',
114 'duration': 3600,
115 },
116 'params': {
117 # rtmp download
118 'skip_download': True,
119 },
b1ea6802 120 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
121 }, {
122 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
123 'info_dict': {
124 'id': 'b04v209v',
125 'ext': 'flv',
126 'title': 'Pete Tong, The Essential New Tune Special',
127 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
128 'duration': 10800,
129 },
130 'params': {
131 # rtmp download
132 'skip_download': True,
a3ef0e1c
YCH
133 },
134 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 135 }, {
5aa535c3 136 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
137 'note': 'Audio',
138 'info_dict': {
5aa535c3 139 'id': 'p022h44j',
b1ea6802 140 'ext': 'flv',
5aa535c3
S
141 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
142 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
143 'duration': 227,
c7e67594
S
144 },
145 'params': {
b1ea6802 146 # rtmp download
c7e67594
S
147 'skip_download': True,
148 }
149 }, {
150 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
151 'note': 'Video',
152 'info_dict': {
153 'id': 'p025c103',
b1ea6802 154 'ext': 'flv',
c7e67594
S
155 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
156 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
157 'duration': 226,
158 },
159 'params': {
b1ea6802 160 # rtmp download
c7e67594
S
161 'skip_download': True,
162 }
e68ae99a
S
163 }, {
164 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
165 'info_dict': {
166 'id': 'p02n76xf',
167 'ext': 'flv',
168 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
169 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
170 'duration': 3540,
171 },
172 'params': {
173 # rtmp download
174 'skip_download': True,
175 },
b1ea6802 176 'skip': 'geolocation',
25fa8d66
YCH
177 }, {
178 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
179 'info_dict': {
180 'id': 'b05zmgw1',
181 'ext': 'flv',
182 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
183 'title': 'Royal Academy Summer Exhibition',
184 'duration': 3540,
185 },
186 'params': {
187 # rtmp download
188 'skip_download': True,
189 },
b1ea6802 190 'skip': 'geolocation',
54914380
S
191 }, {
192 # iptv-all mediaset fails with geolocation however there is no geo restriction
193 # for this programme at all
5aa535c3 194 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 195 'info_dict': {
5aa535c3 196 'id': 'b06rkms3',
54914380 197 'ext': 'flv',
5aa535c3
S
198 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
199 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
200 },
201 'params': {
202 # rtmp download
203 'skip_download': True,
204 },
b1ea6802 205 'skip': 'Now it\'s really geo-restricted',
1ac6e794 206 }, {
067aa17e 207 # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
1ac6e794
S
208 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
209 'info_dict': {
210 'id': 'p028bfkj',
b1ea6802 211 'ext': 'flv',
1ac6e794
S
212 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
213 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
214 },
215 'params': {
b1ea6802 216 # rtmp download
1ac6e794
S
217 'skip_download': True,
218 },
b72305f0
J
219 }, {
220 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
221 'note': 'Audio',
222 'info_dict': {
223 'id': 'm0007jz9',
224 'ext': 'mp4',
225 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
226 'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
227 'duration': 9840,
228 },
229 'params': {
230 # rtmp download
231 'skip_download': True,
232 }
31763975
S
233 }, {
234 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
235 'only_matching': True,
c7e67594
S
236 }, {
237 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
238 'only_matching': True,
0692ef86
S
239 }, {
240 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
241 'only_matching': True,
f20a11ed
S
242 }, {
243 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
244 'only_matching': True,
72d256c4
S
245 }, {
246 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
247 'only_matching': True,
53647dfd
S
248 }, {
249 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
250 'only_matching': True,
6f356cbb
S
251 }, {
252 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
253 'only_matching': True,
254 }, {
255 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
256 'only_matching': True,
72d256c4 257 }]
2e3fd9ec 258
97067db2
S
def _login(self):
    """Sign in to the BBC account service when credentials are configured.

    Does nothing when no username is available. Otherwise the signin page
    is fetched, its hidden inputs are submitted together with the
    credentials, and an ExtractorError is raised if the final URL still
    contains the signin URL after the form post.
    """
    username, password = self._get_login_info()
    if username is None:
        return

    signin_url = self._LOGIN_URL
    signin_page = self._download_webpage(
        signin_url, None, 'Downloading signin page')

    # Start from the hidden inputs of the page, then add the credentials.
    form_fields = self._hidden_inputs(signin_page)
    form_fields.update({
        'username': username,
        'password': password,
    })

    # The form action may be relative; fall back to the signin URL itself.
    form_action = self._search_regex(
        r'<form[^>]+action=(["\'])(?P<url>.+?)\1', signin_page,
        'post url', default=signin_url, group='url')
    post_url = urljoin(signin_url, form_action)

    response, urlh = self._download_webpage_handle(
        post_url, None, 'Logging in', data=urlencode_postdata(form_fields),
        headers={'Referer': signin_url})

    if signin_url not in urlh.geturl():
        return

    # Still on the signin URL: report the form message when there is one.
    error = clean_html(get_element_by_class('form-message', response))
    if error:
        raise ExtractorError(
            'Unable to login: %s' % error, expected=True)
    raise ExtractorError('Unable to log in')
288
def _real_initialize(self):
    """Run the login step as part of extractor initialization."""
    self._login()
291
d12a1a47
S
class MediaSelectionError(Exception):
    """Raised when a media selection document carries an error result."""

    def __init__(self, id):
        # `id` is the error identifier taken from the media selection
        # result; callers compare it against known error names.
        self.id = id
295
2e3fd9ec
S
def _extract_asx_playlist(self, connection, programme_id):
    """Download the ASX playlist of a connection and list its entry hrefs.

    Returns the `href` attribute of every `./Entry/ref` element, in
    document order.
    """
    asx_doc = self._download_xml(
        connection.get('href'), programme_id, 'Downloading ASX playlist')
    hrefs = []
    for ref_el in asx_doc.findall('./Entry/ref'):
        hrefs.append(ref_el.get('href'))
    return hrefs
299
def _extract_items(self, playlist):
    """Return the direct `item` children of a legacy EMP playlist element."""
    item_path = './{%s}item' % self._EMP_PLAYLIST_NS
    return playlist.findall(item_path)
302
def _extract_medias(self, media_selection):
    """Return the `media` list of a media selection document.

    Raises BBCCoUkIE.MediaSelectionError when the document has a truthy
    `result` entry, which is treated as the error identifier. A missing
    or empty `media` entry yields an empty list.
    """
    failure = media_selection.get('result')
    if not failure:
        return media_selection.get('media') or []
    raise BBCCoUkIE.MediaSelectionError(failure)
2e3fd9ec
S
308
def _extract_connections(self, media):
    """Return the `connection` list of a media entry, or an empty list."""
    connections = media.get('connection')
    return connections or []
2e3fd9ec 311
def _get_subtitles(self, media, programme_id):
    """Build the subtitles dict for a captions media entry.

    The first connection whose href is a usable URL and whose captions
    download as an XML element is reported as the English ('en') TTML
    track. Returns an empty dict when no connection qualifies.
    """
    for connection in self._extract_connections(media):
        cc_url = url_or_none(connection.get('href'))
        if not cc_url:
            continue
        # Non-fatal download: a failure simply moves on to the next connection.
        captions = self._download_xml(
            cc_url, programme_id, 'Downloading captions', fatal=False)
        if isinstance(captions, compat_etree_Element):
            return {
                'en': [
                    {
                        'url': connection.get('href'),
                        'ext': 'ttml',
                    },
                ],
            }
    return {}
082c6c86 330
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Raise an expected ExtractorError describing a media selection error.

    The message has the form "<IE_NAME> returned error: <error id>".
    """
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
335
def _download_media_selector(self, programme_id):
    """Try each media set in `_MEDIA_SETS` and return the first success.

    A media set failing with 'notukerror', 'geolocation' or
    'selectionunavailable' is skipped in favour of the next one; any other
    media selection error is raised immediately as an ExtractorError. When
    every media set was skipped, the last error seen is raised.
    """
    skippable_errors = ('notukerror', 'geolocation', 'selectionunavailable')
    last_exception = None
    for media_set in self._MEDIA_SETS:
        selector_url = self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id)
        try:
            return self._download_media_selector_url(selector_url, programme_id)
        except BBCCoUkIE.MediaSelectionError as selection_error:
            if selection_error.id in skippable_errors:
                last_exception = selection_error
            else:
                self._raise_extractor_error(selection_error)
    self._raise_extractor_error(last_exception)
9afa1770
S
348
def _download_media_selector_url(self, url, programme_id=None):
    """Download a media selection JSON document and process it.

    HTTP 403 and 404 responses are accepted so that their body can still
    be handed to `_process_media_selector`.
    """
    accepted_statuses = (403, 404)
    media_selection = self._download_json(
        url, programme_id, 'Downloading media selection JSON',
        expected_status=accepted_statuses)
    return self._process_media_selector(media_selection, programme_id)
082c6c86 354
def _process_media_selector(self, media_selection, programme_id):
    """Convert a media selection document into (formats, subtitles).

    'video' and 'audio' media entries contribute formats, one or more per
    connection; a 'captions' entry provides the subtitles. Other kinds are
    ignored. `subtitles` stays None when no captions entry is present.
    """
    formats = []
    subtitles = None
    seen_hrefs = []

    for media in self._extract_medias(media_selection):
        kind = media.get('kind')
        if kind == 'captions':
            subtitles = self.extract_subtitles(media, programme_id)
            continue
        if kind not in ('video', 'audio'):
            continue

        bitrate = int_or_none(media.get('bitrate'))
        encoding = media.get('encoding')
        width = int_or_none(media.get('width'))
        height = int_or_none(media.get('height'))
        file_size = int_or_none(media.get('media_file_size'))

        for connection in self._extract_connections(media):
            href = connection.get('href')
            # Skip connections whose href was already handled.
            if href in seen_hrefs:
                continue
            if href:
                seen_hrefs.append(href)

            supplier = connection.get('supplier')
            protocol = connection.get('protocol')
            transfer_format = connection.get('transferFormat')
            format_id = supplier or connection.get('kind') or protocol

            # ASX playlist
            if supplier == 'asx':
                asx_refs = self._extract_asx_playlist(connection, programme_id)
                for i, ref in enumerate(asx_refs):
                    formats.append({
                        'url': ref,
                        'format_id': 'ref%s_%s' % (i, format_id),
                    })
                continue

            # Manifest based transfer formats are expanded by the shared
            # manifest helpers; failures there are non-fatal.
            if transfer_format == 'dash':
                formats.extend(self._extract_mpd_formats(
                    href, programme_id, mpd_id=format_id, fatal=False))
                continue
            if transfer_format == 'hls':
                formats.extend(self._extract_m3u8_formats(
                    href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                    m3u8_id=format_id, fatal=False))
                continue
            if transfer_format == 'hds':
                formats.extend(self._extract_f4m_formats(
                    href, programme_id, f4m_id=format_id, fatal=False))
                continue

            # Plain connection: describe a single format.
            if bitrate and not supplier:
                format_id += '-%d' % bitrate
            fmt = {
                'format_id': format_id,
                'filesize': file_size,
            }
            if kind == 'video':
                fmt['width'] = width
                fmt['height'] = height
                fmt['tbr'] = bitrate
                fmt['vcodec'] = encoding
            else:
                fmt['abr'] = bitrate
                fmt['acodec'] = encoding
                fmt['vcodec'] = 'none'

            if protocol in ('http', 'https'):
                # Direct link
                fmt['url'] = href
            elif protocol == 'rtmp':
                application = connection.get('application', 'ondemand')
                auth_string = connection.get('authString')
                identifier = connection.get('identifier')
                server = connection.get('server')
                fmt.update({
                    'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                    'play_path': identifier,
                    'app': '%s?%s' % (application, auth_string),
                    'page_url': 'http://www.bbc.co.uk',
                    'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                    'rtmp_live': False,
                    'ext': 'flv',
                })
            else:
                # Any other protocol is not turned into a format.
                continue
            formats.append(fmt)

    return formats, subtitles
2e3fd9ec 441
ae6986fb
S
def _download_playlist(self, playlist_id):
    """Resolve a playlist id into programme metadata and formats.

    The JSON playlist is tried first: the first 'programme' or
    'radioProgramme' item of the default available version is returned as
    (programme_id, title, description, duration, formats, subtitles).
    When the JSON has no usable item, or the request fails with HTTP 404,
    the legacy playlist is used instead. Other ExtractorErrors propagate.
    """
    playable_kinds = ('programme', 'radioProgramme')
    try:
        playlist = self._download_json(
            'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
            playlist_id, 'Downloading playlist JSON')

        version = playlist.get('defaultAvailableVersion')
        if version:
            smp_config = version['smpConfig']
            title = smp_config['title']
            description = smp_config['summary']
            for item in smp_config['items']:
                if item['kind'] in playable_kinds:
                    programme_id = item.get('vpid')
                    duration = int_or_none(item.get('duration'))
                    formats, subtitles = self._download_media_selector(programme_id)
                    return programme_id, title, description, duration, formats, subtitles
    except ExtractorError as ee:
        # Only a 404 is allowed to fall through to the legacy playlist.
        if not isinstance(ee.cause, compat_HTTPError) or ee.cause.code != 404:
            raise

    # fallback to legacy playlist
    return self._process_legacy_playlist(playlist_id)
467
def _process_legacy_playlist_url(self, url, display_id):
    """Download a legacy playlist from `url` and extract its programme data."""
    legacy_playlist = self._download_legacy_playlist_url(url, display_id)
    return self._extract_from_legacy_playlist(legacy_playlist, display_id)
471
def _process_legacy_playlist(self, playlist_id):
    """Process the legacy iPlayer playlist that belongs to `playlist_id`."""
    legacy_url = 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id
    return self._process_legacy_playlist_url(legacy_url, playlist_id)
475
def _download_legacy_playlist_url(self, url, playlist_id=None):
    """Download the legacy playlist XML document found at `url`."""
    note = 'Downloading legacy playlist XML'
    return self._download_xml(url, playlist_id, note)
ae6986fb 479
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme data from a legacy EMP playlist element.

    Raises an expected ExtractorError when the playlist has a `noItems`
    element, with a message derived from its `reason` attribute.

    Otherwise the first item of kind 'programme' or 'radioProgramme' is
    returned as (programme_id, title, description, duration, formats,
    subtitles). When a programme id can be found, formats come from the
    media selector; otherwise the item itself is processed as a media
    selection and `playlist_id` is reported as the programme id.

    Returns None implicitly when the playlist has no matching item.
    """
    ns = self._EMP_PLAYLIST_NS

    no_items = playlist.find('./{%s}noItems' % ns)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # A programme id is an `identifier` or `group` attribute that
        # looks like a pid (p/b followed by seven lowercase alphanumerics).
        for attr in ('identifier', 'group'):
            value = element.get(attr)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value
        return None

    def get_programme_id(item):
        # The mediator element is consulted first, which keeps the result
        # unchanged for every playlist that resolved before. The item's
        # own attributes are the fallback: previously they were looked up
        # but the result was thrown away, so they never had any effect.
        mediator = item.find('./{%s}mediator' % ns)
        if mediator is not None:
            from_mediator = get_from_attributes(mediator)
            if from_mediator:
                return from_mediator
        return get_from_attributes(item)

    for item in self._extract_items(playlist):
        if item.get('kind') not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % ns).text
        description_el = playlist.find('./{%s}summary' % ns)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

        return programme_id, title, description, duration, formats, subtitles
523
c056efa2
S
def _real_extract(self, url):
    """Extract a single programme from a bbc.co.uk page.

    The programme id (vpid) is looked up in the page's player config and,
    failing that, by a `"vpid"` regex. With a vpid, formats come from the
    media selector and title/description from the page. Without one,
    everything is taken from the playlist of the id in the URL.
    """
    group_id = self._match_id(url)

    webpage = self._download_webpage(url, group_id, 'Downloading video page')

    # The page may carry a player message explaining why playback fails.
    error = self._search_regex(
        r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
        webpage, 'error', default=None)
    if error:
        raise ExtractorError(error, expected=True)

    programme_id = None
    duration = None

    tviplayer = self._search_regex(
        r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
        webpage, 'player', default=None)

    if tviplayer:
        player = self._parse_json(tviplayer, group_id).get('player', {})
        duration = int_or_none(player.get('duration'))
        programme_id = player.get('vpid')

    if not programme_id:
        programme_id = self._search_regex(
            r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)

    if not programme_id:
        programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
    else:
        formats, subtitles = self._download_media_selector(programme_id)
        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
            (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
             r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
        # Prefer the on-page synopsis; fall back to the meta description.
        description = self._search_regex(
            (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
             r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
            webpage, 'description', default=None) or self._html_search_meta('description', webpage)

    self._sort_formats(formats)

    return {
        'id': programme_id,
        'title': title,
        'description': description,
        'thumbnail': self._og_search_thumbnail(webpage, default=None),
        'duration': duration,
        'formats': formats,
        'subtitles': subtitles,
    }
10273d6e 576
577
9afa1770
S
578class BBCIE(BBCCoUkIE):
579 IE_NAME = 'bbc'
580 IE_DESC = 'BBC'
581 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 582
29f7c58a 583 _MEDIA_SETS = [
584 'mobile-tablet-main',
585 'pc',
d12a1a47 586 ]
10273d6e 587
588 _TESTS = [{
6a747190 589 # article with multiple videos embedded with data-playable containing vpids
10273d6e 590 'url': 'http://www.bbc.com/news/world-europe-32668511',
591 'info_dict': {
592 'id': 'world-europe-32668511',
acc86c9a 593 'title': 'Russia stages massive WW2 parade',
9afa1770 594 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 595 },
596 'playlist_count': 2,
a3bfddfa 597 }, {
6a747190 598 # article with multiple videos embedded with data-playable (more videos)
10273d6e 599 'url': 'http://www.bbc.com/news/business-28299555',
600 'info_dict': {
601 'id': 'business-28299555',
602 'title': 'Farnborough Airshow: Video highlights',
9afa1770 603 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 604 },
605 'playlist_count': 9,
9afa1770 606 'skip': 'Save time',
88ed52ae
S
607 }, {
608 # article with multiple videos embedded with `new SMP()`
6a747190 609 # broken
88ed52ae
S
610 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
611 'info_dict': {
612 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 613 'title': 'BUGGER',
88ed52ae
S
614 },
615 'playlist_count': 18,
a3bfddfa 616 }, {
6a747190 617 # single video embedded with data-playable containing vpid
10273d6e 618 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 619 'info_dict': {
620 'id': 'p02mprgb',
55ebae26 621 'ext': 'mp4',
10273d6e 622 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 623 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 624 'duration': 47,
9afa1770 625 'timestamp': 1427219242,
da92eeae 626 'upload_date': '20150324',
10273d6e 627 },
628 'params': {
9afa1770 629 # rtmp download
10273d6e 630 'skip_download': True,
631 }
a3bfddfa 632 }, {
6a747190
S
633 # article with single video embedded with data-playable containing XML playlist
634 # with direct video links as progressiveDownloadUrl (for now these are extracted)
635 # and playlist with f4m and m3u8 as streamingUrl
de939d89 636 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 637 'info_dict': {
9afa1770 638 'id': '150615_telabyad_kentin_cogu',
de939d89 639 'ext': 'mp4',
ad152e2d 640 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 641 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 642 'timestamp': 1434397334,
da92eeae 643 'upload_date': '20150615',
de939d89 644 },
645 'params': {
646 'skip_download': True,
647 }
c936d8cc 648 }, {
6a747190 649 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 650 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 651 'info_dict': {
9afa1770 652 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 653 'ext': 'mp4',
9afa1770 654 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 655 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 656 'timestamp': 1434713142,
da92eeae 657 'upload_date': '20150619',
de939d89 658 },
659 'params': {
660 'skip_download': True,
661 }
a346b1ff
S
662 }, {
663 # single video from video playlist embedded with vxp-playlist-data JSON
664 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
665 'info_dict': {
666 'id': 'p02w6qjc',
55ebae26 667 'ext': 'mp4',
a346b1ff
S
668 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
669 'duration': 56,
0bc4ee60 670 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
671 },
672 'params': {
673 'skip_download': True,
674 }
9afa1770
S
675 }, {
676 # single video story with digitalData
677 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
678 'info_dict': {
679 'id': 'p02q6gc4',
680 'ext': 'flv',
681 'title': 'Sri Lanka’s spicy secret',
682 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
683 'timestamp': 1437674293,
684 'upload_date': '20150723',
685 },
686 'params': {
687 # rtmp download
688 'skip_download': True,
689 }
690 }, {
691 # single video story without digitalData
692 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
693 'info_dict': {
694 'id': 'p018zqqg',
55ebae26 695 'ext': 'mp4',
9afa1770
S
696 'title': 'Hyundai Santa Fe Sport: Rock star',
697 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
698 'timestamp': 1415867444,
699 'upload_date': '20141113',
9afa1770
S
700 },
701 'params': {
702 # rtmp download
703 'skip_download': True,
704 }
9fb64c04
S
705 }, {
706 # single video embedded with Morph
707 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
708 'info_dict': {
709 'id': 'p041vhd0',
710 'ext': 'mp4',
711 'title': "Nigeria v Japan - Men's First Round",
712 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
713 'duration': 7980,
714 'uploader': 'BBC Sport',
715 'uploader_id': 'bbc_sport',
716 },
717 'params': {
718 # m3u8 download
719 'skip_download': True,
9fb64c04
S
720 },
721 'skip': 'Georestricted to UK',
9afa1770 722 }, {
6a747190 723 # single video with playlist.sxml URL in playlist param
9afa1770
S
724 'url': 'http://www.bbc.com/sport/0/football/33653409',
725 'info_dict': {
726 'id': 'p02xycnp',
55ebae26 727 'ext': 'mp4',
9afa1770 728 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 729 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
730 'duration': 140,
731 },
732 'params': {
733 # rtmp download
734 'skip_download': True,
735 }
b5d48cb1 736 }, {
6a747190 737 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
738 'url': 'http://www.bbc.com/sport/0/football/34475836',
739 'info_dict': {
740 'id': '34475836',
450b233c 741 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 742 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
743 },
744 'playlist_count': 3,
450b233c
S
745 }, {
746 # school report article with single video
747 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
748 'info_dict': {
749 'id': '35744779',
750 'title': 'School which breaks down barriers in Jerusalem',
751 },
752 'playlist_count': 1,
9afa1770
S
753 }, {
754 # single video with playlist URL from weather section
755 'url': 'http://www.bbc.com/weather/features/33601775',
756 'only_matching': True,
757 }, {
758 # custom redirection to www.bbc.com
759 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
760 'only_matching': True,
a1cf3e38
S
761 }, {
762 # single video article embedded with data-media-vpid
763 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
764 'only_matching': True,
6d155707
S
765 }, {
766 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
767 'info_dict': {
768 'id': 'p06556y7',
769 'ext': 'mp4',
770 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
771 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd',
772 },
773 'params': {
774 'skip_download': True,
775 }
b96b4be4
RA
776 }, {
777 # window.__PRELOADED_STATE__
778 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
779 'info_dict': {
780 'id': 'b0b9z4vz',
781 'ext': 'mp4',
782 'title': 'Prom 6: An American in Paris and Turangalila',
783 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
784 'uploader': 'Radio 3',
785 'uploader_id': 'bbc_radio_three',
786 },
373941c5
S
787 }, {
788 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
789 'info_dict': {
790 'id': 'p06w9tws',
791 'ext': 'mp4',
792 'title': 'md5:2fabf12a726603193a2879a055f72514',
793 'description': 'Learn English words and phrases from this story',
794 },
795 'add_ie': [BBCCoUkIE.ie_key()],
10273d6e 796 }]
797
9afa1770
S
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle `url`.

    URLs accepted by any of the more specific BBC extractors listed below
    are rejected; everything else is decided by the parent implementation.
    """
    more_specific_ies = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
    for ie in more_specific_ies:
        if ie.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
803
def _extract_from_media_meta(self, media_meta, video_id):
    """Return (formats, subtitles) for one media metadata dict.

    Sources are tried in this order: direct links under `sourceFiles`, the
    media selector for `externalId`, then the legacy playlist at `href`.
    With none of them present the result is a pair of empty lists.
    """
    # Direct links to media in media metadata (e.g.
    # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
    # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
    source_files = media_meta.get('sourceFiles')
    if source_files:
        direct_formats = []
        for format_id, f in source_files.items():
            if not f.get('url'):
                continue
            direct_formats.append({
                'url': f['url'],
                'format_id': format_id,
                'ext': f.get('encoding'),
                'tbr': float_or_none(f.get('bitrate'), 1000),
                'filesize': int_or_none(f.get('filesize')),
            })
        return direct_formats, []

    programme_id = media_meta.get('externalId')
    if programme_id:
        return self._download_media_selector(programme_id)

    # Process playlist.sxml as legacy playlist
    href = media_meta.get('href')
    if not href:
        return [], []

    playlist = self._download_legacy_playlist_url(href)
    # Only the last two members of the extracted tuple are needed here.
    (_programme_id, _title, _description, _duration,
     formats, subtitles) = self._extract_from_legacy_playlist(playlist, video_id)
    return formats, subtitles
830
baf39a1a
S
def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
    """Build an info dict from the legacy playlist (playlist.sxml) at `url`.

    Formats are sorted before being returned; `timestamp` is passed through
    unchanged.
    """
    legacy_info = self._process_legacy_playlist_url(url, playlist_id)
    programme_id, title, description, duration, formats, subtitles = legacy_info
    self._sort_formats(formats)

    info = {'id': programme_id}
    info['title'] = title
    info['description'] = description
    info['duration'] = duration
    info['timestamp'] = timestamp
    info['formats'] = formats
    info['subtitles'] = subtitles
    return info
844
def _real_extract(self, url):
    """Extract one video or a playlist of videos from a BBC page.

    The page is downloaded once and a series of embed styles is probed in
    order; the first one that yields media wins:

    1. playlist.sxml embeds and ``data-playable`` attributes
    2. ``data-pid`` video groups (delegated to BBCCoUkIE)
    3. a single vpid found in the markup
    4. Morph payloads
    5. ``window.__PRELOADED_STATE__``
    6. ``bbcthreeConfig``
    7. ``window.__INITIAL_DATA__``
    8. SMP / ``setPlaylist`` embeds (delegated to BBCCoUkIE)
    9. ``data-media-meta``, ``mediaAssetPage.init`` and finally the
       ``vxp-playlist-data`` script, which is mandatory when reached

    Returns either a single info dict or a playlist result.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title')
    if not playlist_title:
        playlist_title = self._og_search_title(
            webpage, default=None) or self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
        if playlist_title:
            # Strip the trailing " - BBC ..." site suffix
            playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
    if data_playables:
        for _, data_playable_json in data_playables:
            data_playable = self._parse_json(
                unescapeHTML(data_playable_json), playlist_id, fatal=False)
            if not data_playable:
                continue
            settings = data_playable.get('settings', {})
            if settings:
                # data-playable with video vpid in settings.playlistObject.items (e.g.
                # http://www.bbc.com/news/world-us-canada-34473351)
                playlist_object = settings.get('playlistObject', {})
                if playlist_object:
                    items = playlist_object.get('items')
                    if items and isinstance(items, list):
                        title = playlist_object['title']
                        description = playlist_object.get('summary')
                        duration = int_or_none(items[0].get('duration'))
                        programme_id = items[0].get('vpid')
                        formats, subtitles = self._download_media_selector(programme_id)
                        self._sort_formats(formats)
                        entries.append({
                            'id': programme_id,
                            'title': title,
                            'description': description,
                            'timestamp': timestamp,
                            'duration': duration,
                            'formats': formats,
                            'subtitles': subtitles,
                        })
                else:
                    # data-playable without vpid but with a playlist.sxml URLs
                    # in otherSettings.playlist (e.g.
                    # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
                    playlist = data_playable.get('otherSettings', {}).get('playlist', {})
                    if playlist:
                        entry = None
                        for key in ('streaming', 'progressiveDownload'):
                            playlist_url = playlist.get('%sUrl' % key)
                            if not playlist_url:
                                continue
                            try:
                                info = self._extract_from_playlist_sxml(
                                    playlist_url, playlist_id, timestamp)
                                if not entry:
                                    entry = info
                                else:
                                    # Merge the second playlist into the first entry
                                    entry['title'] = info['title']
                                    entry['formats'].extend(info['formats'])
                            except ExtractorError as e:
                                # Some playlist URL may fail with 500, at the same time
                                # the other one may work fine (e.g.
                                # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                                # Only ExtractorError is caught: other exception types
                                # are not guaranteed to have a `cause` attribute and
                                # must propagate untouched.
                                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
                                    continue
                                raise
                        if entry:
                            self._sort_formats(entry['formats'])
                            entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
    group_id = self._search_regex(
        r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
        webpage, 'group id', default=None)
    if group_id:
        return self.url_result(
            'https://www.bbc.co.uk/programmes/%s' % group_id,
            ie=BBCCoUkIE.ie_key())

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        self._sort_formats(formats)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # and, being parsed non-fatally, may also fail to parse; fall back
        # to an empty dict in both cases
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = digital_data.get('page', {}).get('pageInfo', {})
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # Several setPayload calls may be present but the video
    # seems to be always related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    preload_state = self._parse_json(self._search_regex(
        r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
        'preload state', default='{}'), playlist_id, fatal=False)
    if preload_state:
        current_programme = preload_state.get('programmes', {}).get('current') or {}
        programme_id = current_programme.get('id')
        if current_programme and programme_id and current_programme.get('type') == 'playable_item':
            # `or {}` also covers keys that are present but null
            titles = current_programme.get('titles') or {}
            title = titles.get('tertiary') or playlist_title
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            synopses = current_programme.get('synopses') or {}
            network = current_programme.get('network') or {}
            duration = int_or_none(
                (current_programme.get('duration') or {}).get('value'))
            thumbnail = None
            image_url = current_programme.get('image_url')
            if image_url:
                thumbnail = image_url.replace('{recipe}', '1920x1920')
            return {
                'id': programme_id,
                'title': title,
                'description': dict_get(synopses, ('long', 'medium', 'short')),
                'thumbnail': thumbnail,
                'duration': duration,
                'uploader': network.get('short_title'),
                'uploader_id': network.get('id'),
                'formats': formats,
                'subtitles': subtitles,
            }

    bbc3_config = self._parse_json(
        self._search_regex(
            r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
            'bbcthree config', default='{}'),
        playlist_id, transform_source=js_to_json, fatal=False) or {}
    payload = bbc3_config.get('payload') or {}
    if payload:
        clip = payload.get('currentClip') or {}
        clip_vpid = clip.get('vpid')
        clip_title = clip.get('title')
        if clip_vpid and clip_title:
            formats, subtitles = self._download_media_selector(clip_vpid)
            self._sort_formats(formats)
            return {
                'id': clip_vpid,
                'title': clip_title,
                'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
                'description': clip.get('description'),
                'duration': parse_duration(clip.get('duration')),
                'formats': formats,
                'subtitles': subtitles,
            }
        bbc3_playlist = try_get(
            payload, lambda x: x['content']['bbcMedia']['playlist'],
            dict)
        if bbc3_playlist:
            playlist_title = bbc3_playlist.get('title') or playlist_title
            thumbnail = bbc3_playlist.get('holdingImageURL')
            entries = []
            # A playlist without `items` yields an empty result instead of a KeyError
            for bbc3_item in bbc3_playlist.get('items') or []:
                programme_id = bbc3_item.get('versionID')
                if not programme_id:
                    continue
                formats, subtitles = self._download_media_selector(programme_id)
                self._sort_formats(formats)
                entries.append({
                    'id': programme_id,
                    'title': playlist_title,
                    'thumbnail': thumbnail,
                    'timestamp': timestamp,
                    'formats': formats,
                    'subtitles': subtitles,
                })
            return self.playlist_result(
                entries, playlist_id, playlist_title, playlist_description)

    initial_data = self._parse_json(self._search_regex(
        r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage,
        'preload state', default='{}'), playlist_id, fatal=False)
    if initial_data:
        def parse_media(media):
            # Appends one entry per playable item of `media` to `entries`
            if not media:
                return
            for item in (try_get(media, lambda x: x['media']['items'], list) or []):
                item_id = item.get('id')
                item_title = item.get('title')
                if not (item_id and item_title):
                    continue
                formats, subtitles = self._download_media_selector(item_id)
                self._sort_formats(formats)
                entries.append({
                    'id': item_id,
                    'title': item_title,
                    'thumbnail': item.get('holdingImageUrl'),
                    'formats': formats,
                    'subtitles': subtitles,
                })
        for resp in (initial_data.get('data') or {}).values():
            name = resp.get('name')
            if name == 'media-experience':
                parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
            elif name == 'article':
                for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []):
                    if block.get('type') != 'media':
                        continue
                    parse_media(block.get('model'))
        return self.playlist_result(
            entries, playlist_id, playlist_title, playlist_description)

    def extract_all(pattern):
        # Parse every regex match as JSON, silently dropping unparsable ones
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # Non-fatal parsing may return None; treat that as "no videos"
            media_asset_page = self._parse_json(
                media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in media_asset_page.get('videos', {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        # This is the last resort, hence both the search and the parse are fatal
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats:
            continue
        self._sort_formats(formats)

        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1232
1233
class BBCCoUkArticleIE(InfoExtractor):
    # Extracts the clips embedded in a bbc.co.uk/programmes/articles/ page
    # as a playlist of URL results.
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist made of every ``typeof="Clip"`` resource on the page."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # The description is optional metadata: only strip it when present
        # so that a page without og:description does not abort extraction.
        description = self._og_search_description(webpage)
        if description:
            description = description.strip()

        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1262
1263
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    # Shared logic for paginated bbc.co.uk listings. The code below reads
    # _VALID_URL, _URL_TEMPLATE and _VIDEO_ID_TEMPLATE and calls
    # _extract_title_and_description(webpage); none of these are defined here.

    def _entries(self, webpage, url, playlist_id):
        """Yield a URL result for every video id found, following pagination.

        When the URL carries an explicit ``page`` query parameter only that
        page is processed.
        """
        query = compat_urlparse.urlparse(url).query
        single_page = 'page' in compat_urlparse.parse_qs(query)
        id_pattern = self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX
        ie_key = BBCCoUkIE.ie_key
        # The page passed in is page 1, so the first one to fetch is page 2
        page_num = 2
        while True:
            for video_id in re.findall(id_pattern, webpage):
                yield self.url_result(self._URL_TEMPLATE % video_id, ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                return
            # NOTE(review): page_num is also passed as the fourth positional
            # argument - confirm which parameter of _download_webpage it binds to
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                'Downloading page %d' % page_num, page_num)
            page_num += 1

    def _real_extract(self, url):
        """Download the listing page and wrap its entries in a playlist result."""
        playlist_id = self._match_id(url)
        first_page = self._download_webpage(url, playlist_id)
        title, description = self._extract_title_and_description(first_page)
        entries = self._entries(first_page, url, playlist_id)
        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1294
1295
class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
    # iPlayer "episodes" and "group" listings; video ids are taken from
    # data-ip-id attributes and mapped onto iPlayer episode URLs.
    IE_NAME = 'bbc.co.uk:iplayer:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
    _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
    _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 6,
        'skip': 'This programme is not currently available on BBC iPlayer',
    }, {
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` read from the <h1> and subtitle <p>.

        Both lookups are non-fatal.
        """
        return (
            self._search_regex(
                r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False),
            self._search_regex(
                r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
                webpage, 'description', fatal=False, group='value'),
        )
1327
1328
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    # /programmes/<id>/(episodes|broadcasts|clips) listings; video ids are
    # taken from data-pid attributes and mapped onto programme URLs.
    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` taken from the page's Open Graph tags."""
        return (
            self._og_search_title(webpage, fatal=False),
            self._og_search_description(webpage),
        )