]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/bbc.py
[facebook] fix tahoe request(closes #17171)
[yt-dlp.git] / youtube_dl / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
9afa1770 4import re
254e64a2 5import itertools
082c6c86 6
f13b1e7d 7from .common import InfoExtractor
8683b4d8 8from ..utils import (
97067db2 9 clean_html,
9fb64c04 10 dict_get,
8683b4d8 11 ExtractorError,
9afa1770 12 float_or_none,
97067db2 13 get_element_by_class,
8683b4d8 14 int_or_none,
6d155707 15 js_to_json,
9afa1770
S
16 parse_duration,
17 parse_iso8601,
9fb64c04 18 try_get,
dab062fb 19 unescapeHTML,
97067db2
S
20 urlencode_postdata,
21 urljoin,
8683b4d8 22)
36e6f62c 23from ..compat import (
36e6f62c 24 compat_HTTPError,
254e64a2 25 compat_urlparse,
36e6f62c 26)
082c6c86 27
d12a1a47 28
f13b1e7d 29class BBCCoUkIE(InfoExtractor):
082c6c86 30 IE_NAME = 'bbc.co.uk'
2e3fd9ec 31 IE_DESC = 'BBC iPlayer'
6f356cbb 32 _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})'
f20a11ed
S
33 _VALID_URL = r'''(?x)
34 https?://
35 (?:www\.)?bbc\.co\.uk/
36 (?:
37 programmes/(?!articles/)|
38 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
72d256c4 39 music/(?:clips|audiovideo/popular)[/#]|
d3d45e0a
S
40 radio/player/|
41 events/[^/]+/play/[^/]+/
f20a11ed 42 )
ded7511a 43 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
f20a11ed 44 ''' % _ID_REGEX
082c6c86 45
97067db2
S
46 _LOGIN_URL = 'https://account.bbc.com/signin'
47 _NETRC_MACHINE = 'bbc'
48
d12a1a47 49 _MEDIASELECTOR_URLS = [
26ccc68b
S
50 # Provides HQ HLS streams with even better quality than pc mediaset but fails
51 # with geolocation in some cases when it's even not geo restricted at all (e.g.
d781e293 52 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
d1c694ea 53 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
54 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
55 ]
a8b081a0 56
e6174ee9
S
57 _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
58 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
59
60 _NAMESPACES = (
61 _MEDIASELECTION_NS,
62 _EMP_PLAYLIST_NS,
63 )
64
2e3fd9ec
S
65 _TESTS = [
66 {
f2d0fc68 67 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 68 'info_dict': {
f2d0fc68 69 'id': 'b039d07m',
b1ea6802 70 'ext': 'flv',
679bacf0 71 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4',
c4914185 72 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
73 },
74 'params': {
b1ea6802 75 # rtmp download
2e3fd9ec
S
76 'skip_download': True,
77 }
082c6c86 78 },
2e3fd9ec
S
79 {
80 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
81 'info_dict': {
82 'id': 'b00yng1d',
83 'ext': 'flv',
84 'title': 'The Man in Black: Series 3: The Printed Name',
85 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
86 'duration': 1800,
87 },
88 'params': {
89 # rtmp download
90 'skip_download': True,
c7f0177f
S
91 },
92 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
93 },
94 {
95 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
96 'info_dict': {
97 'id': 'b00yng1d',
98 'ext': 'flv',
17968e44 99 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 100 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 101 'duration': 5100,
2e3fd9ec
S
102 },
103 'params': {
104 # rtmp download
105 'skip_download': True,
106 },
b1ea6802 107 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
108 },
109 {
110 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
111 'info_dict': {
112 'id': 'b03k3pb7',
113 'ext': 'flv',
114 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
115 'description': '2. Invasion',
116 'duration': 3600,
117 },
118 'params': {
119 # rtmp download
120 'skip_download': True,
121 },
b1ea6802 122 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
123 }, {
124 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
125 'info_dict': {
126 'id': 'b04v209v',
127 'ext': 'flv',
128 'title': 'Pete Tong, The Essential New Tune Special',
129 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
130 'duration': 10800,
131 },
132 'params': {
133 # rtmp download
134 'skip_download': True,
a3ef0e1c
YCH
135 },
136 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 137 }, {
5aa535c3 138 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
139 'note': 'Audio',
140 'info_dict': {
5aa535c3 141 'id': 'p022h44j',
b1ea6802 142 'ext': 'flv',
5aa535c3
S
143 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
144 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
145 'duration': 227,
c7e67594
S
146 },
147 'params': {
b1ea6802 148 # rtmp download
c7e67594
S
149 'skip_download': True,
150 }
151 }, {
152 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
153 'note': 'Video',
154 'info_dict': {
155 'id': 'p025c103',
b1ea6802 156 'ext': 'flv',
c7e67594
S
157 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
158 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
159 'duration': 226,
160 },
161 'params': {
b1ea6802 162 # rtmp download
c7e67594
S
163 'skip_download': True,
164 }
e68ae99a
S
165 }, {
166 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
167 'info_dict': {
168 'id': 'p02n76xf',
169 'ext': 'flv',
170 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
171 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
172 'duration': 3540,
173 },
174 'params': {
175 # rtmp download
176 'skip_download': True,
177 },
b1ea6802 178 'skip': 'geolocation',
25fa8d66
YCH
179 }, {
180 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
181 'info_dict': {
182 'id': 'b05zmgw1',
183 'ext': 'flv',
184 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
185 'title': 'Royal Academy Summer Exhibition',
186 'duration': 3540,
187 },
188 'params': {
189 # rtmp download
190 'skip_download': True,
191 },
b1ea6802 192 'skip': 'geolocation',
54914380
S
193 }, {
194 # iptv-all mediaset fails with geolocation however there is no geo restriction
195 # for this programme at all
5aa535c3 196 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 197 'info_dict': {
5aa535c3 198 'id': 'b06rkms3',
54914380 199 'ext': 'flv',
5aa535c3
S
200 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
201 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
202 },
203 'params': {
204 # rtmp download
205 'skip_download': True,
206 },
b1ea6802 207 'skip': 'Now it\'s really geo-restricted',
1ac6e794
S
208 }, {
209 # compact player (https://github.com/rg3/youtube-dl/issues/8147)
210 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
211 'info_dict': {
212 'id': 'p028bfkj',
b1ea6802 213 'ext': 'flv',
1ac6e794
S
214 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
215 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
216 },
217 'params': {
b1ea6802 218 # rtmp download
1ac6e794
S
219 'skip_download': True,
220 },
31763975
S
221 }, {
222 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
223 'only_matching': True,
c7e67594
S
224 }, {
225 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
226 'only_matching': True,
0692ef86
S
227 }, {
228 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
229 'only_matching': True,
f20a11ed
S
230 }, {
231 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
232 'only_matching': True,
72d256c4
S
233 }, {
234 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
235 'only_matching': True,
53647dfd
S
236 }, {
237 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
238 'only_matching': True,
6f356cbb
S
239 }, {
240 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
241 'only_matching': True,
242 }, {
243 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
244 'only_matching': True,
72d256c4 245 }]
2e3fd9ec 246
97eb9bd2
RA
247 _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
248
97067db2
S
249 def _login(self):
250 username, password = self._get_login_info()
251 if username is None:
252 return
253
254 login_page = self._download_webpage(
255 self._LOGIN_URL, None, 'Downloading signin page')
256
257 login_form = self._hidden_inputs(login_page)
258
259 login_form.update({
260 'username': username,
261 'password': password,
262 })
263
264 post_url = urljoin(self._LOGIN_URL, self._search_regex(
265 r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
266 'post url', default=self._LOGIN_URL, group='url'))
267
268 response, urlh = self._download_webpage_handle(
269 post_url, None, 'Logging in', data=urlencode_postdata(login_form),
270 headers={'Referer': self._LOGIN_URL})
271
272 if self._LOGIN_URL in urlh.geturl():
273 error = clean_html(get_element_by_class('form-message', response))
274 if error:
275 raise ExtractorError(
276 'Unable to login: %s' % error, expected=True)
277 raise ExtractorError('Unable to log in')
278
279 def _real_initialize(self):
280 self._login()
281
d12a1a47
S
class MediaSelectionError(Exception):
    """Error carrying the identifier of a failed media selection."""

    def __init__(self, id):
        # Kept on the instance so handlers can branch on the identifier
        self.id = id
285
2e3fd9ec
S
286 def _extract_asx_playlist(self, connection, programme_id):
287 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
288 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
289
2e3fd9ec 290 def _extract_items(self, playlist):
e6174ee9
S
291 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
292
293 def _findall_ns(self, element, xpath):
294 elements = []
295 for ns in self._NAMESPACES:
296 elements.extend(element.findall(xpath % ns))
297 return elements
2e3fd9ec
S
298
299 def _extract_medias(self, media_selection):
e6174ee9
S
300 error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
301 if error is None:
302 media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
c056efa2 303 if error is not None:
d12a1a47 304 raise BBCCoUkIE.MediaSelectionError(error.get('id'))
e6174ee9 305 return self._findall_ns(media_selection, './{%s}media')
2e3fd9ec
S
306
307 def _extract_connections(self, media):
e6174ee9 308 return self._findall_ns(media, './{%s}connection')
2e3fd9ec 309
f13b1e7d 310 def _get_subtitles(self, media, programme_id):
2e3fd9ec
S
311 subtitles = {}
312 for connection in self._extract_connections(media):
313 captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
314 lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
f13b1e7d
JMF
315 subtitles[lang] = [
316 {
317 'url': connection.get('href'),
318 'ext': 'ttml',
319 },
f13b1e7d 320 ]
2e3fd9ec 321 return subtitles
082c6c86 322
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Raise an expected ExtractorError naming the media selection error id."""
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
327
c056efa2 328 def _download_media_selector(self, programme_id):
d12a1a47
S
329 last_exception = None
330 for mediaselector_url in self._MEDIASELECTOR_URLS:
331 try:
332 return self._download_media_selector_url(
333 mediaselector_url % programme_id, programme_id)
334 except BBCCoUkIE.MediaSelectionError as e:
d781e293 335 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
d12a1a47
S
336 last_exception = e
337 continue
338 self._raise_extractor_error(e)
339 self._raise_extractor_error(last_exception)
9afa1770
S
340
341 def _download_media_selector_url(self, url, programme_id=None):
9283d4ea
S
342 media_selection = self._download_xml(
343 url, programme_id, 'Downloading media selection XML',
344 expected_status=(403, 404))
9afa1770 345 return self._process_media_selector(media_selection, programme_id)
082c6c86 346
def _process_media_selector(self, media_selection, programme_id):
    """Turn a media selection document into ``(formats, subtitles)``.

    Every video/audio <media> element contributes formats for each of its
    <connection> children: ASX playlist references, DASH/HLS/HDS manifest
    formats, or a single direct HTTP(S)/RTMP format. A captions <media>
    element supplies the subtitles (the last one seen wins; ``None`` when
    there is none).

    Connections whose href was already processed are skipped. Connections
    that need a URL (playlist, manifest or direct HTTP) but carry no href,
    and connections with an unsupported protocol, are skipped as well.
    """
    formats = []
    subtitles = None
    # A set keeps the duplicate-href check constant-time
    seen_hrefs = set()

    for media in self._extract_medias(media_selection):
        kind = media.get('kind')
        if kind in ('video', 'audio'):
            bitrate = int_or_none(media.get('bitrate'))
            encoding = media.get('encoding')
            service = media.get('service')
            width = int_or_none(media.get('width'))
            height = int_or_none(media.get('height'))
            file_size = int_or_none(media.get('media_file_size'))
            for connection in self._extract_connections(media):
                href = connection.get('href')
                if href:
                    if href in seen_hrefs:
                        continue
                    seen_hrefs.add(href)
                conn_kind = connection.get('kind')
                protocol = connection.get('protocol')
                supplier = connection.get('supplier')
                transfer_format = connection.get('transferFormat')
                format_id = supplier or conn_kind or protocol
                if service:
                    format_id = '%s_%s' % (service, format_id)

                needs_download = (
                    supplier == 'asx'
                    or transfer_format in ('dash', 'hls', 'hds'))
                if needs_download and not href:
                    # Nothing to fetch for a playlist/manifest without a URL
                    continue

                # ASX playlist
                if supplier == 'asx':
                    for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
                        formats.append({
                            'url': ref,
                            'format_id': 'ref%s_%s' % (i, format_id),
                        })
                elif transfer_format == 'dash':
                    formats.extend(self._extract_mpd_formats(
                        href, programme_id, mpd_id=format_id, fatal=False))
                elif transfer_format == 'hls':
                    formats.extend(self._extract_m3u8_formats(
                        href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                        m3u8_id=format_id, fatal=False))
                    if re.search(self._USP_RE, href):
                        # Also try the playlist URL rewritten by _USP_RE;
                        # formats taller than 720 from it are dropped.
                        usp_formats = self._extract_m3u8_formats(
                            re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href),
                            programme_id, ext='mp4', entry_protocol='m3u8_native',
                            m3u8_id=format_id, fatal=False)
                        for f in usp_formats:
                            if f.get('height') and f['height'] > 720:
                                continue
                            formats.append(f)
                elif transfer_format == 'hds':
                    formats.extend(self._extract_f4m_formats(
                        href, programme_id, f4m_id=format_id, fatal=False))
                else:
                    # Reject unusable connections before touching format_id:
                    # with no supplier, kind or protocol it would be None.
                    if protocol in ('http', 'https'):
                        if not href:
                            continue
                    elif protocol != 'rtmp':
                        continue
                    if not service and not supplier and bitrate:
                        format_id += '-%d' % bitrate
                    fmt = {
                        'format_id': format_id,
                        'filesize': file_size,
                    }
                    if kind == 'video':
                        fmt.update({
                            'width': width,
                            'height': height,
                            'tbr': bitrate,
                            'vcodec': encoding,
                        })
                    else:
                        fmt.update({
                            'abr': bitrate,
                            'acodec': encoding,
                            'vcodec': 'none',
                        })
                    if protocol in ('http', 'https'):
                        # Direct link
                        fmt.update({
                            'url': href,
                        })
                    else:
                        # RTMP: the URL is assembled from connection attributes
                        application = connection.get('application', 'ondemand')
                        auth_string = connection.get('authString')
                        identifier = connection.get('identifier')
                        server = connection.get('server')
                        fmt.update({
                            'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                            'play_path': identifier,
                            'app': '%s?%s' % (application, auth_string),
                            'page_url': 'http://www.bbc.co.uk',
                            'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                            'rtmp_live': False,
                            'ext': 'flv',
                        })
                    formats.append(fmt)
        elif kind == 'captions':
            subtitles = self.extract_subtitles(media, programme_id)
    return formats, subtitles
2e3fd9ec 445
ae6986fb
S
def _download_playlist(self, playlist_id):
    """Resolve *playlist_id* via the JSON playlist, else the legacy XML playlist.

    Returns a ``(programme_id, title, description, duration, formats,
    subtitles)`` tuple. The legacy playlist is used when the JSON document
    has no ``defaultAvailableVersion`` or when the JSON request fails with
    HTTP 404; any other ExtractorError is re-raised.
    """
    try:
        playlist = self._download_json(
            'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
            playlist_id, 'Downloading playlist JSON')

        version = playlist.get('defaultAvailableVersion')
        if version:
            smp_config = version['smpConfig']
            title = smp_config['title']
            description = smp_config['summary']
            for item in smp_config['items']:
                kind = item['kind']
                # Only programme items are playable; skip everything else
                if kind not in ('programme', 'radioProgramme'):
                    continue
                programme_id = item.get('vpid')
                duration = int_or_none(item.get('duration'))
                formats, subtitles = self._download_media_selector(programme_id)
            # NOTE(review): the indentation of this return could not be
            # recovered from this view; it is placed after the loop here,
            # so the last programme item wins - confirm against the file.
            return programme_id, title, description, duration, formats, subtitles
    except ExtractorError as ee:
        # A 404 only means there is no JSON playlist; fall through
        if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
            raise

    # fallback to legacy playlist
    return self._process_legacy_playlist(playlist_id)
471
472 def _process_legacy_playlist_url(self, url, display_id):
473 playlist = self._download_legacy_playlist_url(url, display_id)
474 return self._extract_from_legacy_playlist(playlist, display_id)
475
476 def _process_legacy_playlist(self, playlist_id):
477 return self._process_legacy_playlist_url(
478 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
479
480 def _download_legacy_playlist_url(self, url, playlist_id=None):
481 return self._download_xml(
482 url, playlist_id, 'Downloading legacy playlist XML')
ae6986fb 483
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme info from a legacy (EMP) XML playlist.

    Returns ``(programme_id, title, description, duration, formats,
    subtitles)`` for the last programme/radioProgramme item in *playlist*.

    Raises an expected ExtractorError when the playlist carries a
    <noItems> element, and an ExtractorError when it contains no
    programme item at all.
    """
    no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First attribute whose value looks like a programme id, else None
        for attr in ('identifier', 'group'):
            value = element.get(attr)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value

    def get_programme_id(item):
        # The id found on the item itself used to be computed and then
        # discarded, so only the <mediator> child was ever consulted.
        programme_id = get_from_attributes(item)
        if programme_id:
            return programme_id
        mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
        if mediator is not None:
            return get_from_attributes(mediator)

    result = None
    for item in self._extract_items(playlist):
        kind = item.get('kind')
        if kind not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
        description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No usable id: the item itself is processed as a media selection
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

        result = programme_id, title, description, duration, formats, subtitles

    if result is None:
        # Previously this ended in an unbound-local error at the return
        raise ExtractorError(
            'No programme items found in playlist %s' % playlist_id)
    return result
527
c056efa2
S
def _real_extract(self, url):
    """Extract a single programme from a bbc.co.uk programme/iPlayer page.

    The programme id (vpid) is read from the player configuration embedded
    in the page or, failing that, from any ``"vpid"`` value in the page;
    if neither is found the playlist for the id in the URL is used instead.
    Raises an expected ExtractorError when the page shows a player message.
    """
    group_id = self._match_id(url)
    webpage = self._download_webpage(url, group_id, 'Downloading video page')

    page_error = self._search_regex(
        r'<div\b[^>]+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<',
        webpage, 'error', default=None)
    if page_error:
        raise ExtractorError(page_error, expected=True)

    programme_id = duration = None

    player_json = self._search_regex(
        r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
        webpage, 'player', default=None)
    if player_json:
        player = self._parse_json(player_json, group_id).get('player', {})
        duration = int_or_none(player.get('duration'))
        programme_id = player.get('vpid')

    if not programme_id:
        vpid_pattern = r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX
        programme_id = self._search_regex(
            vpid_pattern, webpage, 'vpid', fatal=False, default=None)

    if not programme_id:
        # No vpid on the page: let the playlist provide everything
        (programme_id, title, description,
         duration, formats, subtitles) = self._download_playlist(group_id)
    else:
        formats, subtitles = self._download_media_selector(programme_id)

        title = self._og_search_title(webpage, default=None)
        if not title:
            title = self._html_search_regex(
                (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
                 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')

        description = self._search_regex(
            (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
             r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
            webpage, 'description', default=None) or self._html_search_meta(
                'description', webpage)

    self._sort_formats(formats)

    info = {
        'id': programme_id,
        'title': title,
        'description': description,
        'thumbnail': self._og_search_thumbnail(webpage, default=None),
        'duration': duration,
        'formats': formats,
        'subtitles': subtitles,
    }
    return info
10273d6e 580
581
9afa1770
S
582class BBCIE(BBCCoUkIE):
583 IE_NAME = 'bbc'
584 IE_DESC = 'BBC'
585 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 586
d12a1a47 587 _MEDIASELECTOR_URLS = [
55ebae26
S
588 # Provides HQ HLS streams but fails with geolocation in some cases when it's
589 # even not geo restricted at all
590 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
591 # Provides more formats, namely direct mp4 links, but fails on some videos with
592 # notukerror for non UK (?) users (e.g.
593 # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
594 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
595 # Provides fewer formats, but works everywhere for everybody (hopefully)
596 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
597 ]
10273d6e 598
599 _TESTS = [{
6a747190 600 # article with multiple videos embedded with data-playable containing vpids
10273d6e 601 'url': 'http://www.bbc.com/news/world-europe-32668511',
602 'info_dict': {
603 'id': 'world-europe-32668511',
604 'title': 'Russia stages massive WW2 parade despite Western boycott',
9afa1770 605 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 606 },
607 'playlist_count': 2,
a3bfddfa 608 }, {
6a747190 609 # article with multiple videos embedded with data-playable (more videos)
10273d6e 610 'url': 'http://www.bbc.com/news/business-28299555',
611 'info_dict': {
612 'id': 'business-28299555',
613 'title': 'Farnborough Airshow: Video highlights',
9afa1770 614 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 615 },
616 'playlist_count': 9,
9afa1770 617 'skip': 'Save time',
88ed52ae
S
618 }, {
619 # article with multiple videos embedded with `new SMP()`
6a747190 620 # broken
88ed52ae
S
621 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
622 'info_dict': {
623 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 624 'title': 'BUGGER',
88ed52ae
S
625 },
626 'playlist_count': 18,
a3bfddfa 627 }, {
6a747190 628 # single video embedded with data-playable containing vpid
10273d6e 629 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 630 'info_dict': {
631 'id': 'p02mprgb',
55ebae26 632 'ext': 'mp4',
10273d6e 633 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 634 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 635 'duration': 47,
9afa1770 636 'timestamp': 1427219242,
da92eeae 637 'upload_date': '20150324',
10273d6e 638 },
639 'params': {
9afa1770 640 # rtmp download
10273d6e 641 'skip_download': True,
642 }
a3bfddfa 643 }, {
6a747190
S
644 # article with single video embedded with data-playable containing XML playlist
645 # with direct video links as progressiveDownloadUrl (for now these are extracted)
646 # and playlist with f4m and m3u8 as streamingUrl
de939d89 647 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 648 'info_dict': {
9afa1770 649 'id': '150615_telabyad_kentin_cogu',
de939d89 650 'ext': 'mp4',
ad152e2d 651 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 652 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 653 'timestamp': 1434397334,
da92eeae 654 'upload_date': '20150615',
de939d89 655 },
656 'params': {
657 'skip_download': True,
658 }
c936d8cc 659 }, {
6a747190 660 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 661 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 662 'info_dict': {
9afa1770 663 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 664 'ext': 'mp4',
9afa1770 665 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 666 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 667 'timestamp': 1434713142,
da92eeae 668 'upload_date': '20150619',
de939d89 669 },
670 'params': {
671 'skip_download': True,
672 }
a346b1ff
S
673 }, {
674 # single video from video playlist embedded with vxp-playlist-data JSON
675 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
676 'info_dict': {
677 'id': 'p02w6qjc',
55ebae26 678 'ext': 'mp4',
a346b1ff
S
679 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
680 'duration': 56,
0bc4ee60 681 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
682 },
683 'params': {
684 'skip_download': True,
685 }
9afa1770
S
686 }, {
687 # single video story with digitalData
688 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
689 'info_dict': {
690 'id': 'p02q6gc4',
691 'ext': 'flv',
692 'title': 'Sri Lanka’s spicy secret',
693 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
694 'timestamp': 1437674293,
695 'upload_date': '20150723',
696 },
697 'params': {
698 # rtmp download
699 'skip_download': True,
700 }
701 }, {
702 # single video story without digitalData
703 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
704 'info_dict': {
705 'id': 'p018zqqg',
55ebae26 706 'ext': 'mp4',
9afa1770
S
707 'title': 'Hyundai Santa Fe Sport: Rock star',
708 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
709 'timestamp': 1415867444,
710 'upload_date': '20141113',
9afa1770
S
711 },
712 'params': {
713 # rtmp download
714 'skip_download': True,
715 }
9fb64c04
S
716 }, {
717 # single video embedded with Morph
718 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
719 'info_dict': {
720 'id': 'p041vhd0',
721 'ext': 'mp4',
722 'title': "Nigeria v Japan - Men's First Round",
723 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
724 'duration': 7980,
725 'uploader': 'BBC Sport',
726 'uploader_id': 'bbc_sport',
727 },
728 'params': {
729 # m3u8 download
730 'skip_download': True,
9fb64c04
S
731 },
732 'skip': 'Georestricted to UK',
9afa1770 733 }, {
6a747190 734 # single video with playlist.sxml URL in playlist param
9afa1770
S
735 'url': 'http://www.bbc.com/sport/0/football/33653409',
736 'info_dict': {
737 'id': 'p02xycnp',
55ebae26 738 'ext': 'mp4',
9afa1770 739 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 740 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
741 'duration': 140,
742 },
743 'params': {
744 # rtmp download
745 'skip_download': True,
746 }
b5d48cb1 747 }, {
6a747190 748 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
749 'url': 'http://www.bbc.com/sport/0/football/34475836',
750 'info_dict': {
751 'id': '34475836',
450b233c 752 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 753 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
754 },
755 'playlist_count': 3,
450b233c
S
756 }, {
757 # school report article with single video
758 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
759 'info_dict': {
760 'id': '35744779',
761 'title': 'School which breaks down barriers in Jerusalem',
762 },
763 'playlist_count': 1,
9afa1770
S
764 }, {
765 # single video with playlist URL from weather section
766 'url': 'http://www.bbc.com/weather/features/33601775',
767 'only_matching': True,
768 }, {
769 # custom redirection to www.bbc.com
770 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
771 'only_matching': True,
a1cf3e38
S
772 }, {
773 # single video article embedded with data-media-vpid
774 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
775 'only_matching': True,
6d155707
S
776 }, {
777 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
778 'info_dict': {
779 'id': 'p06556y7',
780 'ext': 'mp4',
781 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
782 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd',
783 },
784 'params': {
785 'skip_download': True,
786 }
b96b4be4
RA
787 }, {
788 # window.__PRELOADED_STATE__
789 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
790 'info_dict': {
791 'id': 'b0b9z4vz',
792 'ext': 'mp4',
793 'title': 'Prom 6: An American in Paris and Turangalila',
794 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
795 'uploader': 'Radio 3',
796 'uploader_id': 'bbc_radio_three',
797 },
10273d6e 798 }]
799
9afa1770
S
@classmethod
def suitable(cls, url):
    """Accept *url* only if none of the more specific BBC extractors does."""
    EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
    for ie in EXCLUDE_IE:
        if ie.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
805
806 def _extract_from_media_meta(self, media_meta, video_id):
807 # Direct links to media in media metadata (e.g.
808 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
809 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
810 source_files = media_meta.get('sourceFiles')
811 if source_files:
812 return [{
813 'url': f['url'],
814 'format_id': format_id,
815 'ext': f.get('encoding'),
816 'tbr': float_or_none(f.get('bitrate'), 1000),
817 'filesize': int_or_none(f.get('filesize')),
818 } for format_id, f in source_files.items() if f.get('url')], []
819
820 programme_id = media_meta.get('externalId')
821 if programme_id:
822 return self._download_media_selector(programme_id)
823
824 # Process playlist.sxml as legacy playlist
825 href = media_meta.get('href')
826 if href:
827 playlist = self._download_legacy_playlist_url(href)
828 _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
829 return formats, subtitles
830
831 return [], []
832
baf39a1a
S
def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
    """Turn a ``playlist.sxml`` URL into an info dict.

    The legacy playlist at *url* is processed, its formats are sorted,
    and the result is returned together with the caller-supplied
    *timestamp*.
    """
    legacy_info = self._process_legacy_playlist_url(url, playlist_id)
    programme_id, title, description, duration, formats, subtitles = legacy_info
    self._sort_formats(formats)
    info = {
        'id': programme_id,
        'title': title,
        'description': description,
        'duration': duration,
        'timestamp': timestamp,
        'formats': formats,
        'subtitles': subtitles,
    }
    return info
846
def _real_extract(self, url):
    """Extract the media embedded in a generic BBC page.

    The page is downloaded once and then probed with a series of
    strategies, in this order; the first one that yields media wins:

    1. ``playlist.sxml`` embeds and ``data-playable`` attributes
    2. a single vpid referenced directly in the markup
    3. a ``Morph.setPayload`` embed
    4. ``window.__PRELOADED_STATE__``
    5. ``bbcthreeConfig``
    6. ``new SMP(...)`` / ``setPlaylist(...)`` embed URLs
    7. ``data-media-meta`` / ``mediaAssetPage.init`` / vxp playlist data

    Returns either a single-video info dict or a playlist result.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title')
    if not playlist_title:
        playlist_title = self._og_search_title(
            webpage, default=None) or self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
        if playlist_title:
            # Drop the trailing " - BBC ..." site suffix
            playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    entries = [
        self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
        for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    for _, data_playable_json in re.findall(r'data-playable=(["\'])({.+?})\1', webpage):
        data_playable = self._parse_json(
            unescapeHTML(data_playable_json), playlist_id, fatal=False)
        if not data_playable:
            continue
        settings = data_playable.get('settings', {})
        if settings:
            # data-playable with video vpid in settings.playlistObject.items (e.g.
            # http://www.bbc.com/news/world-us-canada-34473351)
            playlist_object = settings.get('playlistObject', {})
            if not playlist_object:
                continue
            items = playlist_object.get('items')
            if not items or not isinstance(items, list):
                continue
            first_item = items[0]
            programme_id = first_item.get('vpid') if isinstance(first_item, dict) else None
            if not programme_id:
                # Without a vpid there is nothing to ask the media selector for
                continue
            # Fall back to the page title instead of failing on a missing key
            title = playlist_object.get('title') or playlist_title
            description = playlist_object.get('summary')
            duration = int_or_none(first_item.get('duration'))
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            entries.append({
                'id': programme_id,
                'title': title,
                'description': description,
                'timestamp': timestamp,
                'duration': duration,
                'formats': formats,
                'subtitles': subtitles,
            })
        else:
            # data-playable without vpid but with a playlist.sxml URLs
            # in otherSettings.playlist (e.g.
            # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
            playlist = data_playable.get('otherSettings', {}).get('playlist', {})
            if not playlist:
                continue
            entry = None
            for key in ('streaming', 'progressiveDownload'):
                playlist_url = playlist.get('%sUrl' % key)
                if not playlist_url:
                    continue
                try:
                    info = self._extract_from_playlist_sxml(
                        playlist_url, playlist_id, timestamp)
                except ExtractorError as e:
                    # Some playlist URL may fail with 500, at the same time
                    # the other one may work fine (e.g.
                    # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                    # Only ExtractorError is caught: other exception types
                    # have no ``cause`` attribute and must propagate as-is.
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
                        continue
                    raise
                if not entry:
                    entry = info
                else:
                    # Merge the second playlist's formats into the first entry
                    entry['title'] = info['title']
                    entry['formats'].extend(info['formats'])
            if entry:
                self._sort_formats(entry['formats'])
                entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        self._sort_formats(formats)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False)
        # The non-fatal parse may yield nothing usable; traverse defensively
        page_info = try_get(digital_data, lambda x: x['page']['pageInfo'], dict) or {}
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # Several setPayload calls may be present, but the video seems to
    # always be related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    preload_state = self._parse_json(self._search_regex(
        r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
        'preload state', default='{}'), playlist_id, fatal=False)
    if preload_state:
        # try_get tolerates keys that are present but null or not dicts
        current_programme = try_get(
            preload_state, lambda x: x['programmes']['current'], dict) or {}
        programme_id = current_programme.get('id')
        if current_programme and programme_id and current_programme.get('type') == 'playable_item':
            title = try_get(
                current_programme, lambda x: x['titles']['tertiary']) or playlist_title
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            synopses = current_programme.get('synopses') or {}
            network = current_programme.get('network') or {}
            duration = int_or_none(try_get(
                current_programme, lambda x: x['duration']['value']))
            thumbnail = None
            image_url = current_programme.get('image_url')
            if image_url:
                # The image URL is a template; fill in a concrete size
                thumbnail = image_url.replace('{recipe}', '1920x1920')
            return {
                'id': programme_id,
                'title': title,
                'description': dict_get(synopses, ('long', 'medium', 'short')),
                'thumbnail': thumbnail,
                'duration': duration,
                'uploader': network.get('short_title'),
                'uploader_id': network.get('id'),
                'formats': formats,
                'subtitles': subtitles,
            }

    bbc3_config = self._parse_json(
        self._search_regex(
            r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
            'bbcthree config', default='{}'),
        playlist_id, transform_source=js_to_json, fatal=False)
    if bbc3_config:
        bbc3_playlist = try_get(
            bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'],
            dict)
        if bbc3_playlist:
            playlist_title = bbc3_playlist.get('title') or playlist_title
            thumbnail = bbc3_playlist.get('holdingImageURL')
            entries = []
            # A missing or malformed item list yields an empty playlist
            # rather than a KeyError
            bbc3_items = try_get(bbc3_playlist, lambda x: x['items'], list) or []
            for bbc3_item in bbc3_items:
                if not isinstance(bbc3_item, dict):
                    continue
                programme_id = bbc3_item.get('versionID')
                if not programme_id:
                    continue
                formats, subtitles = self._download_media_selector(programme_id)
                self._sort_formats(formats)
                entries.append({
                    'id': programme_id,
                    'title': playlist_title,
                    'thumbnail': thumbnail,
                    'timestamp': timestamp,
                    'formats': formats,
                    'subtitles': subtitles,
                })
            return self.playlist_result(
                entries, playlist_id, playlist_title, playlist_description)

    def extract_all(pattern):
        # Parse every match of pattern as JSON, dropping the ones that fail
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
            medias = []
            # The non-fatal parse may have failed; treat that as "no videos"
            videos = try_get(media_asset_page, lambda x: x['videos'], dict) or {}
            for video in videos.values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats:
            continue
        self._sort_formats(formats)

        video_id = media_meta.get('externalId')
        if not video_id:
            # Synthesize an id; number it only when there are several videos
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1176
1177
class BBCCoUkArticleIE(InfoExtractor):
    """Extractor for bbc.co.uk programme articles.

    The article page is returned as a playlist whose entries are the
    clips embedded in it.
    """
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Download the article and return its embedded clips as a playlist."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        description = self._og_search_description(webpage)
        # The description is optional metadata; other callers in this file
        # treat this helper's result as possibly empty, so only strip a
        # value that is actually there instead of crashing on None.
        if description:
            description = description.strip()

        # Each clip is marked up as <div typeof="Clip" resource="URL">
        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1206
1207
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared logic for paginated bbc.co.uk playlist pages.

    Subclasses supply ``_URL_TEMPLATE``, ``_VIDEO_ID_TEMPLATE`` and
    ``_extract_title_and_description``.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a URL result for every video id found, page after page.

        If *url* carries an explicit ``page`` query parameter only that
        page is processed; otherwise "next" links are followed until none
        is left.
        """
        query = compat_urlparse.urlparse(url).query
        single_page = 'page' in compat_urlparse.parse_qs(query)
        # The page that was already downloaded is page 1, so the first
        # follow-up request is for page 2.
        page_counter = itertools.count(2)
        while True:
            page_num = next(page_counter)
            id_pattern = self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX
            for video_id in re.findall(id_pattern, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                return
            next_url = compat_urlparse.urljoin(url, next_page)
            # NOTE(review): page_num is also passed as the fourth positional
            # argument, exactly as before; confirm which parameter it binds to.
            webpage = self._download_webpage(
                next_url, playlist_id,
                'Downloading page %d' % page_num, page_num)

    def _real_extract(self, url):
        """Return a playlist of all videos listed at *url*."""
        playlist_id = self._match_id(url)
        first_page = self._download_webpage(url, playlist_id)
        title, description = self._extract_title_and_description(first_page)
        entries = self._entries(first_page, url, playlist_id)
        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1238
1239
class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for iPlayer ``episodes`` and ``group`` pages."""
    IE_NAME = 'bbc.co.uk:iplayer:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
    # Episode URL built for every id found on the page
    _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
    # Pattern locating the ids; %s is filled with the programme id regex
    _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 6,
        'skip': 'This programme is not currently available on BBC iPlayer',
    }, {
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` scraped from the page markup.

        Both lookups are non-fatal.
        """
        return (
            # Title: contents of the <h1> heading
            self._search_regex(r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False),
            # Description: the paragraph with class "subtitle"
            self._search_regex(
                r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
                webpage, 'description', fatal=False, group='value'),
        )
1271
1272
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for programme episodes, broadcasts and clips pages."""
    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Programme URL built for every id found on the page
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    # Pattern locating the ids; %s is filled with the programme id regex
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` taken from the page's Open Graph tags."""
        og_title = self._og_search_title(webpage, fatal=False)
        og_description = self._og_search_description(webpage)
        return og_title, og_description