]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/bbc.py
[fragments] Pad fragments before decrypting (#1298)
[yt-dlp.git] / yt_dlp / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
1418a043 4import functools
254e64a2 5import itertools
1418a043 6import json
f0228f56 7import re
082c6c86 8
f13b1e7d 9from .common import InfoExtractor
3721515b 10from ..compat import (
11 compat_etree_Element,
12 compat_HTTPError,
1bdae7d3 13 compat_str,
3721515b 14 compat_urlparse,
15)
8683b4d8 16from ..utils import (
3721515b 17 ExtractorError,
1418a043 18 OnDemandPagedList,
97067db2 19 clean_html,
9fb64c04 20 dict_get,
9afa1770 21 float_or_none,
97067db2 22 get_element_by_class,
8683b4d8 23 int_or_none,
6d155707 24 js_to_json,
9afa1770
S
25 parse_duration,
26 parse_iso8601,
4dfbf869 27 parse_qs,
1bdae7d3 28 strip_or_none,
9fb64c04 29 try_get,
dab062fb 30 unescapeHTML,
1bdae7d3 31 unified_timestamp,
f0228f56 32 url_or_none,
97067db2
S
33 urlencode_postdata,
34 urljoin,
8683b4d8 35)
082c6c86 36
d12a1a47 37
class BBCCoUkIE(InfoExtractor):
    # Extractor for programme/episode/clip pages on bbc.co.uk (see _VALID_URL).
    IE_NAME = 'bbc.co.uk'
    IE_DESC = 'BBC iPlayer'
    # Programme/version id: p/b/m followed by 7 characters, or w followed by 7-14.
    _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})'
    # Verbose regex; the negative lookahead after the id rejects the
    # /episodes, /broadcasts and /clips listing pages.
    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?bbc\.co\.uk/
                        (?:
                            programmes/(?!articles/)|
                            iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
                            music/(?:clips|audiovideo/popular)[/#]|
                            radio/player/|
                            sounds/play/|
                            events/[^/]+/play/[^/]+/
                        )
                        (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
                    ''' % _ID_REGEX

    # Sign-in page used by _login().
    _LOGIN_URL = 'https://account.bbc.com/signin'
    _NETRC_MACHINE = 'bbc'

    # Filled with (media set, programme id) by _download_media_selector().
    _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
    # Media sets are tried in this order by _download_media_selector().
    _MEDIA_SETS = [
        # Provides HQ HLS streams with even better quality than the pc mediaset but fails
        # with geolocation in some cases even when the programme is not geo restricted at all (e.g.
        # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
        'iptv-all',
        'pc',
    ]

    # XML namespace of the legacy EMP playlist documents.
    _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
69
2e3fd9ec
S
70 _TESTS = [
71 {
f2d0fc68 72 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 73 'info_dict': {
f2d0fc68 74 'id': 'b039d07m',
b1ea6802 75 'ext': 'flv',
acc86c9a 76 'title': 'Kaleidoscope, Leonard Cohen',
c4914185 77 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
78 },
79 'params': {
b1ea6802 80 # rtmp download
2e3fd9ec
S
81 'skip_download': True,
82 }
082c6c86 83 },
2e3fd9ec
S
84 {
85 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
86 'info_dict': {
87 'id': 'b00yng1d',
88 'ext': 'flv',
89 'title': 'The Man in Black: Series 3: The Printed Name',
90 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
91 'duration': 1800,
92 },
93 'params': {
94 # rtmp download
95 'skip_download': True,
c7f0177f
S
96 },
97 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
98 },
99 {
100 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
101 'info_dict': {
102 'id': 'b00yng1d',
103 'ext': 'flv',
17968e44 104 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 105 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 106 'duration': 5100,
2e3fd9ec
S
107 },
108 'params': {
109 # rtmp download
110 'skip_download': True,
111 },
b1ea6802 112 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
113 },
114 {
115 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
116 'info_dict': {
117 'id': 'b03k3pb7',
118 'ext': 'flv',
119 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
120 'description': '2. Invasion',
121 'duration': 3600,
122 },
123 'params': {
124 # rtmp download
125 'skip_download': True,
126 },
b1ea6802 127 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
128 }, {
129 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
130 'info_dict': {
131 'id': 'b04v209v',
132 'ext': 'flv',
133 'title': 'Pete Tong, The Essential New Tune Special',
134 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
135 'duration': 10800,
136 },
137 'params': {
138 # rtmp download
139 'skip_download': True,
a3ef0e1c
YCH
140 },
141 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 142 }, {
5aa535c3 143 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
144 'note': 'Audio',
145 'info_dict': {
5aa535c3 146 'id': 'p022h44j',
b1ea6802 147 'ext': 'flv',
5aa535c3
S
148 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
149 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
150 'duration': 227,
c7e67594
S
151 },
152 'params': {
b1ea6802 153 # rtmp download
c7e67594
S
154 'skip_download': True,
155 }
156 }, {
157 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
158 'note': 'Video',
159 'info_dict': {
160 'id': 'p025c103',
b1ea6802 161 'ext': 'flv',
c7e67594
S
162 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
163 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
164 'duration': 226,
165 },
166 'params': {
b1ea6802 167 # rtmp download
c7e67594
S
168 'skip_download': True,
169 }
e68ae99a
S
170 }, {
171 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
172 'info_dict': {
173 'id': 'p02n76xf',
174 'ext': 'flv',
175 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
176 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
177 'duration': 3540,
178 },
179 'params': {
180 # rtmp download
181 'skip_download': True,
182 },
b1ea6802 183 'skip': 'geolocation',
25fa8d66
YCH
184 }, {
185 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
186 'info_dict': {
187 'id': 'b05zmgw1',
188 'ext': 'flv',
189 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
190 'title': 'Royal Academy Summer Exhibition',
191 'duration': 3540,
192 },
193 'params': {
194 # rtmp download
195 'skip_download': True,
196 },
b1ea6802 197 'skip': 'geolocation',
54914380
S
198 }, {
199 # iptv-all mediaset fails with geolocation however there is no geo restriction
200 # for this programme at all
5aa535c3 201 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 202 'info_dict': {
5aa535c3 203 'id': 'b06rkms3',
54914380 204 'ext': 'flv',
5aa535c3
S
205 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
206 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
207 },
208 'params': {
209 # rtmp download
210 'skip_download': True,
211 },
b1ea6802 212 'skip': 'Now it\'s really geo-restricted',
1ac6e794 213 }, {
067aa17e 214 # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
1ac6e794
S
215 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
216 'info_dict': {
217 'id': 'p028bfkj',
b1ea6802 218 'ext': 'flv',
1ac6e794
S
219 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
220 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
221 },
222 'params': {
b1ea6802 223 # rtmp download
1ac6e794
S
224 'skip_download': True,
225 },
b72305f0
J
226 }, {
227 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
228 'note': 'Audio',
229 'info_dict': {
230 'id': 'm0007jz9',
231 'ext': 'mp4',
232 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
233 'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
234 'duration': 9840,
235 },
236 'params': {
237 # rtmp download
238 'skip_download': True,
239 }
31763975
S
240 }, {
241 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
242 'only_matching': True,
c7e67594
S
243 }, {
244 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
245 'only_matching': True,
0692ef86
S
246 }, {
247 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
248 'only_matching': True,
f20a11ed
S
249 }, {
250 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
251 'only_matching': True,
72d256c4
S
252 }, {
253 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
254 'only_matching': True,
53647dfd
S
255 }, {
256 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
257 'only_matching': True,
6f356cbb
S
258 }, {
259 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
260 'only_matching': True,
261 }, {
262 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
263 'only_matching': True,
72d256c4 264 }]
2e3fd9ec 265
97067db2
S
266 def _login(self):
267 username, password = self._get_login_info()
268 if username is None:
269 return
270
271 login_page = self._download_webpage(
272 self._LOGIN_URL, None, 'Downloading signin page')
273
274 login_form = self._hidden_inputs(login_page)
275
276 login_form.update({
277 'username': username,
278 'password': password,
279 })
280
281 post_url = urljoin(self._LOGIN_URL, self._search_regex(
282 r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
283 'post url', default=self._LOGIN_URL, group='url'))
284
285 response, urlh = self._download_webpage_handle(
286 post_url, None, 'Logging in', data=urlencode_postdata(login_form),
287 headers={'Referer': self._LOGIN_URL})
288
289 if self._LOGIN_URL in urlh.geturl():
290 error = clean_html(get_element_by_class('form-message', response))
291 if error:
292 raise ExtractorError(
293 'Unable to login: %s' % error, expected=True)
294 raise ExtractorError('Unable to log in')
295
def _real_initialize(self):
    # Attempt login up front; _login() returns early when no username is set.
    self._login()
298
d12a1a47
S
class MediaSelectionError(Exception):
    # Raised by _extract_medias() when a media selection carries a 'result' value.
    def __init__(self, id):
        # The 'result' value; _download_media_selector() compares it against
        # the codes it treats as retryable with the next media set.
        self.id = id
302
2e3fd9ec
S
303 def _extract_asx_playlist(self, connection, programme_id):
304 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
305 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
306
2e3fd9ec 307 def _extract_items(self, playlist):
e6174ee9
S
308 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
309
2e3fd9ec 310 def _extract_medias(self, media_selection):
29f7c58a 311 error = media_selection.get('result')
312 if error:
313 raise BBCCoUkIE.MediaSelectionError(error)
314 return media_selection.get('media') or []
2e3fd9ec
S
315
316 def _extract_connections(self, media):
29f7c58a 317 return media.get('connection') or []
2e3fd9ec 318
f13b1e7d 319 def _get_subtitles(self, media, programme_id):
2e3fd9ec
S
320 subtitles = {}
321 for connection in self._extract_connections(media):
f0228f56
S
322 cc_url = url_or_none(connection.get('href'))
323 if not cc_url:
324 continue
325 captions = self._download_xml(
326 cc_url, programme_id, 'Downloading captions', fatal=False)
ee0ba927 327 if not isinstance(captions, compat_etree_Element):
f0228f56 328 continue
29f7c58a 329 subtitles['en'] = [
f13b1e7d
JMF
330 {
331 'url': connection.get('href'),
332 'ext': 'ttml',
333 },
f13b1e7d 334 ]
29f7c58a 335 break
2e3fd9ec 336 return subtitles
082c6c86 337
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Re-raise a media selection error as an expected ExtractorError."""
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
342
c056efa2 343 def _download_media_selector(self, programme_id):
d12a1a47 344 last_exception = None
29f7c58a 345 for media_set in self._MEDIA_SETS:
d12a1a47
S
346 try:
347 return self._download_media_selector_url(
29f7c58a 348 self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
d12a1a47 349 except BBCCoUkIE.MediaSelectionError as e:
d781e293 350 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
d12a1a47
S
351 last_exception = e
352 continue
353 self._raise_extractor_error(e)
354 self._raise_extractor_error(last_exception)
9afa1770
S
355
356 def _download_media_selector_url(self, url, programme_id=None):
29f7c58a 357 media_selection = self._download_json(
358 url, programme_id, 'Downloading media selection JSON',
9283d4ea 359 expected_status=(403, 404))
9afa1770 360 return self._process_media_selector(media_selection, programme_id)
082c6c86 361
9afa1770 362 def _process_media_selector(self, media_selection, programme_id):
082c6c86 363 formats = []
2e3fd9ec 364 subtitles = None
b0af1215 365 urls = []
2e3fd9ec 366
c056efa2
S
367 for media in self._extract_medias(media_selection):
368 kind = media.get('kind')
a7e5f274
RA
369 if kind in ('video', 'audio'):
370 bitrate = int_or_none(media.get('bitrate'))
371 encoding = media.get('encoding')
a7e5f274
RA
372 width = int_or_none(media.get('width'))
373 height = int_or_none(media.get('height'))
374 file_size = int_or_none(media.get('media_file_size'))
375 for connection in self._extract_connections(media):
b0af1215
RA
376 href = connection.get('href')
377 if href in urls:
378 continue
379 if href:
380 urls.append(href)
a7e5f274
RA
381 conn_kind = connection.get('kind')
382 protocol = connection.get('protocol')
383 supplier = connection.get('supplier')
a7e5f274
RA
384 transfer_format = connection.get('transferFormat')
385 format_id = supplier or conn_kind or protocol
a7e5f274
RA
386 # ASX playlist
387 if supplier == 'asx':
388 for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
389 formats.append({
390 'url': ref,
391 'format_id': 'ref%s_%s' % (i, format_id),
392 })
393 elif transfer_format == 'dash':
394 formats.extend(self._extract_mpd_formats(
395 href, programme_id, mpd_id=format_id, fatal=False))
396 elif transfer_format == 'hls':
397 formats.extend(self._extract_m3u8_formats(
398 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
399 m3u8_id=format_id, fatal=False))
400 elif transfer_format == 'hds':
401 formats.extend(self._extract_f4m_formats(
402 href, programme_id, f4m_id=format_id, fatal=False))
403 else:
29f7c58a 404 if not supplier and bitrate:
aaa42cf0 405 format_id += '-%d' % bitrate
a7e5f274
RA
406 fmt = {
407 'format_id': format_id,
408 'filesize': file_size,
409 }
410 if kind == 'video':
411 fmt.update({
412 'width': width,
413 'height': height,
6240925b 414 'tbr': bitrate,
a7e5f274
RA
415 'vcodec': encoding,
416 })
417 else:
418 fmt.update({
419 'abr': bitrate,
420 'acodec': encoding,
421 'vcodec': 'none',
422 })
1af959ef 423 if protocol in ('http', 'https'):
a7e5f274
RA
424 # Direct link
425 fmt.update({
426 'url': href,
427 })
428 elif protocol == 'rtmp':
429 application = connection.get('application', 'ondemand')
430 auth_string = connection.get('authString')
431 identifier = connection.get('identifier')
432 server = connection.get('server')
433 fmt.update({
434 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
435 'play_path': identifier,
436 'app': '%s?%s' % (application, auth_string),
437 'page_url': 'http://www.bbc.co.uk',
438 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
439 'rtmp_live': False,
440 'ext': 'flv',
441 })
964744af
S
442 else:
443 continue
a7e5f274 444 formats.append(fmt)
c056efa2 445 elif kind == 'captions':
f13b1e7d 446 subtitles = self.extract_subtitles(media, programme_id)
c056efa2 447 return formats, subtitles
2e3fd9ec 448
ae6986fb
S
449 def _download_playlist(self, playlist_id):
450 try:
451 playlist = self._download_json(
452 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
453 playlist_id, 'Downloading playlist JSON')
454
455 version = playlist.get('defaultAvailableVersion')
456 if version:
457 smp_config = version['smpConfig']
458 title = smp_config['title']
459 description = smp_config['summary']
460 for item in smp_config['items']:
461 kind = item['kind']
40fcba5e 462 if kind not in ('programme', 'radioProgramme'):
ae6986fb
S
463 continue
464 programme_id = item.get('vpid')
d97f5cd7 465 duration = int_or_none(item.get('duration'))
ae6986fb
S
466 formats, subtitles = self._download_media_selector(programme_id)
467 return programme_id, title, description, duration, formats, subtitles
468 except ExtractorError as ee:
f813928e 469 if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
ae6986fb
S
470 raise
471
472 # fallback to legacy playlist
9afa1770
S
473 return self._process_legacy_playlist(playlist_id)
474
475 def _process_legacy_playlist_url(self, url, display_id):
476 playlist = self._download_legacy_playlist_url(url, display_id)
477 return self._extract_from_legacy_playlist(playlist, display_id)
478
479 def _process_legacy_playlist(self, playlist_id):
480 return self._process_legacy_playlist_url(
481 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
482
483 def _download_legacy_playlist_url(self, url, playlist_id=None):
484 return self._download_xml(
485 url, playlist_id, 'Downloading legacy playlist XML')
ae6986fb 486
9afa1770 487 def _extract_from_legacy_playlist(self, playlist, playlist_id):
e6174ee9 488 no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
ae6986fb
S
489 if no_items is not None:
490 reason = no_items.get('reason')
491 if reason == 'preAvailability':
492 msg = 'Episode %s is not yet available' % playlist_id
493 elif reason == 'postAvailability':
494 msg = 'Episode %s is no longer available' % playlist_id
495 elif reason == 'noMedia':
496 msg = 'Episode %s is not currently available' % playlist_id
497 else:
498 msg = 'Episode %s is not available: %s' % (playlist_id, reason)
499 raise ExtractorError(msg, expected=True)
500
501 for item in self._extract_items(playlist):
502 kind = item.get('kind')
40fcba5e 503 if kind not in ('programme', 'radioProgramme'):
ae6986fb 504 continue
e6174ee9
S
505 title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
506 description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
8daeeedc 507 description = description_el.text if description_el is not None else None
9afa1770
S
508
509 def get_programme_id(item):
510 def get_from_attributes(item):
32759325 511 for p in ('identifier', 'group'):
9afa1770
S
512 value = item.get(p)
513 if value and re.match(r'^[pb][\da-z]{7}$', value):
514 return value
515 get_from_attributes(item)
e6174ee9 516 mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
9afa1770
S
517 if mediator is not None:
518 return get_from_attributes(mediator)
519
520 programme_id = get_programme_id(item)
d97f5cd7 521 duration = int_or_none(item.get('duration'))
e6174ee9
S
522
523 if programme_id:
524 formats, subtitles = self._download_media_selector(programme_id)
525 else:
526 formats, subtitles = self._process_media_selector(item, playlist_id)
527 programme_id = playlist_id
ae6986fb
S
528
529 return programme_id, title, description, duration, formats, subtitles
530
c056efa2
S
531 def _real_extract(self, url):
532 group_id = self._match_id(url)
533
534 webpage = self._download_webpage(url, group_id, 'Downloading video page')
535
b2ed954f 536 error = self._search_regex(
29f7c58a 537 r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
b2ed954f
S
538 webpage, 'error', default=None)
539 if error:
540 raise ExtractorError(error, expected=True)
541
8683b4d8 542 programme_id = None
679bacf0 543 duration = None
8683b4d8
S
544
545 tviplayer = self._search_regex(
546 r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
547 webpage, 'player', default=None)
548
549 if tviplayer:
550 player = self._parse_json(tviplayer, group_id).get('player', {})
551 duration = int_or_none(player.get('duration'))
552 programme_id = player.get('vpid')
553
554 if not programme_id:
555 programme_id = self._search_regex(
22d7368d 556 r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
8683b4d8 557
c056efa2 558 if programme_id:
c056efa2 559 formats, subtitles = self._download_media_selector(programme_id)
88fb59d9 560 title = self._og_search_title(webpage, default=None) or self._html_search_regex(
50e989e2
S
561 (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
562 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
8683b4d8 563 description = self._search_regex(
a8534274
S
564 (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
565 r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
679bacf0
YCH
566 webpage, 'description', default=None)
567 if not description:
568 description = self._html_search_meta('description', webpage)
c056efa2 569 else:
ae6986fb 570 programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
2e3fd9ec 571
082c6c86
S
572 self._sort_formats(formats)
573
574 return {
2e3fd9ec 575 'id': programme_id,
082c6c86
S
576 'title': title,
577 'description': description,
650cfd0c 578 'thumbnail': self._og_search_thumbnail(webpage, default=None),
082c6c86
S
579 'duration': duration,
580 'formats': formats,
2e3fd9ec 581 'subtitles': subtitles,
5f6a1245 582 }
10273d6e 583
584
9afa1770
S
class BBCIE(BBCCoUkIE):
    # Extractor for other pages on bbc.com / bbc.co.uk; the id is the last
    # path segment of the URL (see _VALID_URL).
    IE_NAME = 'bbc'
    IE_DESC = 'BBC'
    _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'

    # Overrides the media sets tried by the inherited _download_media_selector().
    _MEDIA_SETS = [
        'pc',
        'mobile-tablet-main',
    ]
10273d6e 594
595 _TESTS = [{
6a747190 596 # article with multiple videos embedded with data-playable containing vpids
10273d6e 597 'url': 'http://www.bbc.com/news/world-europe-32668511',
598 'info_dict': {
599 'id': 'world-europe-32668511',
acc86c9a 600 'title': 'Russia stages massive WW2 parade',
9afa1770 601 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 602 },
603 'playlist_count': 2,
a3bfddfa 604 }, {
6a747190 605 # article with multiple videos embedded with data-playable (more videos)
10273d6e 606 'url': 'http://www.bbc.com/news/business-28299555',
607 'info_dict': {
608 'id': 'business-28299555',
609 'title': 'Farnborough Airshow: Video highlights',
9afa1770 610 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 611 },
612 'playlist_count': 9,
9afa1770 613 'skip': 'Save time',
88ed52ae
S
614 }, {
615 # article with multiple videos embedded with `new SMP()`
6a747190 616 # broken
88ed52ae
S
617 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
618 'info_dict': {
619 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 620 'title': 'BUGGER',
88ed52ae
S
621 },
622 'playlist_count': 18,
a3bfddfa 623 }, {
6a747190 624 # single video embedded with data-playable containing vpid
10273d6e 625 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 626 'info_dict': {
627 'id': 'p02mprgb',
55ebae26 628 'ext': 'mp4',
10273d6e 629 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 630 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 631 'duration': 47,
9afa1770 632 'timestamp': 1427219242,
da92eeae 633 'upload_date': '20150324',
10273d6e 634 },
635 'params': {
9afa1770 636 # rtmp download
10273d6e 637 'skip_download': True,
638 }
a3bfddfa 639 }, {
6a747190
S
640 # article with single video embedded with data-playable containing XML playlist
641 # with direct video links as progressiveDownloadUrl (for now these are extracted)
642 # and playlist with f4m and m3u8 as streamingUrl
de939d89 643 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 644 'info_dict': {
9afa1770 645 'id': '150615_telabyad_kentin_cogu',
de939d89 646 'ext': 'mp4',
ad152e2d 647 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 648 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 649 'timestamp': 1434397334,
da92eeae 650 'upload_date': '20150615',
de939d89 651 },
652 'params': {
653 'skip_download': True,
654 }
c936d8cc 655 }, {
6a747190 656 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 657 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 658 'info_dict': {
9afa1770 659 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 660 'ext': 'mp4',
9afa1770 661 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 662 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 663 'timestamp': 1434713142,
da92eeae 664 'upload_date': '20150619',
de939d89 665 },
666 'params': {
667 'skip_download': True,
668 }
a346b1ff
S
669 }, {
670 # single video from video playlist embedded with vxp-playlist-data JSON
671 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
672 'info_dict': {
673 'id': 'p02w6qjc',
55ebae26 674 'ext': 'mp4',
a346b1ff
S
675 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
676 'duration': 56,
0bc4ee60 677 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
678 },
679 'params': {
680 'skip_download': True,
681 }
9afa1770
S
682 }, {
683 # single video story with digitalData
684 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
685 'info_dict': {
686 'id': 'p02q6gc4',
687 'ext': 'flv',
688 'title': 'Sri Lanka’s spicy secret',
689 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
690 'timestamp': 1437674293,
691 'upload_date': '20150723',
692 },
693 'params': {
694 # rtmp download
695 'skip_download': True,
696 }
697 }, {
698 # single video story without digitalData
699 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
700 'info_dict': {
701 'id': 'p018zqqg',
55ebae26 702 'ext': 'mp4',
9afa1770
S
703 'title': 'Hyundai Santa Fe Sport: Rock star',
704 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
705 'timestamp': 1415867444,
706 'upload_date': '20141113',
9afa1770
S
707 },
708 'params': {
709 # rtmp download
710 'skip_download': True,
711 }
9fb64c04
S
712 }, {
713 # single video embedded with Morph
714 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
715 'info_dict': {
716 'id': 'p041vhd0',
717 'ext': 'mp4',
718 'title': "Nigeria v Japan - Men's First Round",
719 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
720 'duration': 7980,
721 'uploader': 'BBC Sport',
722 'uploader_id': 'bbc_sport',
723 },
724 'params': {
725 # m3u8 download
726 'skip_download': True,
9fb64c04
S
727 },
728 'skip': 'Georestricted to UK',
9afa1770 729 }, {
6a747190 730 # single video with playlist.sxml URL in playlist param
9afa1770
S
731 'url': 'http://www.bbc.com/sport/0/football/33653409',
732 'info_dict': {
733 'id': 'p02xycnp',
55ebae26 734 'ext': 'mp4',
9afa1770 735 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 736 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
737 'duration': 140,
738 },
739 'params': {
740 # rtmp download
741 'skip_download': True,
742 }
b5d48cb1 743 }, {
6a747190 744 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
745 'url': 'http://www.bbc.com/sport/0/football/34475836',
746 'info_dict': {
747 'id': '34475836',
450b233c 748 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 749 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
750 },
751 'playlist_count': 3,
450b233c
S
752 }, {
753 # school report article with single video
754 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
755 'info_dict': {
756 'id': '35744779',
757 'title': 'School which breaks down barriers in Jerusalem',
758 },
759 'playlist_count': 1,
9afa1770
S
760 }, {
761 # single video with playlist URL from weather section
762 'url': 'http://www.bbc.com/weather/features/33601775',
763 'only_matching': True,
764 }, {
765 # custom redirection to www.bbc.com
1bdae7d3 766 # also, video with window.__INITIAL_DATA__
9afa1770 767 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
1bdae7d3 768 'info_dict': {
769 'id': 'p02xzws1',
770 'ext': 'mp4',
771 'title': "Pluto may have 'nitrogen glaciers'",
772 'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
773 'thumbnail': r're:https?://.+/.+\.jpg',
774 'timestamp': 1437785037,
775 'upload_date': '20150725',
776 },
a1cf3e38
S
777 }, {
778 # single video article embedded with data-media-vpid
779 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
780 'only_matching': True,
6d155707
S
781 }, {
782 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
783 'info_dict': {
784 'id': 'p06556y7',
785 'ext': 'mp4',
786 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
787 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd',
788 },
789 'params': {
790 'skip_download': True,
791 }
b96b4be4
RA
792 }, {
793 # window.__PRELOADED_STATE__
794 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
795 'info_dict': {
796 'id': 'b0b9z4vz',
797 'ext': 'mp4',
798 'title': 'Prom 6: An American in Paris and Turangalila',
799 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
800 'uploader': 'Radio 3',
801 'uploader_id': 'bbc_radio_three',
802 },
373941c5
S
803 }, {
804 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
805 'info_dict': {
806 'id': 'p06w9tws',
807 'ext': 'mp4',
808 'title': 'md5:2fabf12a726603193a2879a055f72514',
809 'description': 'Learn English words and phrases from this story',
810 },
811 'add_ie': [BBCCoUkIE.ie_key()],
3721515b 812 }, {
813 # BBC Reel
814 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
815 'info_dict': {
816 'id': 'p07c6sb9',
817 'ext': 'mp4',
818 'title': 'How positive thinking is harming your happiness',
819 'alt_title': 'The downsides of positive thinking',
820 'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
821 'duration': 235,
822 'thumbnail': r're:https?://.+/p07c9dsr.jpg',
823 'upload_date': '20190604',
824 'categories': ['Psychology'],
825 },
10273d6e 826 }]
827
9afa1770
S
@classmethod
def suitable(cls, url):
    """Reject URLs that one of the more specific BBC extractors accepts."""
    more_specific = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
    if any(ie.suitable(url) for ie in more_specific):
        return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
833
def _extract_from_media_meta(self, media_meta, video_id):
    """Build a ``(formats, subtitles)`` pair from one media metadata dict.

    Three sources are tried in order: direct links under ``sourceFiles``,
    a programme id under ``externalId`` (resolved via the media selector)
    and a legacy ``playlist.sxml`` URL under ``href``.  Two empty lists
    are returned when none of them is present.
    """
    # Direct links to media in media metadata (e.g.
    # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
    # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
    source_files = media_meta.get('sourceFiles')
    if source_files:
        direct_formats = []
        for format_id, source in source_files.items():
            # Entries without a URL cannot be downloaded
            if not source.get('url'):
                continue
            direct_formats.append({
                'url': source['url'],
                'format_id': format_id,
                'ext': source.get('encoding'),
                'tbr': float_or_none(source.get('bitrate'), 1000),
                'filesize': int_or_none(source.get('filesize')),
            })
        return direct_formats, []

    programme_id = media_meta.get('externalId')
    if programme_id:
        return self._download_media_selector(programme_id)

    # Process playlist.sxml as legacy playlist
    href = media_meta.get('href')
    if not href:
        return [], []

    legacy_playlist = self._download_legacy_playlist_url(href)
    _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(
        legacy_playlist, video_id)
    return formats, subtitles
860
baf39a1a
S
def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
    """Turn a legacy ``playlist.sxml`` URL into a single info dict.

    The playlist is processed by ``_process_legacy_playlist_url``; its
    formats are sorted and the caller-supplied *timestamp* is attached to
    the result unchanged.
    """
    legacy_info = self._process_legacy_playlist_url(url, playlist_id)
    programme_id, title, description, duration, formats, subtitles = legacy_info
    self._sort_formats(formats)

    info = {'id': programme_id}
    info['title'] = title
    info['description'] = description
    info['duration'] = duration
    info['timestamp'] = timestamp
    info['formats'] = formats
    info['subtitles'] = subtitles
    return info
874
def _real_extract(self, url):
    """Extract the video(s) embedded in a generic BBC page.

    The page is downloaded once and a chain of embed styles is probed in a
    fixed order; the first style that yields media wins.  Depending on the
    style either a single info dict or a playlist result is returned.

    Page-level title, description and timestamp are taken from JSON-LD
    first and from OpenGraph / HTML markup otherwise; they serve as
    fallbacks for entries that carry no metadata of their own.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title')
    if not playlist_title:
        playlist_title = self._og_search_title(
            webpage, default=None) or self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
        if playlist_title:
            # Drop the trailing " - BBC ..." site suffix
            playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
    if data_playables:
        for _, data_playable_json in data_playables:
            data_playable = self._parse_json(
                unescapeHTML(data_playable_json), playlist_id, fatal=False)
            if not data_playable:
                continue
            settings = data_playable.get('settings', {})
            if settings:
                # data-playable with video vpid in settings.playlistObject.items (e.g.
                # http://www.bbc.com/news/world-us-canada-34473351)
                playlist_object = settings.get('playlistObject', {})
                if playlist_object:
                    items = playlist_object.get('items')
                    if items and isinstance(items, list):
                        programme_id = items[0].get('vpid')
                        if not programme_id:
                            # Without a vpid there is nothing to ask the
                            # media selector for; move on to the next embed
                            continue
                        # Fall back to the page title instead of failing
                        # with KeyError when the embed has no title
                        title = playlist_object.get('title') or playlist_title
                        description = playlist_object.get('summary')
                        duration = int_or_none(items[0].get('duration'))
                        formats, subtitles = self._download_media_selector(programme_id)
                        self._sort_formats(formats)
                        entries.append({
                            'id': programme_id,
                            'title': title,
                            'description': description,
                            'timestamp': timestamp,
                            'duration': duration,
                            'formats': formats,
                            'subtitles': subtitles,
                        })
            else:
                # data-playable without vpid but with a playlist.sxml URLs
                # in otherSettings.playlist (e.g.
                # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
                playlist = data_playable.get('otherSettings', {}).get('playlist', {})
                if playlist:
                    entry = None
                    for key in ('streaming', 'progressiveDownload'):
                        playlist_url = playlist.get('%sUrl' % key)
                        if not playlist_url:
                            continue
                        try:
                            info = self._extract_from_playlist_sxml(
                                playlist_url, playlist_id, timestamp)
                            if not entry:
                                entry = info
                            else:
                                # Merge the second playlist into the first entry
                                entry['title'] = info['title']
                                entry['formats'].extend(info['formats'])
                        except ExtractorError as e:
                            # Some playlist URL may fail with 500, at the same time
                            # the other one may work fine (e.g.
                            # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
                                continue
                            raise
                    if entry:
                        self._sort_formats(entry['formats'])
                        entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
    group_id = self._search_regex(
        r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
        webpage, 'group id', default=None)
    if group_id:
        return self.url_result(
            'https://www.bbc.co.uk/programmes/%s' % group_id,
            ie=BBCCoUkIE.ie_key())

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        self._sort_formats(formats)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # and a non-fatal parse may yield None, hence the `or {}`
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = digital_data.get('page', {}).get('pageInfo', {})
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
    initial_data = self._parse_json(self._html_search_regex(
        r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
        webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
    if initial_data:
        init_data = try_get(
            initial_data, lambda x: x['initData']['items'][0], dict) or {}
        smp_data = init_data.get('smpData') or {}
        clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
        version_id = clip_data.get('versionID')
        if version_id:
            title = smp_data['title']
            formats, subtitles = self._download_media_selector(version_id)
            self._sort_formats(formats)
            image_url = smp_data.get('holdingImageURL')
            display_date = init_data.get('displayDate')
            topic_title = init_data.get('topicTitle')

            return {
                'id': version_id,
                'title': title,
                'formats': formats,
                'alt_title': init_data.get('shortTitle'),
                'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
                'description': smp_data.get('summary') or init_data.get('shortSummary'),
                'upload_date': display_date.replace('-', '') if display_date else None,
                'subtitles': subtitles,
                'duration': int_or_none(clip_data.get('duration')),
                'categories': [topic_title] if topic_title else None,
            }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # There are several setPayload calls may be present but the video
    # seems to be always related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    # Pages exposing window.__PRELOADED_STATE__
    preload_state = self._parse_json(self._search_regex(
        r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
        'preload state', default='{}'), playlist_id, fatal=False)
    if preload_state:
        current_programme = preload_state.get('programmes', {}).get('current') or {}
        programme_id = current_programme.get('id')
        if current_programme and programme_id and current_programme.get('type') == 'playable_item':
            title = current_programme.get('titles', {}).get('tertiary') or playlist_title
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            synopses = current_programme.get('synopses') or {}
            network = current_programme.get('network') or {}
            duration = int_or_none(
                current_programme.get('duration', {}).get('value'))
            thumbnail = None
            image_url = current_programme.get('image_url')
            if image_url:
                thumbnail = image_url.replace('{recipe}', 'raw')
            return {
                'id': programme_id,
                'title': title,
                'description': dict_get(synopses, ('long', 'medium', 'short')),
                'thumbnail': thumbnail,
                'duration': duration,
                'uploader': network.get('short_title'),
                'uploader_id': network.get('id'),
                'formats': formats,
                'subtitles': subtitles,
            }

    # Pages exposing a bbcthreeConfig JavaScript object
    bbc3_config = self._parse_json(
        self._search_regex(
            r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
            'bbcthree config', default='{}'),
        playlist_id, transform_source=js_to_json, fatal=False) or {}
    payload = bbc3_config.get('payload') or {}
    if payload:
        clip = payload.get('currentClip') or {}
        clip_vpid = clip.get('vpid')
        clip_title = clip.get('title')
        if clip_vpid and clip_title:
            formats, subtitles = self._download_media_selector(clip_vpid)
            self._sort_formats(formats)
            return {
                'id': clip_vpid,
                'title': clip_title,
                'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
                'description': clip.get('description'),
                'duration': parse_duration(clip.get('duration')),
                'formats': formats,
                'subtitles': subtitles,
            }
        bbc3_playlist = try_get(
            payload, lambda x: x['content']['bbcMedia']['playlist'],
            dict)
        if bbc3_playlist:
            playlist_title = bbc3_playlist.get('title') or playlist_title
            thumbnail = bbc3_playlist.get('holdingImageURL')
            entries = []
            # A playlist without items yields an empty result, not a KeyError
            for bbc3_item in bbc3_playlist.get('items') or []:
                programme_id = bbc3_item.get('versionID')
                if not programme_id:
                    continue
                formats, subtitles = self._download_media_selector(programme_id)
                self._sort_formats(formats)
                entries.append({
                    'id': programme_id,
                    'title': playlist_title,
                    'thumbnail': thumbnail,
                    'timestamp': timestamp,
                    'formats': formats,
                    'subtitles': subtitles,
                })
            return self.playlist_result(
                entries, playlist_id, playlist_title, playlist_description)

    # Pages exposing window.__INITIAL_DATA__
    initial_data = self._parse_json(self._search_regex(
        r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage,
        'preload state', default='{}'), playlist_id, fatal=False)
    if initial_data:
        def parse_media(media):
            # Appends one entry to the enclosing `entries` list for every
            # media item that has both an id and a title
            if not media:
                return
            for item in (try_get(media, lambda x: x['media']['items'], list) or []):
                item_id = item.get('id')
                item_title = item.get('title')
                if not (item_id and item_title):
                    continue
                formats, subtitles = self._download_media_selector(item_id)
                self._sort_formats(formats)
                item_desc = None
                blocks = try_get(media, lambda x: x['summary']['blocks'], list)
                if blocks:
                    summary = []
                    for block in blocks:
                        text = try_get(block, lambda x: x['model']['text'], compat_str)
                        if text:
                            summary.append(text)
                    if summary:
                        item_desc = '\n\n'.join(summary)
                item_time = None
                for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
                    if try_get(meta, lambda x: x['label']) == 'Published':
                        item_time = unified_timestamp(meta.get('timestamp'))
                        break
                entries.append({
                    'id': item_id,
                    'title': item_title,
                    'thumbnail': item.get('holdingImageUrl'),
                    'formats': formats,
                    'subtitles': subtitles,
                    'timestamp': item_time,
                    'description': strip_or_none(item_desc),
                })
        for resp in (initial_data.get('data') or {}).values():
            name = resp.get('name')
            if name == 'media-experience':
                parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
            elif name == 'article':
                for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []):
                    if block.get('type') != 'media':
                        continue
                    parse_media(block.get('model'))
        return self.playlist_result(
            entries, playlist_id, playlist_title, playlist_description)

    def extract_all(pattern):
        # Parse every regex match as JSON and drop the ones that fail
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # A non-fatal parse may yield None; treat that as "no videos"
            media_asset_page = self._parse_json(
                media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in media_asset_page.get('videos', {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats and not self.get_param('ignore_no_formats'):
            continue
        self._sort_formats(formats)

        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1310
1311
class BBCCoUkArticleIE(InfoExtractor):
    # Extracts the clips embedded in a bbc.co.uk/programmes/articles/ page
    # as a playlist of URL results.
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist with one URL result per clip found on the page."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # The OpenGraph description lookup is non-fatal and may return None,
        # so strip it with a None-tolerant helper instead of calling .strip()
        description = strip_or_none(self._og_search_description(webpage))

        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1340
1341
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    # Shared logic for paginated playlist pages.  This class reads
    # _VIDEO_ID_TEMPLATE and _URL_TEMPLATE and calls
    # _extract_title_and_description(); none of them is defined here, so
    # subclasses are expected to supply them.

    def _entries(self, webpage, url, playlist_id):
        """Yield a URL result for every video id found on the playlist pages.

        When *url* carries an explicit ``page`` query parameter only the
        already downloaded *webpage* is scanned; otherwise the "next" links
        are followed until no further page is advertised.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # The first page is already downloaded, so numbering of the pages
        # still to be fetched starts at 2
        for page_num in itertools.count(2):
            for video_id in re.findall(
                    self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # The fourth positional argument is the error note; pass a real
            # message rather than the bare page number
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                'Downloading page %d' % page_num,
                'Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Return the playlist for *url* with lazily generated entries."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
ded7511a
S
1372
1373
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
    # Shared logic for iPlayer listings backed by a paged JSON API.  This
    # class calls _call_api, _get_elements, _get_episode, _get_episode_image,
    # _get_episode_field, _get_playlist_data and _get_playlist_title and
    # reads _PAGE_SIZE and _DESCRIPTION_KEY; none of them is defined here,
    # so subclasses are expected to supply them.
    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    @staticmethod
    def _get_default(episode, key, default_key='default'):
        """Return ``episode[key][default_key]``, or None if the path is absent."""
        return try_get(episode, lambda x: x[key][default_key])

    def _get_description(self, data):
        """Return the longest available synopsis stored under _DESCRIPTION_KEY."""
        synopsis = data.get(self._DESCRIPTION_KEY) or {}
        return dict_get(synopsis, ('large', 'medium', 'small'))

    def _fetch_page(self, programme_id, per_page, series_id, page):
        """Yield URL-type entries for the zero-based *page* of the listing."""
        # The API is called with one-based page numbers
        elements = self._get_elements(self._call_api(
            programme_id, per_page, page + 1, series_id))
        for element in elements:
            episode = self._get_episode(element)
            episode_id = episode.get('id')
            if not episode_id:
                continue
            thumbnail = None
            image = self._get_episode_image(episode)
            if image:
                thumbnail = image.replace('{recipe}', 'raw')
            category = self._get_default(episode, 'labels', 'category')
            yield {
                '_type': 'url',
                'id': episode_id,
                'title': self._get_episode_field(episode, 'subtitle'),
                'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                'thumbnail': thumbnail,
                'description': self._get_description(episode),
                'categories': [category] if category else None,
                'series': self._get_episode_field(episode, 'title'),
                'ie_key': BBCCoUkIE.ie_key(),
            }

    def _real_extract(self, url):
        """Return the playlist for *url*.

        A valid positive ``page`` query parameter restricts the result to
        that single page (36 entries per page); otherwise all pages are
        exposed lazily through an OnDemandPagedList.
        """
        pid = self._match_id(url)
        qs = parse_qs(url)
        series_id = qs.get('seriesId', [None])[0]
        # A non-numeric or non-positive page value is ignored rather than
        # raising ValueError or requesting a negative page index
        page = int_or_none(qs.get('page', [None])[0])
        if page is not None and page < 1:
            page = None
        per_page = 36 if page else self._PAGE_SIZE
        fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id)
        entries = fetch_page(page - 1) if page else OnDemandPagedList(fetch_page, self._PAGE_SIZE)
        # A one-item request is enough to obtain the playlist level metadata
        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
        return self.playlist_result(
            entries, pid, self._get_playlist_title(playlist_data),
            self._get_description(playlist_data))
1422
1423
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
    # iPlayer "episodes" listings, served by the graph.ibl.api.bbc.co.uk API
    IE_NAME = 'bbc.co.uk:iplayer:episodes'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
        },
        'playlist_mincount': 8,
    }, {
        # all seasons
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 10,
    }, {
        # explicit season
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 5,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 37,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 1,
    }]
    _PAGE_SIZE = 100
    _DESCRIPTION_KEY = 'synopsis'

    def _get_episode_image(self, episode):
        """Return the ``default`` variant of the episode's ``image`` field."""
        return self._get_default(episode, 'image')

    def _get_episode_field(self, episode, field):
        """Return the ``default`` variant of an arbitrary episode field."""
        return self._get_default(episode, field)

    @staticmethod
    def _get_elements(data):
        """Return the result list of an API response."""
        entities = data['entities']
        return entities['results']

    @staticmethod
    def _get_episode(element):
        """Return the episode dict nested in a result element (or {})."""
        return element.get('episode') or {}

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """POST the query for programme *pid* and return its ``programme`` node."""
        variables = {
            'id': pid,
            'page': page,
            'perPage': per_page,
        }
        # Restrict the listing to a single series when one is requested
        if series_id:
            variables['sliceId'] = series_id
        request_body = json.dumps({
            'id': '5692d93d5aac8d796a0305e895e61551',
            'variables': variables,
        }).encode('utf-8')
        response = self._download_json(
            'https://graph.ibl.api.bbc.co.uk/', pid,
            headers={'Content-Type': 'application/json'},
            data=request_body)
        return response['data']['programme']

    @staticmethod
    def _get_playlist_data(data):
        """Return the playlist metadata: here the API response itself."""
        return data

    def _get_playlist_title(self, data):
        """Return the ``default`` variant of the playlist title."""
        return self._get_default(data, 'title')
1512
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
    # iPlayer "group" listings, served by the ibl.api.bbc.co.uk v1 API
    IE_NAME = 'bbc.co.uk:iplayer:group'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
    _TESTS = [{
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 47,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 11,
    }]
    _PAGE_SIZE = 200
    _DESCRIPTION_KEY = 'synopses'

    def _get_episode_image(self, episode):
        """Return the ``standard`` variant of the episode's ``images`` field."""
        return self._get_default(episode, 'images', 'standard')

    def _get_episode_field(self, episode, field):
        """Return an episode field as is; group episodes store plain values."""
        return episode.get(field)

    @staticmethod
    def _get_elements(data):
        """Return the element list of an API response."""
        return data['elements']

    @staticmethod
    def _get_episode(element):
        """Return the episode for an element: here the element itself."""
        return element

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Fetch one page of group *pid* and return its ``group_episodes`` node.

        *series_id* is accepted for signature compatibility and not used.
        """
        paging = {
            'page': page,
            'per_page': per_page,
        }
        response = self._download_json(
            'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid,
            pid, query=paging)
        return response['group_episodes']

    @staticmethod
    def _get_playlist_data(data):
        """Return the ``group`` node holding the playlist metadata."""
        return data['group']

    def _get_playlist_title(self, data):
        """Return the plain playlist title."""
        return data.get('title')
ded7511a
S
1575
1576
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    # Episodes / broadcasts / clips listings under bbc.co.uk/programmes/
    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Template for the programme page URL of a listed video id
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    # Regex template used to find video ids in the listing markup
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return the OpenGraph ``(title, description)`` pair of *webpage*."""
        og_title = self._og_search_title(webpage, fatal=False)
        og_description = self._og_search_description(webpage)
        return og_title, og_description