]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/bbc.py
[VideocampusSachsen] Add extractors (#2787)
[yt-dlp.git] / yt_dlp / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
1418a043 4import functools
254e64a2 5import itertools
1418a043 6import json
f0228f56 7import re
082c6c86 8
f13b1e7d 9from .common import InfoExtractor
3721515b 10from ..compat import (
11 compat_etree_Element,
12 compat_HTTPError,
1bdae7d3 13 compat_str,
3721515b 14 compat_urlparse,
15)
8683b4d8 16from ..utils import (
3721515b 17 ExtractorError,
1418a043 18 OnDemandPagedList,
97067db2 19 clean_html,
9fb64c04 20 dict_get,
9afa1770 21 float_or_none,
97067db2 22 get_element_by_class,
8683b4d8 23 int_or_none,
6d155707 24 js_to_json,
9afa1770
S
25 parse_duration,
26 parse_iso8601,
4dfbf869 27 parse_qs,
1bdae7d3 28 strip_or_none,
9fb64c04 29 try_get,
dab062fb 30 unescapeHTML,
1bdae7d3 31 unified_timestamp,
f0228f56 32 url_or_none,
97067db2
S
33 urlencode_postdata,
34 urljoin,
8683b4d8 35)
082c6c86 36
d12a1a47 37
f13b1e7d 38class BBCCoUkIE(InfoExtractor):
IE_NAME = 'bbc.co.uk'
IE_DESC = 'BBC iPlayer'
# Identifier pattern: one of p/b/m followed by 7 characters from [0-9a-z],
# or w followed by 7 to 14 such characters.
_ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})'
# bbc.co.uk URLs handled by this extractor. The identifier must not be
# followed by /episodes, /broadcasts or /clips, and programmes/articles/
# URLs are excluded.
_VALID_URL = r'''(?x)
                https?://
                    (?:www\.)?bbc\.co\.uk/
                    (?:
                        programmes/(?!articles/)|
                        iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
                        music/(?:clips|audiovideo/popular)[/#]|
                        radio/player/|
                        sounds/play/|
                        events/[^/]+/play/[^/]+/
                    )
                    (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
                ''' % _ID_REGEX

# Sign-in page; also used as the default form target and as the Referer
# header when posting credentials.
_LOGIN_URL = 'https://account.bbc.com/signin'
_NETRC_MACHINE = 'bbc'

# Filled with (media set, vpid) when querying the media selector.
_MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
# Media sets are tried in the order listed.
_MEDIA_SETS = [
    # Provides HQ HLS streams with even better quality than the pc mediaset but
    # fails with geolocation in some cases even when the programme is not geo
    # restricted at all (e.g. http://www.bbc.co.uk/programmes/b06bp7lf).
    # May also fail with selectionunavailable.
    'iptv-all',
    'pc',
]

# XML namespace of the elements looked up in legacy (EMP) playlists.
_EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
69
2e3fd9ec
S
70 _TESTS = [
71 {
f2d0fc68 72 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 73 'info_dict': {
f2d0fc68 74 'id': 'b039d07m',
b1ea6802 75 'ext': 'flv',
acc86c9a 76 'title': 'Kaleidoscope, Leonard Cohen',
c4914185 77 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
78 },
79 'params': {
b1ea6802 80 # rtmp download
2e3fd9ec
S
81 'skip_download': True,
82 }
082c6c86 83 },
2e3fd9ec
S
84 {
85 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
86 'info_dict': {
87 'id': 'b00yng1d',
88 'ext': 'flv',
89 'title': 'The Man in Black: Series 3: The Printed Name',
90 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
91 'duration': 1800,
92 },
93 'params': {
94 # rtmp download
95 'skip_download': True,
c7f0177f
S
96 },
97 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
98 },
99 {
100 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
101 'info_dict': {
102 'id': 'b00yng1d',
103 'ext': 'flv',
17968e44 104 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 105 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 106 'duration': 5100,
2e3fd9ec
S
107 },
108 'params': {
109 # rtmp download
110 'skip_download': True,
111 },
b1ea6802 112 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
113 },
114 {
115 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
116 'info_dict': {
117 'id': 'b03k3pb7',
118 'ext': 'flv',
119 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
120 'description': '2. Invasion',
121 'duration': 3600,
122 },
123 'params': {
124 # rtmp download
125 'skip_download': True,
126 },
b1ea6802 127 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
128 }, {
129 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
130 'info_dict': {
131 'id': 'b04v209v',
132 'ext': 'flv',
133 'title': 'Pete Tong, The Essential New Tune Special',
134 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
135 'duration': 10800,
136 },
137 'params': {
138 # rtmp download
139 'skip_download': True,
a3ef0e1c
YCH
140 },
141 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 142 }, {
5aa535c3 143 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
144 'note': 'Audio',
145 'info_dict': {
5aa535c3 146 'id': 'p022h44j',
b1ea6802 147 'ext': 'flv',
5aa535c3
S
148 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
149 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
150 'duration': 227,
c7e67594
S
151 },
152 'params': {
b1ea6802 153 # rtmp download
c7e67594
S
154 'skip_download': True,
155 }
156 }, {
157 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
158 'note': 'Video',
159 'info_dict': {
160 'id': 'p025c103',
b1ea6802 161 'ext': 'flv',
c7e67594
S
162 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
163 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
164 'duration': 226,
165 },
166 'params': {
b1ea6802 167 # rtmp download
c7e67594
S
168 'skip_download': True,
169 }
e68ae99a
S
170 }, {
171 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
172 'info_dict': {
173 'id': 'p02n76xf',
174 'ext': 'flv',
175 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
176 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
177 'duration': 3540,
178 },
179 'params': {
180 # rtmp download
181 'skip_download': True,
182 },
b1ea6802 183 'skip': 'geolocation',
25fa8d66
YCH
184 }, {
185 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
186 'info_dict': {
187 'id': 'b05zmgw1',
188 'ext': 'flv',
189 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
190 'title': 'Royal Academy Summer Exhibition',
191 'duration': 3540,
192 },
193 'params': {
194 # rtmp download
195 'skip_download': True,
196 },
b1ea6802 197 'skip': 'geolocation',
54914380
S
198 }, {
199 # iptv-all mediaset fails with geolocation however there is no geo restriction
200 # for this programme at all
5aa535c3 201 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 202 'info_dict': {
5aa535c3 203 'id': 'b06rkms3',
54914380 204 'ext': 'flv',
5aa535c3
S
205 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
206 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
207 },
208 'params': {
209 # rtmp download
210 'skip_download': True,
211 },
b1ea6802 212 'skip': 'Now it\'s really geo-restricted',
1ac6e794 213 }, {
067aa17e 214 # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
1ac6e794
S
215 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
216 'info_dict': {
217 'id': 'p028bfkj',
b1ea6802 218 'ext': 'flv',
1ac6e794
S
219 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
220 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
221 },
222 'params': {
b1ea6802 223 # rtmp download
1ac6e794
S
224 'skip_download': True,
225 },
b72305f0
J
226 }, {
227 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
228 'note': 'Audio',
229 'info_dict': {
230 'id': 'm0007jz9',
231 'ext': 'mp4',
232 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
233 'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
234 'duration': 9840,
235 },
236 'params': {
237 # rtmp download
238 'skip_download': True,
239 }
31763975
S
240 }, {
241 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
242 'only_matching': True,
c7e67594
S
243 }, {
244 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
245 'only_matching': True,
0692ef86
S
246 }, {
247 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
248 'only_matching': True,
f20a11ed
S
249 }, {
250 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
251 'only_matching': True,
72d256c4
S
252 }, {
253 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
254 'only_matching': True,
53647dfd
S
255 }, {
256 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
257 'only_matching': True,
6f356cbb
S
258 }, {
259 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
260 'only_matching': True,
261 }, {
262 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
263 'only_matching': True,
72d256c4 264 }]
2e3fd9ec 265
97067db2
S
def _login(self):
    """Sign in with the configured credentials, if any.

    Does nothing when no username is available. Otherwise the hidden
    inputs of the sign-in page are posted back together with the
    credentials to the form's action URL (the sign-in URL itself when no
    action is found).

    Raises ExtractorError when the final URL after posting still contains
    the sign-in URL; the text of the page's ``form-message`` element is
    included when present.
    """
    username, password = self._get_login_info()
    if username is None:
        return

    signin_page = self._download_webpage(
        self._LOGIN_URL, None, 'Downloading signin page')

    form_data = self._hidden_inputs(signin_page)
    form_data['username'] = username
    form_data['password'] = password

    form_action = self._search_regex(
        r'<form[^>]+action=(["\'])(?P<url>.+?)\1', signin_page,
        'post url', default=self._LOGIN_URL, group='url')
    post_url = urljoin(self._LOGIN_URL, form_action)

    response, urlh = self._download_webpage_handle(
        post_url, None, 'Logging in', data=urlencode_postdata(form_data),
        headers={'Referer': self._LOGIN_URL})

    # Ending up anywhere other than the sign-in page counts as success
    if self._LOGIN_URL not in urlh.geturl():
        return

    error = clean_html(get_element_by_class('form-message', response))
    if not error:
        raise ExtractorError('Unable to log in')
    raise ExtractorError(
        'Unable to login: %s' % error, expected=True)
295
def _real_initialize(self):
    """Run the login step; `_login` returns early when no username is configured."""
    self._login()
298
d12a1a47
S
class MediaSelectionError(Exception):
    """Signals that a media selection response carried an error result.

    ``id`` holds the value of the response's ``result`` field; callers
    compare it against codes such as ``'geolocation'`` to decide whether
    another media set should be tried.
    """

    def __init__(self, id):
        # Error code taken from the media selection response
        self.id = id
302
2e3fd9ec
S
def _extract_asx_playlist(self, connection, programme_id):
    """Download the ASX playlist at the connection's ``href`` and return
    the ``href`` of every ``./Entry/ref`` element, in document order."""
    asx_url = connection.get('href')
    asx_doc = self._download_xml(asx_url, programme_id, 'Downloading ASX playlist')
    hrefs = []
    for ref in asx_doc.findall('./Entry/ref'):
        hrefs.append(ref.get('href'))
    return hrefs
306
2e3fd9ec 307 def _extract_items(self, playlist):
e6174ee9
S
308 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
309
2e3fd9ec 310 def _extract_medias(self, media_selection):
29f7c58a 311 error = media_selection.get('result')
312 if error:
313 raise BBCCoUkIE.MediaSelectionError(error)
314 return media_selection.get('media') or []
2e3fd9ec
S
315
316 def _extract_connections(self, media):
29f7c58a 317 return media.get('connection') or []
2e3fd9ec 318
def _get_subtitles(self, media, programme_id):
    """Build the subtitles dict for a captions *media* entry.

    The first connection whose ``href`` is a valid URL and whose captions
    document downloads as an XML element is used; it is reported as a
    single English ``ttml`` track. Returns an empty dict when no
    connection qualifies.
    """
    for connection in self._extract_connections(media):
        href = connection.get('href')
        captions_url = url_or_none(href)
        if not captions_url:
            continue
        # The document is fetched only to verify that it is retrievable
        captions = self._download_xml(
            captions_url, programme_id, 'Downloading captions', fatal=False)
        if isinstance(captions, compat_etree_Element):
            return {
                'en': [
                    {
                        'url': href,
                        'ext': 'ttml',
                    },
                ],
            }
    return {}
082c6c86 337
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Always raise an expected ExtractorError naming this extractor and
    the ``id`` of *media_selection_error*."""
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
342
def _download_media_selector(self, programme_id):
    """Query the media selector for *programme_id*, trying each media set
    of ``_MEDIA_SETS`` in order.

    Returns the ``(formats, subtitles)`` pair of the first media set that
    succeeds. The errors ``notukerror``, ``geolocation`` and
    ``selectionunavailable`` move on to the next media set; any other
    media selection error is raised immediately as an ExtractorError.
    When every media set fails, the last error is raised.
    """
    retryable_errors = ('notukerror', 'geolocation', 'selectionunavailable')
    last_exception = None
    for media_set in self._MEDIA_SETS:
        selector_url = self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id)
        try:
            return self._download_media_selector_url(selector_url, programme_id)
        except BBCCoUkIE.MediaSelectionError as e:
            if e.id not in retryable_errors:
                self._raise_extractor_error(e)
            last_exception = e
    self._raise_extractor_error(last_exception)
9afa1770
S
355
def _download_media_selector_url(self, url, programme_id=None):
    """Download the media selection JSON at *url* and process it.

    HTTP 403 and 404 responses are passed as expected statuses to the
    download call. Returns whatever `_process_media_selector` returns.
    """
    note = 'Downloading media selection JSON'
    selection = self._download_json(
        url, programme_id, note, expected_status=(403, 404))
    return self._process_media_selector(selection, programme_id)
082c6c86 361
def _process_media_selector(self, media_selection, programme_id):
    """Turn a media selection response into ``(formats, subtitles)``.

    ``video`` and ``audio`` media entries contribute formats, one or more
    per connection; a connection whose ``href`` was already seen is
    skipped. ``captions`` entries set the subtitles. ``subtitles`` stays
    None when the response has no captions entry.
    """
    formats = []
    subtitles = None
    seen_hrefs = []

    for media in self._extract_medias(media_selection):
        kind = media.get('kind')
        if kind == 'captions':
            subtitles = self.extract_subtitles(media, programme_id)
            continue
        if kind not in ('video', 'audio'):
            continue

        bitrate = int_or_none(media.get('bitrate'))
        encoding = media.get('encoding')
        width = int_or_none(media.get('width'))
        height = int_or_none(media.get('height'))
        file_size = int_or_none(media.get('media_file_size'))

        for connection in self._extract_connections(media):
            href = connection.get('href')
            if href in seen_hrefs:
                continue
            if href:
                seen_hrefs.append(href)

            supplier = connection.get('supplier')
            conn_kind = connection.get('kind')
            protocol = connection.get('protocol')
            transfer_format = connection.get('transferFormat')
            format_id = supplier or conn_kind or protocol

            # ASX playlist
            if supplier == 'asx':
                asx_refs = self._extract_asx_playlist(connection, programme_id)
                formats.extend([{
                    'url': ref,
                    'format_id': 'ref%s_%s' % (i, format_id),
                } for i, ref in enumerate(asx_refs)])
                continue
            if transfer_format == 'dash':
                formats.extend(self._extract_mpd_formats(
                    href, programme_id, mpd_id=format_id, fatal=False))
                continue
            if transfer_format == 'hls':
                formats.extend(self._extract_m3u8_formats(
                    href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                    m3u8_id=format_id, fatal=False))
                continue
            if transfer_format == 'hds':
                formats.extend(self._extract_f4m_formats(
                    href, programme_id, f4m_id=format_id, fatal=False))
                continue

            # Plain connection: describe it from the media entry's metadata
            if not supplier and bitrate:
                format_id += '-%d' % bitrate
            fmt = {
                'format_id': format_id,
                'filesize': file_size,
            }
            if kind == 'video':
                fmt['width'] = width
                fmt['height'] = height
                fmt['tbr'] = bitrate
                fmt['vcodec'] = encoding
            else:
                fmt['abr'] = bitrate
                fmt['acodec'] = encoding
                fmt['vcodec'] = 'none'

            if protocol in ('http', 'https'):
                # Direct link
                fmt['url'] = href
            elif protocol == 'rtmp':
                application = connection.get('application', 'ondemand')
                auth_string = connection.get('authString')
                identifier = connection.get('identifier')
                server = connection.get('server')
                fmt.update({
                    'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                    'play_path': identifier,
                    'app': '%s?%s' % (application, auth_string),
                    'page_url': 'http://www.bbc.co.uk',
                    'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                    'rtmp_live': False,
                    'ext': 'flv',
                })
            else:
                # Any other protocol is dropped
                continue
            formats.append(fmt)

    return formats, subtitles
2e3fd9ec 448
ae6986fb
S
def _download_playlist(self, playlist_id):
    """Fetch programme metadata, formats and subtitles for *playlist_id*.

    Downloads ``/programmes/<id>/playlist.json`` and, for every entry of
    ``allAvailableVersions``, runs the media selector on each item of kind
    ``programme`` or ``radioProgramme``. Formats and subtitles of all
    versions are merged; each format gets the version's types as
    ``format_note``, and audio-described versions are given a lower
    language preference.

    The legacy XML playlist is used instead when an ExtractorError caused
    by an HTTP 404 is raised during this processing, or when the JSON
    playlist contains no programme item at all. The latter case used to
    crash with UnboundLocalError on the final return. Any other
    ExtractorError is re-raised.

    Returns a tuple
    ``(programme_id, title, description, duration, formats, subtitles)``.
    """
    try:
        playlist = self._download_json(
            'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
            playlist_id, 'Downloading playlist JSON')
        formats = []
        subtitles = {}
        # Set once a programme item has been processed; guards the return
        # below against names that were never bound.
        found_programme = False

        for version in playlist.get('allAvailableVersions', []):
            smp_config = version['smpConfig']
            title = smp_config['title']
            description = smp_config['summary']
            for item in smp_config['items']:
                kind = item['kind']
                if kind not in ('programme', 'radioProgramme'):
                    continue
                programme_id = item.get('vpid')
                duration = int_or_none(item.get('duration'))
                version_formats, version_subtitles = self._download_media_selector(programme_id)
                found_programme = True
                types = version['types']
                for f in version_formats:
                    f['format_note'] = ', '.join(types)
                    if any('AudioDescribed' in x for x in types):
                        f['language_preference'] = -10
                formats += version_formats
                for tag, subformats in (version_subtitles or {}).items():
                    subtitles.setdefault(tag, []).extend(subformats)

        if found_programme:
            return programme_id, title, description, duration, formats, subtitles
    except ExtractorError as ee:
        if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
            raise

    # fallback to legacy playlist
    return self._process_legacy_playlist(playlist_id)
484
def _process_legacy_playlist_url(self, url, display_id):
    """Download the legacy playlist at *url* and extract it, returning
    the result of `_extract_from_legacy_playlist`."""
    return self._extract_from_legacy_playlist(
        self._download_legacy_playlist_url(url, display_id), display_id)
488
def _process_legacy_playlist(self, playlist_id):
    """Process the legacy iPlayer playlist belonging to *playlist_id*."""
    legacy_url = 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id
    return self._process_legacy_playlist_url(legacy_url, playlist_id)
492
def _download_legacy_playlist_url(self, url, playlist_id=None):
    """Download and parse the legacy playlist XML located at *url*."""
    note = 'Downloading legacy playlist XML'
    return self._download_xml(url, playlist_id, note)
ae6986fb 496
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract the first programme of a legacy (EMP) XML playlist.

    Returns a tuple
    ``(programme_id, title, description, duration, formats, subtitles)``
    for the first item of kind ``programme`` or ``radioProgramme``.

    When the item's ``mediator`` child carries a programme id, formats
    come from the media selector. Otherwise the item itself is processed
    as a media selection and *playlist_id* is used as the id.

    Raises an expected ExtractorError when the playlist has a ``noItems``
    element, with a message derived from its ``reason``. Raises a plain
    ExtractorError when the playlist has no programme item; this used to
    return None implicitly and fail later when callers unpacked it.
    """
    ns = self._EMP_PLAYLIST_NS

    no_items = playlist.find('./{%s}noItems' % ns)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First of the 'identifier'/'group' attributes that looks like a pid
        for p in ('identifier', 'group'):
            value = element.get(p)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value

    def get_programme_id(item):
        # NOTE(review): the previous code also evaluated
        # get_from_attributes(item) here but discarded the result, so only
        # the mediator was ever honoured. That behaviour is kept; confirm
        # whether the item's own attributes were meant to take precedence.
        mediator = item.find('./{%s}mediator' % ns)
        if mediator is not None:
            return get_from_attributes(mediator)

    for item in self._extract_items(playlist):
        kind = item.get('kind')
        if kind not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % ns).text
        description_el = playlist.find('./{%s}summary' % ns)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

        return programme_id, title, description, duration, formats, subtitles

    # Every caller unpacks six values, so fail explicitly here
    raise ExtractorError('No programme found in playlist %s' % playlist_id)
540
c056efa2
S
def _real_extract(self, url):
    """Extract a single programme from a bbc.co.uk page.

    The vpid is taken from the ``mediator.bind(...)`` player JSON when
    present, otherwise from a ``"vpid"`` occurrence in the page. With a
    vpid, formats come from the media selector and title/description from
    the page; without one, everything comes from the programme playlist.

    Raises an expected ExtractorError carrying the page's own message when
    the page shows a player error message.
    """
    group_id = self._match_id(url)
    webpage = self._download_webpage(url, group_id, 'Downloading video page')

    page_error = self._search_regex(
        r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
        webpage, 'error', default=None)
    if page_error:
        raise ExtractorError(page_error, expected=True)

    programme_id = duration = None

    player_json = self._search_regex(
        r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
        webpage, 'player', default=None)
    if player_json:
        player = self._parse_json(player_json, group_id).get('player', {})
        duration = int_or_none(player.get('duration'))
        programme_id = player.get('vpid')

    if not programme_id:
        programme_id = self._search_regex(
            r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)

    if not programme_id:
        # No vpid on the page: the playlist supplies all of the metadata
        (programme_id, title, description,
         duration, formats, subtitles) = self._download_playlist(group_id)
    else:
        formats, subtitles = self._download_media_selector(programme_id)
        title = self._og_search_title(webpage, default=None)
        if not title:
            title = self._html_search_regex(
                (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
                 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
        description = self._search_regex(
            (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
             r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
            webpage, 'description', default=None) or self._html_search_meta('description', webpage)

    self._sort_formats(formats)

    thumbnail = self._og_search_thumbnail(webpage, default=None)
    return {
        'id': programme_id,
        'title': title,
        'description': description,
        'thumbnail': thumbnail,
        'duration': duration,
        'formats': formats,
        'subtitles': subtitles,
    }
10273d6e 593
594
9afa1770
S
595class BBCIE(BBCCoUkIE):
IE_NAME = 'bbc'
IE_DESC = 'BBC'
# Any bbc.com / bbc.co.uk URL with at least one path segment before the
# captured id; more specific extractors are excluded in suitable().
_VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'

# Replaces the parent's media sets; the inherited media selector download
# tries them in this order.
_MEDIA_SETS = [
    'pc',
    'mobile-tablet-main',
]
10273d6e 604
605 _TESTS = [{
6a747190 606 # article with multiple videos embedded with data-playable containing vpids
10273d6e 607 'url': 'http://www.bbc.com/news/world-europe-32668511',
608 'info_dict': {
609 'id': 'world-europe-32668511',
acc86c9a 610 'title': 'Russia stages massive WW2 parade',
9afa1770 611 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 612 },
613 'playlist_count': 2,
a3bfddfa 614 }, {
6a747190 615 # article with multiple videos embedded with data-playable (more videos)
10273d6e 616 'url': 'http://www.bbc.com/news/business-28299555',
617 'info_dict': {
618 'id': 'business-28299555',
619 'title': 'Farnborough Airshow: Video highlights',
9afa1770 620 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 621 },
622 'playlist_count': 9,
9afa1770 623 'skip': 'Save time',
88ed52ae
S
624 }, {
625 # article with multiple videos embedded with `new SMP()`
6a747190 626 # broken
88ed52ae
S
627 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
628 'info_dict': {
629 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 630 'title': 'BUGGER',
88ed52ae
S
631 },
632 'playlist_count': 18,
a3bfddfa 633 }, {
6a747190 634 # single video embedded with data-playable containing vpid
10273d6e 635 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 636 'info_dict': {
637 'id': 'p02mprgb',
55ebae26 638 'ext': 'mp4',
10273d6e 639 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 640 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 641 'duration': 47,
9afa1770 642 'timestamp': 1427219242,
da92eeae 643 'upload_date': '20150324',
10273d6e 644 },
645 'params': {
9afa1770 646 # rtmp download
10273d6e 647 'skip_download': True,
648 }
a3bfddfa 649 }, {
6a747190
S
650 # article with single video embedded with data-playable containing XML playlist
651 # with direct video links as progressiveDownloadUrl (for now these are extracted)
652 # and playlist with f4m and m3u8 as streamingUrl
de939d89 653 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 654 'info_dict': {
9afa1770 655 'id': '150615_telabyad_kentin_cogu',
de939d89 656 'ext': 'mp4',
ad152e2d 657 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 658 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 659 'timestamp': 1434397334,
da92eeae 660 'upload_date': '20150615',
de939d89 661 },
662 'params': {
663 'skip_download': True,
664 }
c936d8cc 665 }, {
6a747190 666 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 667 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 668 'info_dict': {
9afa1770 669 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 670 'ext': 'mp4',
9afa1770 671 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 672 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 673 'timestamp': 1434713142,
da92eeae 674 'upload_date': '20150619',
de939d89 675 },
676 'params': {
677 'skip_download': True,
678 }
a346b1ff
S
679 }, {
680 # single video from video playlist embedded with vxp-playlist-data JSON
681 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
682 'info_dict': {
683 'id': 'p02w6qjc',
55ebae26 684 'ext': 'mp4',
a346b1ff
S
685 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
686 'duration': 56,
0bc4ee60 687 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
688 },
689 'params': {
690 'skip_download': True,
691 }
9afa1770
S
692 }, {
693 # single video story with digitalData
694 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
695 'info_dict': {
696 'id': 'p02q6gc4',
697 'ext': 'flv',
698 'title': 'Sri Lanka’s spicy secret',
699 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
700 'timestamp': 1437674293,
701 'upload_date': '20150723',
702 },
703 'params': {
704 # rtmp download
705 'skip_download': True,
706 }
707 }, {
708 # single video story without digitalData
709 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
710 'info_dict': {
711 'id': 'p018zqqg',
55ebae26 712 'ext': 'mp4',
9afa1770
S
713 'title': 'Hyundai Santa Fe Sport: Rock star',
714 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
715 'timestamp': 1415867444,
716 'upload_date': '20141113',
9afa1770
S
717 },
718 'params': {
719 # rtmp download
720 'skip_download': True,
721 }
9fb64c04
S
722 }, {
723 # single video embedded with Morph
724 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
725 'info_dict': {
726 'id': 'p041vhd0',
727 'ext': 'mp4',
728 'title': "Nigeria v Japan - Men's First Round",
729 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
730 'duration': 7980,
731 'uploader': 'BBC Sport',
732 'uploader_id': 'bbc_sport',
733 },
734 'params': {
735 # m3u8 download
736 'skip_download': True,
9fb64c04
S
737 },
738 'skip': 'Georestricted to UK',
9afa1770 739 }, {
6a747190 740 # single video with playlist.sxml URL in playlist param
9afa1770
S
741 'url': 'http://www.bbc.com/sport/0/football/33653409',
742 'info_dict': {
743 'id': 'p02xycnp',
55ebae26 744 'ext': 'mp4',
9afa1770 745 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 746 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
747 'duration': 140,
748 },
749 'params': {
750 # rtmp download
751 'skip_download': True,
752 }
b5d48cb1 753 }, {
6a747190 754 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
755 'url': 'http://www.bbc.com/sport/0/football/34475836',
756 'info_dict': {
757 'id': '34475836',
450b233c 758 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 759 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
760 },
761 'playlist_count': 3,
450b233c
S
762 }, {
763 # school report article with single video
764 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
765 'info_dict': {
766 'id': '35744779',
767 'title': 'School which breaks down barriers in Jerusalem',
768 },
769 'playlist_count': 1,
9afa1770
S
770 }, {
771 # single video with playlist URL from weather section
772 'url': 'http://www.bbc.com/weather/features/33601775',
773 'only_matching': True,
774 }, {
775 # custom redirection to www.bbc.com
1bdae7d3 776 # also, video with window.__INITIAL_DATA__
9afa1770 777 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
1bdae7d3 778 'info_dict': {
779 'id': 'p02xzws1',
780 'ext': 'mp4',
781 'title': "Pluto may have 'nitrogen glaciers'",
782 'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
783 'thumbnail': r're:https?://.+/.+\.jpg',
784 'timestamp': 1437785037,
785 'upload_date': '20150725',
786 },
a1cf3e38
S
787 }, {
788 # single video article embedded with data-media-vpid
789 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
790 'only_matching': True,
6d155707
S
791 }, {
792 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
793 'info_dict': {
794 'id': 'p06556y7',
795 'ext': 'mp4',
796 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
797 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd',
798 },
799 'params': {
800 'skip_download': True,
801 }
b96b4be4
RA
802 }, {
803 # window.__PRELOADED_STATE__
804 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
805 'info_dict': {
806 'id': 'b0b9z4vz',
807 'ext': 'mp4',
808 'title': 'Prom 6: An American in Paris and Turangalila',
809 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
810 'uploader': 'Radio 3',
811 'uploader_id': 'bbc_radio_three',
812 },
373941c5
S
813 }, {
814 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
815 'info_dict': {
816 'id': 'p06w9tws',
817 'ext': 'mp4',
818 'title': 'md5:2fabf12a726603193a2879a055f72514',
819 'description': 'Learn English words and phrases from this story',
820 },
821 'add_ie': [BBCCoUkIE.ie_key()],
3721515b 822 }, {
823 # BBC Reel
824 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
825 'info_dict': {
826 'id': 'p07c6sb9',
827 'ext': 'mp4',
828 'title': 'How positive thinking is harming your happiness',
829 'alt_title': 'The downsides of positive thinking',
830 'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
831 'duration': 235,
832 'thumbnail': r're:https?://.+/p07c9dsr.jpg',
833 'upload_date': '20190604',
834 'categories': ['Psychology'],
835 },
10273d6e 836 }]
837
9afa1770
S
@classmethod
def suitable(cls, url):
    """Return False when one of the listed bbc.co.uk extractors accepts
    *url*; otherwise defer to the inherited check."""
    EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
    for ie in EXCLUDE_IE:
        if ie.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
843
def _extract_from_media_meta(self, media_meta, video_id):
    """Return a ``(formats, subtitles)`` pair for one media metadata dict.

    Sources are tried in order: direct ``sourceFiles`` links, the media
    selector (via ``externalId``), then a legacy playlist (via ``href``).
    Two empty lists are returned when none of them is present.
    """
    # Direct links to media in media metadata (e.g.
    # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
    # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
    source_files = media_meta.get('sourceFiles')
    if source_files:
        direct_formats = []
        for format_id, f in source_files.items():
            if not f.get('url'):
                continue
            direct_formats.append({
                'url': f['url'],
                'format_id': format_id,
                'ext': f.get('encoding'),
                'tbr': float_or_none(f.get('bitrate'), 1000),
                'filesize': int_or_none(f.get('filesize')),
            })
        return direct_formats, []

    programme_id = media_meta.get('externalId')
    if programme_id:
        return self._download_media_selector(programme_id)

    # Process playlist.sxml as legacy playlist
    href = media_meta.get('href')
    if not href:
        return [], []

    legacy_playlist = self._download_legacy_playlist_url(href)
    legacy_info = self._extract_from_legacy_playlist(legacy_playlist, video_id)
    # Only the last two members (formats, subtitles) are of interest here
    _, _, _, _, formats, subtitles = legacy_info
    return formats, subtitles
870
baf39a1a
S
def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
    """Build an info dict from a legacy playlist.sxml URL.

    The caller-supplied *timestamp* is passed through unchanged; formats
    are sorted before being returned.
    """
    legacy_info = self._process_legacy_playlist_url(url, playlist_id)
    programme_id, title, description, duration, formats, subtitles = legacy_info
    self._sort_formats(formats)
    info = {
        'id': programme_id,
        'title': title,
        'description': description,
        'duration': duration,
        'timestamp': timestamp,
        'formats': formats,
        'subtitles': subtitles,
    }
    return info
884
def _real_extract(self, url):
    """Extract one video or a playlist of videos from a generic BBC page.

    The page is downloaded once and then probed with a sequence of
    embedding strategies; the first strategy that yields media returns:

    1. playlist.sxml embeds and ``data-playable`` attributes (playlist)
    2. a ``data-pid`` group that is delegated to BBCCoUkIE
    3. a single vpid found in the markup
    4. BBC Reel ``initial-data`` JSON
    5. Morph payload
    6. ``window.__PRELOADED_STATE__``
    7. ``bbcthreeConfig``
    8. ``window.__INITIAL_DATA__``
    9. SMP / ``setPlaylist`` embed URLs (delegated to BBCCoUkIE)
    10. ``data-media-meta``, ``mediaAssetPage.init`` or vxp playlist data

    Results of non-fatal JSON parsing are guarded against ``None`` before
    being dereferenced so that a malformed blob falls through to the
    next strategy instead of raising AttributeError/TypeError.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title')
    if not playlist_title:
        playlist_title = self._og_search_title(
            webpage, default=None) or self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
        if playlist_title:
            # Drop the trailing " - BBC ..." site suffix
            playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
    if data_playables:
        for _, data_playable_json in data_playables:
            data_playable = self._parse_json(
                unescapeHTML(data_playable_json), playlist_id, fatal=False)
            if not data_playable:
                continue
            settings = data_playable.get('settings', {})
            if settings:
                # data-playable with video vpid in settings.playlistObject.items (e.g.
                # http://www.bbc.com/news/world-us-canada-34473351)
                playlist_object = settings.get('playlistObject', {})
                if playlist_object:
                    items = playlist_object.get('items')
                    if items and isinstance(items, list):
                        title = playlist_object['title']
                        description = playlist_object.get('summary')
                        duration = int_or_none(items[0].get('duration'))
                        programme_id = items[0].get('vpid')
                        formats, subtitles = self._download_media_selector(programme_id)
                        self._sort_formats(formats)
                        entries.append({
                            'id': programme_id,
                            'title': title,
                            'description': description,
                            'timestamp': timestamp,
                            'duration': duration,
                            'formats': formats,
                            'subtitles': subtitles,
                        })
                else:
                    # data-playable without vpid but with a playlist.sxml URLs
                    # in otherSettings.playlist (e.g.
                    # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
                    playlist = data_playable.get('otherSettings', {}).get('playlist', {})
                    if playlist:
                        entry = None
                        for key in ('streaming', 'progressiveDownload'):
                            playlist_url = playlist.get('%sUrl' % key)
                            if not playlist_url:
                                continue
                            try:
                                info = self._extract_from_playlist_sxml(
                                    playlist_url, playlist_id, timestamp)
                                if not entry:
                                    entry = info
                                else:
                                    # Merge the second playlist into the first entry
                                    entry['title'] = info['title']
                                    entry['formats'].extend(info['formats'])
                            except ExtractorError as e:
                                # Some playlist URL may fail with 500, at the same time
                                # the other one may work fine (e.g.
                                # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
                                    continue
                                raise
                        if entry:
                            self._sort_formats(entry['formats'])
                            entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
    group_id = self._search_regex(
        r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
        webpage, 'group id', default=None)
    if group_id:
        return self.url_result(
            'https://www.bbc.co.uk/programmes/%s' % group_id,
            ie=BBCCoUkIE.ie_key())

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        self._sort_formats(formats)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # Parsing is non-fatal, so fall back to an empty dict when the
        # blob is not valid JSON instead of dereferencing None.
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = try_get(
            digital_data, lambda x: x['page']['pageInfo'], dict) or {}
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
    initial_data = self._parse_json(self._html_search_regex(
        r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
        webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
    if initial_data:
        init_data = try_get(
            initial_data, lambda x: x['initData']['items'][0], dict) or {}
        smp_data = init_data.get('smpData') or {}
        clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
        version_id = clip_data.get('versionID')
        if version_id:
            title = smp_data['title']
            formats, subtitles = self._download_media_selector(version_id)
            self._sort_formats(formats)
            image_url = smp_data.get('holdingImageURL')
            display_date = init_data.get('displayDate')
            topic_title = init_data.get('topicTitle')

            return {
                'id': version_id,
                'title': title,
                'formats': formats,
                'alt_title': init_data.get('shortTitle'),
                'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
                'description': smp_data.get('summary') or init_data.get('shortSummary'),
                'upload_date': display_date.replace('-', '') if display_date else None,
                'subtitles': subtitles,
                'duration': int_or_none(clip_data.get('duration')),
                'categories': [topic_title] if topic_title else None,
            }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # There are several setPayload calls may be present but the video
    # seems to be always related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    preload_state = self._parse_json(self._search_regex(
        r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
        'preload state', default='{}'), playlist_id, fatal=False)
    if preload_state:
        # `.get(key, {})` does not protect against explicit JSON nulls,
        # hence try_get / `or {}` for the nested lookups below.
        current_programme = try_get(
            preload_state, lambda x: x['programmes']['current'], dict) or {}
        programme_id = current_programme.get('id')
        if current_programme and programme_id and current_programme.get('type') == 'playable_item':
            title = (current_programme.get('titles') or {}).get('tertiary') or playlist_title
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            synopses = current_programme.get('synopses') or {}
            network = current_programme.get('network') or {}
            duration = int_or_none(
                (current_programme.get('duration') or {}).get('value'))
            thumbnail = None
            image_url = current_programme.get('image_url')
            if image_url:
                thumbnail = image_url.replace('{recipe}', 'raw')
            return {
                'id': programme_id,
                'title': title,
                'description': dict_get(synopses, ('long', 'medium', 'short')),
                'thumbnail': thumbnail,
                'duration': duration,
                'uploader': network.get('short_title'),
                'uploader_id': network.get('id'),
                'formats': formats,
                'subtitles': subtitles,
            }

    bbc3_config = self._parse_json(
        self._search_regex(
            r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
            'bbcthree config', default='{}'),
        playlist_id, transform_source=js_to_json, fatal=False) or {}
    payload = bbc3_config.get('payload') or {}
    if payload:
        clip = payload.get('currentClip') or {}
        clip_vpid = clip.get('vpid')
        clip_title = clip.get('title')
        if clip_vpid and clip_title:
            formats, subtitles = self._download_media_selector(clip_vpid)
            self._sort_formats(formats)
            return {
                'id': clip_vpid,
                'title': clip_title,
                'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
                'description': clip.get('description'),
                'duration': parse_duration(clip.get('duration')),
                'formats': formats,
                'subtitles': subtitles,
            }
        bbc3_playlist = try_get(
            payload, lambda x: x['content']['bbcMedia']['playlist'],
            dict)
        if bbc3_playlist:
            playlist_title = bbc3_playlist.get('title') or playlist_title
            thumbnail = bbc3_playlist.get('holdingImageURL')
            entries = []
            for bbc3_item in bbc3_playlist['items']:
                programme_id = bbc3_item.get('versionID')
                if not programme_id:
                    continue
                formats, subtitles = self._download_media_selector(programme_id)
                self._sort_formats(formats)
                entries.append({
                    'id': programme_id,
                    'title': playlist_title,
                    'thumbnail': thumbnail,
                    'timestamp': timestamp,
                    'formats': formats,
                    'subtitles': subtitles,
                })
            return self.playlist_result(
                entries, playlist_id, playlist_title, playlist_description)

    # window.__INITIAL_DATA__ holds a JSON string that itself contains
    # JSON, so it is decoded twice. The second decode is skipped when the
    # first one fails, since _parse_json must not be handed None.
    initial_data_json = self._parse_json(self._search_regex(
        r'window\.__INITIAL_DATA__\s*=\s*("{.+?}");', webpage,
        'preload state', default='"{}"'), playlist_id, fatal=False)
    initial_data = None
    if initial_data_json:
        initial_data = self._parse_json(
            initial_data_json, playlist_id, fatal=False)
    if initial_data:
        def parse_media(media):
            # Appends one entry per playable item of `media` to `entries`
            if not media:
                return
            for item in (try_get(media, lambda x: x['media']['items'], list) or []):
                item_id = item.get('id')
                item_title = item.get('title')
                if not (item_id and item_title):
                    continue
                formats, subtitles = self._download_media_selector(item_id)
                self._sort_formats(formats)
                item_desc = None
                blocks = try_get(media, lambda x: x['summary']['blocks'], list)
                if blocks:
                    summary = []
                    for block in blocks:
                        text = try_get(block, lambda x: x['model']['text'], compat_str)
                        if text:
                            summary.append(text)
                    if summary:
                        item_desc = '\n\n'.join(summary)
                item_time = None
                for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
                    if try_get(meta, lambda x: x['label']) == 'Published':
                        item_time = unified_timestamp(meta.get('timestamp'))
                        break
                entries.append({
                    'id': item_id,
                    'title': item_title,
                    'thumbnail': item.get('holdingImageUrl'),
                    'formats': formats,
                    'subtitles': subtitles,
                    'timestamp': item_time,
                    'description': strip_or_none(item_desc),
                })
        for resp in (initial_data.get('data') or {}).values():
            name = resp.get('name')
            if name == 'media-experience':
                parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
            elif name == 'article':
                for block in (try_get(resp, lambda x: x['data']['content']['model']['blocks'], list) or []):
                    if block.get('type') != 'media':
                        continue
                    parse_media(block.get('model'))
        return self.playlist_result(
            entries, playlist_id, playlist_title, playlist_description)

    def extract_all(pattern):
        # All matches of `pattern` that parse as (non-empty) JSON
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # Non-fatal parse: treat an unparsable blob as "no videos"
            media_asset_page = self._parse_json(
                media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in (media_asset_page.get('videos') or {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats and not self.get_param('ignore_no_formats'):
            continue
        self._sort_formats(formats)

        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1320
1321
class BBCCoUkArticleIE(InfoExtractor):
    """Playlist extractor for bbc.co.uk/programmes/articles/<id> pages."""

    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist of the clips referenced by the article page."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # The description is optional metadata: strip_or_none tolerates a
        # missing og:description instead of calling .strip() on None.
        description = strip_or_none(self._og_search_description(webpage))

        # Each embedded clip is handed over as a plain URL result
        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1350
1351
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared logic for paginated bbc.co.uk listing pages.

    Subclasses provide ``_VIDEO_ID_TEMPLATE``, ``_URL_TEMPLATE`` and
    ``_extract_title_and_description``.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a URL result for every video id found on the listing.

        When the URL carries an explicit ``page`` query parameter only
        that page is processed; otherwise the "next" pagination links are
        followed until none is left.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # The first page is already downloaded, so counting starts at 2
        for page_num in itertools.count(2):
            for video_id in re.findall(
                    self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # Pass both notes by keyword: the bare page number used to be
            # handed over positionally in the error-note slot.
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                note='Downloading page %d' % page_num,
                errnote='Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Return a lazily evaluated playlist for the listing at *url*."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
ded7511a
S
1382
1383
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
    """Common pagination and entry building for iPlayer playlist pages.

    Subclasses supply ``_PAGE_SIZE``, ``_DESCRIPTION_KEY`` and the
    ``_call_api`` / ``_get_*`` accessors used below.
    """

    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    @staticmethod
    def _get_default(episode, key, default_key='default'):
        """Return ``episode[key][default_key]`` via try_get."""
        return try_get(episode, lambda x: x[key][default_key])

    def _get_description(self, data):
        """Pick the longest available synopsis stored under _DESCRIPTION_KEY."""
        synopses = data.get(self._DESCRIPTION_KEY) or {}
        return dict_get(synopses, ('large', 'medium', 'small'))

    def _fetch_page(self, programme_id, per_page, series_id, page):
        """Yield url-type entries for the zero-based *page* of the listing."""
        # The API is called with page + 1
        api_response = self._call_api(
            programme_id, per_page, page + 1, series_id)
        for element in self._get_elements(api_response):
            episode = self._get_episode(element)
            episode_id = episode.get('id')
            if not episode_id:
                continue
            image = self._get_episode_image(episode)
            thumbnail = image.replace('{recipe}', 'raw') if image else None
            category = self._get_default(episode, 'labels', 'category')
            yield {
                '_type': 'url',
                'id': episode_id,
                'title': self._get_episode_field(episode, 'subtitle'),
                'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                'thumbnail': thumbnail,
                'description': self._get_description(episode),
                'categories': [category] if category else None,
                'series': self._get_episode_field(episode, 'title'),
                'ie_key': BBCCoUkIE.ie_key(),
            }

    def _real_extract(self, url):
        """Return the playlist for *url*, honouring ``seriesId`` and ``page``."""
        pid = self._match_id(url)
        query = parse_qs(url)
        series_id = query.get('seriesId', [None])[0]
        page = query.get('page', [None])[0]

        if page:
            # Explicit page requested: fixed page size of 36, single page only
            fetch_page = functools.partial(self._fetch_page, pid, 36, series_id)
            entries = fetch_page(int(page) - 1)
        else:
            fetch_page = functools.partial(
                self._fetch_page, pid, self._PAGE_SIZE, series_id)
            entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)

        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
        playlist_title = self._get_playlist_title(playlist_data)
        playlist_description = self._get_description(playlist_data)
        return self.playlist_result(
            entries, pid, playlist_title, playlist_description)
1432
1433
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/episodes/<pid>`` listings, served by the graph.ibl API."""

    IE_NAME = 'bbc.co.uk:iplayer:episodes'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
    _PAGE_SIZE = 100
    _DESCRIPTION_KEY = 'synopsis'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
        },
        'playlist_mincount': 8,
    }, {
        # all seasons
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 10,
    }, {
        # explicit season
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 5,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 37,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 1,
    }]

    def _get_episode_image(self, episode):
        """Return the episode's default image template, if any."""
        return self._get_default(episode, 'image')

    def _get_episode_field(self, episode, field):
        """Return the default variant of *field* for the episode."""
        return self._get_default(episode, field)

    @staticmethod
    def _get_elements(data):
        """Return the result list of an API response."""
        entities = data['entities']
        return entities['results']

    @staticmethod
    def _get_episode(element):
        """Return the episode dict of a result element (empty if absent)."""
        return element.get('episode') or {}

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """POST the programme query and return its ``data.programme`` node."""
        variables = {
            'id': pid,
            'page': page,
            'perPage': per_page,
        }
        if series_id:
            # Restrict the listing to a single series
            variables['sliceId'] = series_id
        request_body = json.dumps({
            'id': '5692d93d5aac8d796a0305e895e61551',
            'variables': variables,
        }).encode('utf-8')
        response = self._download_json(
            'https://graph.ibl.api.bbc.co.uk/', pid,
            headers={'Content-Type': 'application/json'},
            data=request_body)
        return response['data']['programme']

    @staticmethod
    def _get_playlist_data(data):
        """The programme node itself carries the playlist metadata."""
        return data

    def _get_playlist_title(self, data):
        """Return the default title variant of the programme."""
        return self._get_default(data, 'title')
1521
1522
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/group/<pid>`` listings, served by the ibl v1 groups API."""

    IE_NAME = 'bbc.co.uk:iplayer:group'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
    _PAGE_SIZE = 200
    _DESCRIPTION_KEY = 'synopses'
    _TESTS = [{
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 47,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 11,
    }]

    def _get_episode_image(self, episode):
        """Return the episode's standard image template, if any."""
        return self._get_default(episode, 'images', 'standard')

    def _get_episode_field(self, episode, field):
        """Group episodes expose their fields directly on the element."""
        return episode.get(field)

    @staticmethod
    def _get_elements(data):
        """Return the element list of an API response."""
        return data['elements']

    @staticmethod
    def _get_episode(element):
        """Each element already is the episode."""
        return element

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Fetch one page of the group and return its ``group_episodes`` node.

        *series_id* is accepted for signature parity but not used here.
        """
        api_url = 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid
        response = self._download_json(api_url, pid, query={
            'page': page,
            'per_page': per_page,
        })
        return response['group_episodes']

    @staticmethod
    def _get_playlist_data(data):
        """Playlist metadata lives under the ``group`` key."""
        return data['group']

    def _get_playlist_title(self, data):
        """Return the group's title."""
        return data.get('title')
ded7511a
S
1585
1586
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Programme ``episodes`` / ``broadcasts`` / ``clips`` listing pages."""

    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return the ``(title, description)`` pair from Open Graph metadata."""
        og_title = self._og_search_title(webpage, fatal=False)
        og_description = self._og_search_description(webpage)
        return og_title, og_description