]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/bbc.py
[ie/youtube] Deprioritize iOS client formats (#8337)
[yt-dlp.git] / yt_dlp / extractor / bbc.py
CommitLineData
1418a043 1import functools
254e64a2 2import itertools
1418a043 3import json
f0228f56 4import re
ac668111 5import xml.etree.ElementTree
082c6c86 6
f13b1e7d 7from .common import InfoExtractor
3d2623a8 8from ..compat import compat_str, compat_urlparse
9from ..networking.exceptions import HTTPError
8683b4d8 10from ..utils import (
3721515b 11 ExtractorError,
1418a043 12 OnDemandPagedList,
97067db2 13 clean_html,
9fb64c04 14 dict_get,
9afa1770 15 float_or_none,
97067db2 16 get_element_by_class,
8683b4d8 17 int_or_none,
eda0e415 18 join_nonempty,
6d155707 19 js_to_json,
9afa1770
S
20 parse_duration,
21 parse_iso8601,
4dfbf869 22 parse_qs,
1bdae7d3 23 strip_or_none,
eda0e415 24 traverse_obj,
9fb64c04 25 try_get,
dab062fb 26 unescapeHTML,
1bdae7d3 27 unified_timestamp,
f0228f56 28 url_or_none,
97067db2
S
29 urlencode_postdata,
30 urljoin,
8683b4d8 31)
082c6c86 32
d12a1a47 33
class BBCCoUkIE(InfoExtractor):
    IE_NAME = 'bbc.co.uk'
    IE_DESC = 'BBC iPlayer'
    # ID shape accepted in URLs (and reused when searching pages for "vpid"):
    # one of p/b/m/l followed by 7 digits or lowercase letters, or "w"
    # followed by 7 to 14 of them.
    _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
    # Verbose regex; the trailing negative lookahead rejects URLs that continue
    # with /episodes, /broadcasts or /clips after the ID.
    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?bbc\.co\.uk/
                        (?:
                            programmes/(?!articles/)|
                            iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
                            music/(?:clips|audiovideo/popular)[/#]|
                            radio/player/|
                            events/[^/]+/play/[^/]+/
                        )
                        (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
                    ''' % _ID_REGEX
    _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']

    # Sign-in page; fetched by _perform_login and also sent as the Referer
    # of the login POST.
    _LOGIN_URL = 'https://account.bbc.com/signin'
    _NETRC_MACHINE = 'bbc'

    # Filled with (media set, programme id) by _download_media_selector.
    _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
    # Media sets are tried in this order until one of them succeeds.
    _MEDIA_SETS = [
        # Provides HQ HLS streams with even better quality than the pc mediaset,
        # but in some cases fails with geolocation even when the programme is
        # not geo restricted at all (e.g.
        # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
        'iptv-all',
        'pc',
    ]

    # XML namespace of the elements looked up in legacy (EMP) playlists.
    _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
65
2e3fd9ec
S
66 _TESTS = [
67 {
f2d0fc68 68 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 69 'info_dict': {
f2d0fc68 70 'id': 'b039d07m',
b1ea6802 71 'ext': 'flv',
acc86c9a 72 'title': 'Kaleidoscope, Leonard Cohen',
c4914185 73 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
74 },
75 'params': {
b1ea6802 76 # rtmp download
2e3fd9ec
S
77 'skip_download': True,
78 }
082c6c86 79 },
2e3fd9ec
S
80 {
81 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
82 'info_dict': {
83 'id': 'b00yng1d',
84 'ext': 'flv',
85 'title': 'The Man in Black: Series 3: The Printed Name',
86 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
87 'duration': 1800,
88 },
89 'params': {
90 # rtmp download
91 'skip_download': True,
c7f0177f
S
92 },
93 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
94 },
95 {
96 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
97 'info_dict': {
98 'id': 'b00yng1d',
99 'ext': 'flv',
17968e44 100 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 101 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 102 'duration': 5100,
2e3fd9ec
S
103 },
104 'params': {
105 # rtmp download
106 'skip_download': True,
107 },
b1ea6802 108 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
109 },
110 {
111 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
112 'info_dict': {
113 'id': 'b03k3pb7',
114 'ext': 'flv',
115 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
116 'description': '2. Invasion',
117 'duration': 3600,
118 },
119 'params': {
120 # rtmp download
121 'skip_download': True,
122 },
b1ea6802 123 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
124 }, {
125 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
126 'info_dict': {
127 'id': 'b04v209v',
128 'ext': 'flv',
129 'title': 'Pete Tong, The Essential New Tune Special',
130 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
131 'duration': 10800,
132 },
133 'params': {
134 # rtmp download
135 'skip_download': True,
a3ef0e1c
YCH
136 },
137 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 138 }, {
5aa535c3 139 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
140 'note': 'Audio',
141 'info_dict': {
5aa535c3 142 'id': 'p022h44j',
b1ea6802 143 'ext': 'flv',
5aa535c3
S
144 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
145 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
146 'duration': 227,
c7e67594
S
147 },
148 'params': {
b1ea6802 149 # rtmp download
c7e67594
S
150 'skip_download': True,
151 }
152 }, {
153 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
154 'note': 'Video',
155 'info_dict': {
156 'id': 'p025c103',
b1ea6802 157 'ext': 'flv',
c7e67594
S
158 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
159 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
160 'duration': 226,
161 },
162 'params': {
b1ea6802 163 # rtmp download
c7e67594
S
164 'skip_download': True,
165 }
e68ae99a
S
166 }, {
167 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
168 'info_dict': {
169 'id': 'p02n76xf',
170 'ext': 'flv',
171 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
172 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
173 'duration': 3540,
174 },
175 'params': {
176 # rtmp download
177 'skip_download': True,
178 },
b1ea6802 179 'skip': 'geolocation',
25fa8d66
YCH
180 }, {
181 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
182 'info_dict': {
183 'id': 'b05zmgw1',
184 'ext': 'flv',
185 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
186 'title': 'Royal Academy Summer Exhibition',
187 'duration': 3540,
188 },
189 'params': {
190 # rtmp download
191 'skip_download': True,
192 },
b1ea6802 193 'skip': 'geolocation',
54914380
S
194 }, {
195 # iptv-all mediaset fails with geolocation however there is no geo restriction
196 # for this programme at all
5aa535c3 197 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 198 'info_dict': {
5aa535c3 199 'id': 'b06rkms3',
54914380 200 'ext': 'flv',
5aa535c3
S
201 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
202 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
203 },
204 'params': {
205 # rtmp download
206 'skip_download': True,
207 },
b1ea6802 208 'skip': 'Now it\'s really geo-restricted',
1ac6e794 209 }, {
067aa17e 210 # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
1ac6e794
S
211 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
212 'info_dict': {
213 'id': 'p028bfkj',
b1ea6802 214 'ext': 'flv',
1ac6e794
S
215 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
216 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
217 },
218 'params': {
b1ea6802 219 # rtmp download
1ac6e794
S
220 'skip_download': True,
221 },
31763975
S
222 }, {
223 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
224 'only_matching': True,
c7e67594
S
225 }, {
226 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
227 'only_matching': True,
0692ef86
S
228 }, {
229 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
230 'only_matching': True,
f20a11ed
S
231 }, {
232 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
233 'only_matching': True,
72d256c4
S
234 }, {
235 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
236 'only_matching': True,
53647dfd
S
237 }, {
238 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
239 'only_matching': True,
6f356cbb
S
240 }, {
241 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
242 'only_matching': True,
243 }, {
244 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
245 'only_matching': True,
72d256c4 246 }]
2e3fd9ec 247
def _perform_login(self, username, password):
    """Sign in to the BBC account service with the given credentials.

    The sign-in page is downloaded first so that its hidden form inputs can
    be posted back together with the username and password. The login is
    treated as failed when the final response URL still contains the
    sign-in URL.

    Raises:
        ExtractorError: when the login did not succeed; the message shown
            on the page (element with class ``form-message``) is included
            when one is found.
    """
    signin_page = self._download_webpage(
        self._LOGIN_URL, None, 'Downloading signin page')

    form_data = self._hidden_inputs(signin_page)
    form_data['username'] = username
    form_data['password'] = password

    # The form may post somewhere else than the sign-in URL itself; the
    # sign-in URL is the default when no action attribute is found.
    form_action = self._search_regex(
        r'<form[^>]+action=(["\'])(?P<url>.+?)\1', signin_page,
        'post url', default=self._LOGIN_URL, group='url')
    post_url = urljoin(self._LOGIN_URL, form_action)

    response, urlh = self._download_webpage_handle(
        post_url, None, 'Logging in', data=urlencode_postdata(form_data),
        headers={'Referer': self._LOGIN_URL})

    if self._LOGIN_URL not in urlh.url:
        return

    error = clean_html(get_element_by_class('form-message', response))
    if error:
        raise ExtractorError(
            'Unable to login: %s' % error, expected=True)
    raise ExtractorError('Unable to log in')
273
d12a1a47
S
class MediaSelectionError(Exception):
    """Raised when a media selection response carries an error result.

    The error code taken from the response is available as ``id``.
    """

    def __init__(self, id):
        # Hand the code to Exception as well, so that str()/repr() and
        # tracebacks show it instead of an empty message.
        super().__init__(id)
        self.id = id
277
2e3fd9ec
S
278 def _extract_asx_playlist(self, connection, programme_id):
279 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
280 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
281
2e3fd9ec 282 def _extract_items(self, playlist):
e6174ee9
S
283 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
284
2e3fd9ec 285 def _extract_medias(self, media_selection):
29f7c58a 286 error = media_selection.get('result')
287 if error:
288 raise BBCCoUkIE.MediaSelectionError(error)
289 return media_selection.get('media') or []
2e3fd9ec
S
290
291 def _extract_connections(self, media):
29f7c58a 292 return media.get('connection') or []
2e3fd9ec 293
def _get_subtitles(self, media, programme_id):
    """Return English TTML subtitles from the first usable caption connection.

    Each connection with a valid URL is probed by downloading it as XML
    (non-fatally). The first connection whose download yields an XML
    element is returned as ``{'en': [...]}``; an empty dict is returned
    when none qualifies.
    """
    for connection in self._extract_connections(media):
        cc_url = url_or_none(connection.get('href'))
        if not cc_url:
            continue
        captions = self._download_xml(
            cc_url, programme_id, 'Downloading captions', fatal=False)
        # Anything other than a parsed element means this connection is
        # unusable; try the next one.
        if isinstance(captions, xml.etree.ElementTree.Element):
            return {
                'en': [
                    {
                        'url': connection.get('href'),
                        'ext': 'ttml',
                    },
                ],
            }
    return {}
082c6c86 312
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Raise an expected ExtractorError describing a media selection error.

    The message names this extractor (``IE_NAME``) and the error's ``id``.
    This method never returns normally.
    """
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
317
c056efa2 318 def _download_media_selector(self, programme_id):
d12a1a47 319 last_exception = None
29f7c58a 320 for media_set in self._MEDIA_SETS:
d12a1a47
S
321 try:
322 return self._download_media_selector_url(
29f7c58a 323 self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
d12a1a47 324 except BBCCoUkIE.MediaSelectionError as e:
d781e293 325 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
d12a1a47
S
326 last_exception = e
327 continue
328 self._raise_extractor_error(e)
329 self._raise_extractor_error(last_exception)
9afa1770
S
330
331 def _download_media_selector_url(self, url, programme_id=None):
29f7c58a 332 media_selection = self._download_json(
333 url, programme_id, 'Downloading media selection JSON',
9283d4ea 334 expected_status=(403, 404))
9afa1770 335 return self._process_media_selector(media_selection, programme_id)
082c6c86 336
9afa1770 337 def _process_media_selector(self, media_selection, programme_id):
082c6c86 338 formats = []
2e3fd9ec 339 subtitles = None
b0af1215 340 urls = []
2e3fd9ec 341
c056efa2
S
342 for media in self._extract_medias(media_selection):
343 kind = media.get('kind')
a7e5f274
RA
344 if kind in ('video', 'audio'):
345 bitrate = int_or_none(media.get('bitrate'))
346 encoding = media.get('encoding')
a7e5f274
RA
347 width = int_or_none(media.get('width'))
348 height = int_or_none(media.get('height'))
349 file_size = int_or_none(media.get('media_file_size'))
350 for connection in self._extract_connections(media):
b0af1215
RA
351 href = connection.get('href')
352 if href in urls:
353 continue
354 if href:
355 urls.append(href)
a7e5f274
RA
356 conn_kind = connection.get('kind')
357 protocol = connection.get('protocol')
358 supplier = connection.get('supplier')
a7e5f274
RA
359 transfer_format = connection.get('transferFormat')
360 format_id = supplier or conn_kind or protocol
a7e5f274
RA
361 # ASX playlist
362 if supplier == 'asx':
363 for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
364 formats.append({
365 'url': ref,
366 'format_id': 'ref%s_%s' % (i, format_id),
367 })
368 elif transfer_format == 'dash':
369 formats.extend(self._extract_mpd_formats(
370 href, programme_id, mpd_id=format_id, fatal=False))
371 elif transfer_format == 'hls':
50e93e03 372 # TODO: let expected_status be passed into _extract_xxx_formats() instead
373 try:
374 fmts = self._extract_m3u8_formats(
375 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
376 m3u8_id=format_id, fatal=False)
377 except ExtractorError as e:
3d2623a8 378 if not (isinstance(e.exc_info[1], HTTPError)
379 and e.exc_info[1].status in (403, 404)):
50e93e03 380 raise
381 fmts = []
382 formats.extend(fmts)
a7e5f274
RA
383 elif transfer_format == 'hds':
384 formats.extend(self._extract_f4m_formats(
385 href, programme_id, f4m_id=format_id, fatal=False))
386 else:
29f7c58a 387 if not supplier and bitrate:
aaa42cf0 388 format_id += '-%d' % bitrate
a7e5f274
RA
389 fmt = {
390 'format_id': format_id,
391 'filesize': file_size,
392 }
393 if kind == 'video':
394 fmt.update({
395 'width': width,
396 'height': height,
6240925b 397 'tbr': bitrate,
a7e5f274
RA
398 'vcodec': encoding,
399 })
400 else:
401 fmt.update({
402 'abr': bitrate,
403 'acodec': encoding,
404 'vcodec': 'none',
405 })
1af959ef 406 if protocol in ('http', 'https'):
a7e5f274
RA
407 # Direct link
408 fmt.update({
409 'url': href,
410 })
411 elif protocol == 'rtmp':
412 application = connection.get('application', 'ondemand')
413 auth_string = connection.get('authString')
414 identifier = connection.get('identifier')
415 server = connection.get('server')
416 fmt.update({
417 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
418 'play_path': identifier,
419 'app': '%s?%s' % (application, auth_string),
420 'page_url': 'http://www.bbc.co.uk',
421 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
422 'rtmp_live': False,
423 'ext': 'flv',
424 })
964744af
S
425 else:
426 continue
a7e5f274 427 formats.append(fmt)
c056efa2 428 elif kind == 'captions':
f13b1e7d 429 subtitles = self.extract_subtitles(media, programme_id)
c056efa2 430 return formats, subtitles
2e3fd9ec 431
ae6986fb
S
432 def _download_playlist(self, playlist_id):
433 try:
434 playlist = self._download_json(
435 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
436 playlist_id, 'Downloading playlist JSON')
c45b8741 437 formats = []
438 subtitles = {}
ae6986fb 439
c45b8741 440 for version in playlist.get('allAvailableVersions', []):
ae6986fb
S
441 smp_config = version['smpConfig']
442 title = smp_config['title']
443 description = smp_config['summary']
444 for item in smp_config['items']:
445 kind = item['kind']
40fcba5e 446 if kind not in ('programme', 'radioProgramme'):
ae6986fb
S
447 continue
448 programme_id = item.get('vpid')
d97f5cd7 449 duration = int_or_none(item.get('duration'))
c45b8741 450 version_formats, version_subtitles = self._download_media_selector(programme_id)
451 types = version['types']
452 for f in version_formats:
453 f['format_note'] = ', '.join(types)
454 if any('AudioDescribed' in x for x in types):
455 f['language_preference'] = -10
456 formats += version_formats
457 for tag, subformats in (version_subtitles or {}).items():
f304da8a 458 subtitles.setdefault(tag, []).extend(subformats)
c45b8741 459
460 return programme_id, title, description, duration, formats, subtitles
ae6986fb 461 except ExtractorError as ee:
3d2623a8 462 if not (isinstance(ee.cause, HTTPError) and ee.cause.status == 404):
ae6986fb
S
463 raise
464
465 # fallback to legacy playlist
9afa1770
S
466 return self._process_legacy_playlist(playlist_id)
467
468 def _process_legacy_playlist_url(self, url, display_id):
469 playlist = self._download_legacy_playlist_url(url, display_id)
470 return self._extract_from_legacy_playlist(playlist, display_id)
471
472 def _process_legacy_playlist(self, playlist_id):
473 return self._process_legacy_playlist_url(
474 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
475
476 def _download_legacy_playlist_url(self, url, playlist_id=None):
477 return self._download_xml(
478 url, playlist_id, 'Downloading legacy playlist XML')
ae6986fb 479
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme data from a legacy (EMP) playlist XML element.

    When the playlist contains several ``programme``/``radioProgramme``
    items, each is processed and the data of the last one is returned.

    Returns:
        A ``(programme_id, title, description, duration, formats,
        subtitles)`` tuple.

    Raises:
        ExtractorError: when the playlist declares ``noItems``, or when
            it has no programme item at all (previously an
            UnboundLocalError).
    """
    ns = self._EMP_PLAYLIST_NS

    no_items = playlist.find('./{%s}noItems' % ns)
    if no_items is not None:
        reason = no_items.get('reason')
        known_reasons = {
            'preAvailability': 'Episode %s is not yet available',
            'postAvailability': 'Episode %s is no longer available',
            'noMedia': 'Episode %s is not currently available',
        }
        if reason in known_reasons:
            msg = known_reasons[reason] % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First attribute value that looks like a programme id, else None
        for p in ('identifier', 'group'):
            value = element.get(p)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value
        return None

    def get_programme_id(item):
        # The mediator keeps precedence, as before. The item's own
        # attributes used to be inspected with the result thrown away; they
        # now serve as a fallback when the mediator yields nothing.
        mediator = item.find('./{%s}mediator' % ns)
        if mediator is not None:
            value = get_from_attributes(mediator)
            if value:
                return value
        return get_from_attributes(item)

    result = None
    for item in self._extract_items(playlist):
        kind = item.get('kind')
        if kind not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % ns).text
        description_el = playlist.find('./{%s}summary' % ns)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

        result = (programme_id, title, description, duration, formats, subtitles)

    if result is None:
        raise ExtractorError(
            'No programme found in playlist %s' % playlist_id, expected=True)
    return result
523
c056efa2
S
def _real_extract(self, url):
    """Extract a single programme from a bbc.co.uk page.

    The programme id is taken from the player configuration embedded in
    the page, or failing that from any ``"vpid"`` value on the page. When
    neither is present, everything is obtained from the playlist of the
    id found in the URL.

    Raises:
        ExtractorError: when the page itself shows an error message.
    """
    group_id = self._match_id(url)

    webpage = self._download_webpage(url, group_id, 'Downloading video page')

    page_error = self._search_regex(
        r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
        webpage, 'error', default=None)
    if page_error:
        raise ExtractorError(page_error, expected=True)

    programme_id, duration = None, None

    player_config = self._search_regex(
        r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
        webpage, 'player', default=None)
    if player_config:
        player = self._parse_json(player_config, group_id).get('player', {})
        duration = int_or_none(player.get('duration'))
        programme_id = player.get('vpid')

    if not programme_id:
        programme_id = self._search_regex(
            r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)

    if not programme_id:
        # Nothing usable on the page itself
        programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
    else:
        formats, subtitles = self._download_media_selector(programme_id)
        title = self._og_search_title(webpage, default=None)
        if not title:
            title = self._html_search_regex(
                (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
                 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
        description = self._search_regex(
            (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
             r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
            webpage, 'description', default=None)
        if not description:
            description = self._html_search_meta('description', webpage)

    thumbnail = self._og_search_thumbnail(webpage, default=None)
    return {
        'id': programme_id,
        'title': title,
        'description': description,
        'thumbnail': thumbnail,
        'duration': duration,
        'formats': formats,
        'subtitles': subtitles,
    }
10273d6e 574
575
class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
    IE_NAME = 'bbc'
    IE_DESC = 'BBC'
    # Verbose regex: bbc.com, bbc.co.uk or one of the two .onion hosts,
    # followed by at least one path directory; the id is the last path
    # segment (up to any "#" or "?").
    _VALID_URL = r'''(?x)
        https?://(?:www\.)?(?:
            bbc\.(?:com|co\.uk)|
            bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd\.onion|
            bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad\.onion
        )/(?:[^/]+/)+(?P<id>[^/#?]+)'''

    # Overrides the media sets (and their order) tried by the inherited
    # _download_media_selector.
    _MEDIA_SETS = [
        'pc',
        'mobile-tablet-main',
    ]
10273d6e 590
591 _TESTS = [{
6a747190 592 # article with multiple videos embedded with data-playable containing vpids
10273d6e 593 'url': 'http://www.bbc.com/news/world-europe-32668511',
594 'info_dict': {
595 'id': 'world-europe-32668511',
acc86c9a 596 'title': 'Russia stages massive WW2 parade',
9afa1770 597 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 598 },
599 'playlist_count': 2,
a3bfddfa 600 }, {
6a747190 601 # article with multiple videos embedded with data-playable (more videos)
10273d6e 602 'url': 'http://www.bbc.com/news/business-28299555',
603 'info_dict': {
604 'id': 'business-28299555',
605 'title': 'Farnborough Airshow: Video highlights',
9afa1770 606 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 607 },
608 'playlist_count': 9,
9afa1770 609 'skip': 'Save time',
88ed52ae
S
610 }, {
611 # article with multiple videos embedded with `new SMP()`
6a747190 612 # broken
88ed52ae
S
613 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
614 'info_dict': {
615 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 616 'title': 'BUGGER',
88ed52ae
S
617 },
618 'playlist_count': 18,
a3bfddfa 619 }, {
6a747190 620 # single video embedded with data-playable containing vpid
10273d6e 621 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 622 'info_dict': {
623 'id': 'p02mprgb',
55ebae26 624 'ext': 'mp4',
10273d6e 625 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 626 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 627 'duration': 47,
9afa1770 628 'timestamp': 1427219242,
da92eeae 629 'upload_date': '20150324',
10273d6e 630 },
631 'params': {
9afa1770 632 # rtmp download
10273d6e 633 'skip_download': True,
634 }
a3bfddfa 635 }, {
6a747190
S
636 # article with single video embedded with data-playable containing XML playlist
637 # with direct video links as progressiveDownloadUrl (for now these are extracted)
638 # and playlist with f4m and m3u8 as streamingUrl
de939d89 639 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 640 'info_dict': {
9afa1770 641 'id': '150615_telabyad_kentin_cogu',
de939d89 642 'ext': 'mp4',
ad152e2d 643 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 644 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 645 'timestamp': 1434397334,
da92eeae 646 'upload_date': '20150615',
de939d89 647 },
648 'params': {
649 'skip_download': True,
650 }
c936d8cc 651 }, {
6a747190 652 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 653 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 654 'info_dict': {
9afa1770 655 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 656 'ext': 'mp4',
9afa1770 657 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 658 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 659 'timestamp': 1434713142,
da92eeae 660 'upload_date': '20150619',
de939d89 661 },
662 'params': {
663 'skip_download': True,
664 }
a346b1ff
S
665 }, {
666 # single video from video playlist embedded with vxp-playlist-data JSON
667 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
668 'info_dict': {
669 'id': 'p02w6qjc',
55ebae26 670 'ext': 'mp4',
a346b1ff
S
671 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
672 'duration': 56,
0bc4ee60 673 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
674 },
675 'params': {
676 'skip_download': True,
677 }
9afa1770
S
678 }, {
679 # single video story with digitalData
680 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
681 'info_dict': {
682 'id': 'p02q6gc4',
683 'ext': 'flv',
684 'title': 'Sri Lanka’s spicy secret',
685 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
686 'timestamp': 1437674293,
687 'upload_date': '20150723',
688 },
689 'params': {
690 # rtmp download
691 'skip_download': True,
692 }
693 }, {
694 # single video story without digitalData
695 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
696 'info_dict': {
697 'id': 'p018zqqg',
55ebae26 698 'ext': 'mp4',
9afa1770
S
699 'title': 'Hyundai Santa Fe Sport: Rock star',
700 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
701 'timestamp': 1415867444,
702 'upload_date': '20141113',
9afa1770
S
703 },
704 'params': {
705 # rtmp download
706 'skip_download': True,
707 }
9fb64c04
S
708 }, {
709 # single video embedded with Morph
710 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
711 'info_dict': {
712 'id': 'p041vhd0',
713 'ext': 'mp4',
714 'title': "Nigeria v Japan - Men's First Round",
715 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
716 'duration': 7980,
717 'uploader': 'BBC Sport',
718 'uploader_id': 'bbc_sport',
719 },
720 'params': {
721 # m3u8 download
722 'skip_download': True,
9fb64c04
S
723 },
724 'skip': 'Georestricted to UK',
9afa1770 725 }, {
6a747190 726 # single video with playlist.sxml URL in playlist param
9afa1770
S
727 'url': 'http://www.bbc.com/sport/0/football/33653409',
728 'info_dict': {
729 'id': 'p02xycnp',
55ebae26 730 'ext': 'mp4',
9afa1770 731 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 732 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
733 'duration': 140,
734 },
735 'params': {
736 # rtmp download
737 'skip_download': True,
738 }
b5d48cb1 739 }, {
6a747190 740 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
741 'url': 'http://www.bbc.com/sport/0/football/34475836',
742 'info_dict': {
743 'id': '34475836',
450b233c 744 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 745 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
746 },
747 'playlist_count': 3,
450b233c
S
748 }, {
749 # school report article with single video
750 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
751 'info_dict': {
752 'id': '35744779',
753 'title': 'School which breaks down barriers in Jerusalem',
754 },
755 'playlist_count': 1,
9afa1770
S
756 }, {
757 # single video with playlist URL from weather section
758 'url': 'http://www.bbc.com/weather/features/33601775',
759 'only_matching': True,
760 }, {
761 # custom redirection to www.bbc.com
1bdae7d3 762 # also, video with window.__INITIAL_DATA__
9afa1770 763 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
1bdae7d3 764 'info_dict': {
765 'id': 'p02xzws1',
766 'ext': 'mp4',
767 'title': "Pluto may have 'nitrogen glaciers'",
768 'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
769 'thumbnail': r're:https?://.+/.+\.jpg',
770 'timestamp': 1437785037,
771 'upload_date': '20150725',
772 },
50e93e03 773 }, {
774 # video with window.__INITIAL_DATA__ and value as JSON string
775 'url': 'https://www.bbc.com/news/av/world-europe-59468682',
776 'info_dict': {
777 'id': 'p0b71qth',
778 'ext': 'mp4',
779 'title': 'Why France is making this woman a national hero',
780 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
781 'thumbnail': r're:https?://.+/.+\.jpg',
782 'timestamp': 1638230731,
783 'upload_date': '20211130',
784 },
a1cf3e38
S
785 }, {
786 # single video article embedded with data-media-vpid
787 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
788 'only_matching': True,
6d155707 789 }, {
50e93e03 790 # bbcthreeConfig
6d155707
S
791 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
792 'info_dict': {
793 'id': 'p06556y7',
794 'ext': 'mp4',
50e93e03 795 'title': 'Things Not To Say to people that live on council estates',
796 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
797 'duration': 360,
798 'thumbnail': r're:https?://.+/.+\.jpg',
6d155707 799 },
b96b4be4
RA
800 }, {
801 # window.__PRELOADED_STATE__
802 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
803 'info_dict': {
804 'id': 'b0b9z4vz',
805 'ext': 'mp4',
806 'title': 'Prom 6: An American in Paris and Turangalila',
807 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
808 'uploader': 'Radio 3',
809 'uploader_id': 'bbc_radio_three',
810 },
373941c5
S
811 }, {
812 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
813 'info_dict': {
814 'id': 'p06w9tws',
815 'ext': 'mp4',
816 'title': 'md5:2fabf12a726603193a2879a055f72514',
817 'description': 'Learn English words and phrases from this story',
818 },
819 'add_ie': [BBCCoUkIE.ie_key()],
3721515b 820 }, {
821 # BBC Reel
822 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
823 'info_dict': {
824 'id': 'p07c6sb9',
825 'ext': 'mp4',
826 'title': 'How positive thinking is harming your happiness',
827 'alt_title': 'The downsides of positive thinking',
828 'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
829 'duration': 235,
830 'thumbnail': r're:https?://.+/p07c9dsr.jpg',
831 'upload_date': '20190604',
832 'categories': ['Psychology'],
833 },
eda0e415 834 }, {
835 # BBC Sounds
836 'url': 'https://www.bbc.co.uk/sounds/play/m001q78b',
837 'info_dict': {
838 'id': 'm001q789',
839 'ext': 'mp4',
840 'title': 'The Night Tracks Mix - Music for the darkling hour',
841 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0c00hym.jpg',
842 'chapters': 'count:8',
843 'description': 'md5:815fb51cbdaa270040aab8145b3f1d67',
844 'uploader': 'Radio 3',
845 'duration': 1800,
846 'uploader_id': 'bbc_radio_three',
847 },
ed13a772 848 }, { # onion routes
849 'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
850 'only_matching': True,
851 }, {
852 'url': 'https://www.bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad.onion/sport/av/football/63195681',
853 'only_matching': True,
10273d6e 854 }]
855
9afa1770
S
@classmethod
def suitable(cls, url):
    """Decide whether this extractor should handle *url*.

    URLs accepted by any of the dedicated bbc.co.uk extractors listed
    below are rejected here; for every other URL the decision is left to
    the parent class' ``suitable``.
    """
    dedicated_extractors = (
        BBCCoUkIE,
        BBCCoUkArticleIE,
        BBCCoUkIPlayerEpisodesIE,
        BBCCoUkIPlayerGroupIE,
        BBCCoUkPlaylistIE,
    )
    for extractor in dedicated_extractors:
        # Stop at the first dedicated extractor that claims the URL
        if extractor.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
861
def _extract_from_media_meta(self, media_meta, video_id):
    """Return ``(formats, subtitles)`` for a single media metadata dict.

    Three sources are tried in order: direct links under ``sourceFiles``,
    a media selector lookup keyed by ``externalId``, and finally a legacy
    playlist referenced by ``href``. Two empty lists are returned when
    none of these keys is usable.
    """
    # Direct links to media in media metadata (e.g.
    # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
    # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
    source_files = media_meta.get('sourceFiles')
    if source_files:
        direct_formats = []
        for format_id, source in source_files.items():
            # Entries without a URL cannot be downloaded; skip them
            if not source.get('url'):
                continue
            direct_formats.append({
                'url': source['url'],
                'format_id': format_id,
                'ext': source.get('encoding'),
                'tbr': float_or_none(source.get('bitrate'), 1000),
                'filesize': int_or_none(source.get('filesize')),
            })
        return direct_formats, []

    programme_id = media_meta.get('externalId')
    if programme_id:
        return self._download_media_selector(programme_id)

    # Process playlist.sxml as legacy playlist
    href = media_meta.get('href')
    if not href:
        return [], []

    playlist = self._download_legacy_playlist_url(href)
    # Only the last two of the six returned values are needed here
    _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
    return formats, subtitles
888
baf39a1a
S
def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
    """Build an info dict from a legacy ``playlist.sxml`` URL.

    The six values returned by ``_process_legacy_playlist_url`` are mapped
    onto info dict keys and the caller-supplied *timestamp* is attached.
    """
    legacy_fields = self._process_legacy_playlist_url(url, playlist_id)
    programme_id, title, description, duration, formats, subtitles = legacy_fields

    info = {
        'id': programme_id,
        'title': title,
        'description': description,
        'duration': duration,
    }
    info['timestamp'] = timestamp
    info['formats'] = formats
    info['subtitles'] = subtitles
    return info
901
def _real_extract(self, url):
    """Extract media from a BBC page.

    The page is downloaded once and then probed with a sequence of
    embedding strategies; the first strategy that finds media returns
    immediately. In order:

    1. ``playlist.sxml`` references and ``data-playable`` attributes
    2. a ``<div class="video" data-pid=...>`` group (delegated to BBCCoUkIE)
    3. a single vpid found in the markup
    4. the ``initial-data`` script tag (BBC Reel)
    5. ``Morph.setPayload(...)``
    6. ``window.__PRELOADED_STATE__``
    7. ``bbcthreeConfig``
    8. ``window.__INITIAL_DATA__``
    9. ``new SMP(...)`` / ``setPlaylist(...)`` embed URLs
    10. ``data-media-meta`` / ``mediaAssetPage.init`` / ``vxp-playlist-data``

    Returns either a single-video info dict or a playlist result.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    # Fall back to the page title with the trailing "- BBC ..." part removed
    playlist_title = json_ld_info.get('title') or re.sub(
        r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
    if data_playables:
        for _, data_playable_json in data_playables:
            data_playable = self._parse_json(
                unescapeHTML(data_playable_json), playlist_id, fatal=False)
            if not data_playable:
                continue
            settings = data_playable.get('settings', {})
            if settings:
                # data-playable with video vpid in settings.playlistObject.items (e.g.
                # http://www.bbc.com/news/world-us-canada-34473351)
                playlist_object = settings.get('playlistObject', {})
                if playlist_object:
                    items = playlist_object.get('items')
                    if items and isinstance(items, list):
                        title = playlist_object['title']
                        description = playlist_object.get('summary')
                        duration = int_or_none(items[0].get('duration'))
                        programme_id = items[0].get('vpid')
                        formats, subtitles = self._download_media_selector(programme_id)
                        entries.append({
                            'id': programme_id,
                            'title': title,
                            'description': description,
                            'timestamp': timestamp,
                            'duration': duration,
                            'formats': formats,
                            'subtitles': subtitles,
                        })
            else:
                # data-playable without vpid but with a playlist.sxml URLs
                # in otherSettings.playlist (e.g.
                # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
                playlist = data_playable.get('otherSettings', {}).get('playlist', {})
                if playlist:
                    entry = None
                    for key in ('streaming', 'progressiveDownload'):
                        playlist_url = playlist.get('%sUrl' % key)
                        if not playlist_url:
                            continue
                        try:
                            info = self._extract_from_playlist_sxml(
                                playlist_url, playlist_id, timestamp)
                            if not entry:
                                entry = info
                            else:
                                # Merge the second playlist into the first entry
                                entry['title'] = info['title']
                                entry['formats'].extend(info['formats'])
                        except ExtractorError as e:
                            # Some playlist URL may fail with 500, at the same time
                            # the other one may work fine (e.g.
                            # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                            if isinstance(e.cause, HTTPError) and e.cause.status == 500:
                                continue
                            raise
                    if entry:
                        entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
    group_id = self._search_regex(
        r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
        webpage, 'group id', default=None)
    if group_id:
        return self.url_result(
            'https://www.bbc.co.uk/programmes/%s' % group_id,
            ie=BBCCoUkIE.ie_key())

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # The parse is non-fatal, so fall back to an empty dict rather than
        # dereferencing a failed (None) result below
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = digital_data.get('page', {}).get('pageInfo', {})
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
    initial_data = self._parse_json(self._html_search_regex(
        r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
        webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
    if initial_data:
        init_data = try_get(
            initial_data, lambda x: x['initData']['items'][0], dict) or {}
        smp_data = init_data.get('smpData') or {}
        clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
        version_id = clip_data.get('versionID')
        if version_id:
            title = smp_data['title']
            formats, subtitles = self._download_media_selector(version_id)
            image_url = smp_data.get('holdingImageURL')
            display_date = init_data.get('displayDate')
            topic_title = init_data.get('topicTitle')

            return {
                'id': version_id,
                'title': title,
                'formats': formats,
                'alt_title': init_data.get('shortTitle'),
                'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
                'description': smp_data.get('summary') or init_data.get('shortSummary'),
                'upload_date': display_date.replace('-', '') if display_date else None,
                'subtitles': subtitles,
                'duration': int_or_none(clip_data.get('duration')),
                'categories': [topic_title] if topic_title else None,
            }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # There are several setPayload calls may be present but the video
    # seems to be always related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    preload_state = self._parse_json(self._search_regex(
        r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
        'preload state', default='{}'), playlist_id, fatal=False)
    if preload_state:
        current_programme = preload_state.get('programmes', {}).get('current') or {}
        programme_id = current_programme.get('id')
        if current_programme and programme_id and current_programme.get('type') == 'playable_item':
            title = current_programme.get('titles', {}).get('tertiary') or playlist_title
            formats, subtitles = self._download_media_selector(programme_id)
            synopses = current_programme.get('synopses') or {}
            network = current_programme.get('network') or {}
            duration = int_or_none(
                current_programme.get('duration', {}).get('value'))
            thumbnail = None
            image_url = current_programme.get('image_url')
            if image_url:
                thumbnail = image_url.replace('{recipe}', 'raw')
            return {
                'id': programme_id,
                'title': title,
                'description': dict_get(synopses, ('long', 'medium', 'short')),
                'thumbnail': thumbnail,
                'duration': duration,
                'uploader': network.get('short_title'),
                'uploader_id': network.get('id'),
                'formats': formats,
                'subtitles': subtitles,
                'chapters': traverse_obj(preload_state, (
                    'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), {
                        'title': ('titles', {lambda x: join_nonempty(
                            'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
                        'start_time': ('offset', 'start', {float_or_none}),
                        'end_time': ('offset', 'end', {float_or_none}),
                    })) or None,
            }

    bbc3_config = self._parse_json(
        self._search_regex(
            r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
            'bbcthree config', default='{}'),
        playlist_id, transform_source=js_to_json, fatal=False) or {}
    payload = bbc3_config.get('payload') or {}
    if payload:
        clip = payload.get('currentClip') or {}
        clip_vpid = clip.get('vpid')
        clip_title = clip.get('title')
        if clip_vpid and clip_title:
            formats, subtitles = self._download_media_selector(clip_vpid)
            return {
                'id': clip_vpid,
                'title': clip_title,
                'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
                'description': clip.get('description'),
                'duration': parse_duration(clip.get('duration')),
                'formats': formats,
                'subtitles': subtitles,
            }
        bbc3_playlist = try_get(
            payload, lambda x: x['content']['bbcMedia']['playlist'],
            dict)
        if bbc3_playlist:
            playlist_title = bbc3_playlist.get('title') or playlist_title
            thumbnail = bbc3_playlist.get('holdingImageURL')
            entries = []
            for bbc3_item in bbc3_playlist['items']:
                programme_id = bbc3_item.get('versionID')
                if not programme_id:
                    continue
                formats, subtitles = self._download_media_selector(programme_id)
                entries.append({
                    'id': programme_id,
                    'title': playlist_title,
                    'thumbnail': thumbnail,
                    'timestamp': timestamp,
                    'formats': formats,
                    'subtitles': subtitles,
                })
            return self.playlist_result(
                entries, playlist_id, playlist_title, playlist_description)

    # window.__INITIAL_DATA__ comes in two flavours: a JSON document wrapped
    # in a JSON string (needs decoding twice) or a plain JSON object.
    initial_data = self._search_regex(
        r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
        'quoted preload state', default=None)
    if initial_data is None:
        # The default must be a JSON *string*: a dict default would be handed
        # to the JSON parser below and raise TypeError on pages without the blob
        initial_data = self._search_regex(
            r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
            'preload state', default='{}')
    else:
        initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
    # `or '{}'` covers a failed (None) first-stage decode of the quoted form
    initial_data = self._parse_json(initial_data or '{}', playlist_id, fatal=False)
    if initial_data:
        def parse_media(media):
            # Append one entry per playable item found under media['media']['items']
            if not media:
                return
            for item in (try_get(media, lambda x: x['media']['items'], list) or []):
                item_id = item.get('id')
                item_title = item.get('title')
                if not (item_id and item_title):
                    continue
                formats, subtitles = self._download_media_selector(item_id)
                item_desc = None
                blocks = try_get(media, lambda x: x['summary']['blocks'], list)
                if blocks:
                    summary = []
                    for block in blocks:
                        text = try_get(block, lambda x: x['model']['text'], compat_str)
                        if text:
                            summary.append(text)
                    if summary:
                        item_desc = '\n\n'.join(summary)
                item_time = None
                for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
                    if try_get(meta, lambda x: x['label']) == 'Published':
                        item_time = unified_timestamp(meta.get('timestamp'))
                        break
                entries.append({
                    'id': item_id,
                    'title': item_title,
                    'thumbnail': item.get('holdingImageUrl'),
                    'formats': formats,
                    'subtitles': subtitles,
                    'timestamp': item_time,
                    'description': strip_or_none(item_desc),
                })
        for resp in (initial_data.get('data') or {}).values():
            name = resp.get('name')
            if name == 'media-experience':
                parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
            elif name == 'article':
                for block in (try_get(resp,
                                      (lambda x: x['data']['blocks'],
                                       lambda x: x['data']['content']['model']['blocks'],),
                                      list) or []):
                    if block.get('type') not in ['media', 'video']:
                        continue
                    parse_media(block.get('model'))
        return self.playlist_result(
            entries, playlist_id, playlist_title, playlist_description)

    def extract_all(pattern):
        # Parse every regex match as JSON and drop the ones that fail to parse
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # Non-fatal parse: treat a malformed blob as "no videos"
            media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in media_asset_page.get('videos', {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        # NOTE(review): no default is passed to the search here, so a missing
        # match is presumably fatal - this is the last resort strategy
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats and not self.get_param('ignore_no_formats'):
            continue

        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1339
1340
class BBCCoUkArticleIE(InfoExtractor):
    """Extractor for bbc.co.uk/programmes/articles/... pages.

    Each clip referenced by the article becomes one entry of the
    resulting playlist.
    """
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist of the clips embedded in the article at *url*."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # The description lookup may yield None when the page carries no
        # og:description; strip_or_none avoids calling .strip() on None
        description = strip_or_none(self._og_search_description(webpage))

        # Every <div typeof="Clip" resource="..."> points at one clip page
        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1369
1370
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared logic for paginated bbc.co.uk listing pages.

    Subclasses are expected to provide ``_VIDEO_ID_TEMPLATE``,
    ``_URL_TEMPLATE`` and ``_extract_title_and_description``, all of which
    are read below.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a URL result for every video id found on the listing.

        When *url* carries an explicit ``page`` query parameter only that
        page is processed; otherwise the "next" pagination link is followed
        until no further page is advertised.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # The first page is already downloaded, so numbering of fetched pages starts at 2
        for page_num in itertools.count(2):
            for video_id in re.findall(
                    self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # Pass note/errnote by keyword: previously the bare page number
            # was passed positionally where an error note is expected
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                note='Downloading page %d' % page_num,
                errnote='Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Return a lazily-evaluated playlist for the listing at *url*."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
ded7511a
S
1401
1402
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
    """Common machinery for paged iPlayer listings.

    Concrete subclasses supply the API call and the accessors used below
    (``_call_api``, ``_get_elements``, ``_get_episode``,
    ``_get_episode_image``, ``_get_episode_field``, ``_get_playlist_data``,
    ``_get_playlist_title``) as well as ``_PAGE_SIZE`` and
    ``_DESCRIPTION_KEY``.
    """
    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    @staticmethod
    def _get_default(episode, key, default_key='default'):
        """Look up ``episode[key][default_key]`` through ``try_get``."""
        def pick(obj):
            return obj[key][default_key]
        return try_get(episode, pick)

    def _get_description(self, data):
        """Pick the first available synopsis among large, medium and small."""
        synopsis_by_size = data.get(self._DESCRIPTION_KEY) or {}
        return dict_get(synopsis_by_size, ('large', 'medium', 'small'))

    def _fetch_page(self, programme_id, per_page, series_id, page):
        """Yield one URL-type entry per episode on the given zero-based page."""
        # The API is called with page + 1
        api_response = self._call_api(programme_id, per_page, page + 1, series_id)
        for element in self._get_elements(api_response):
            episode = self._get_episode(element)
            episode_id = episode.get('id')
            if not episode_id:
                continue
            image = self._get_episode_image(episode)
            thumbnail = image.replace('{recipe}', 'raw') if image else None
            category = self._get_default(episode, 'labels', 'category')
            yield {
                '_type': 'url',
                'id': episode_id,
                'title': self._get_episode_field(episode, 'subtitle'),
                'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                'thumbnail': thumbnail,
                'description': self._get_description(episode),
                'categories': [category] if category else None,
                'series': self._get_episode_field(episode, 'title'),
                'ie_key': BBCCoUkIE.ie_key(),
            }

    def _real_extract(self, url):
        """Return the playlist for *url*, honouring ``seriesId`` and ``page``."""
        pid = self._match_id(url)
        query = parse_qs(url)
        series_id = query.get('seriesId', [None])[0]
        page = query.get('page', [None])[0]

        if page:
            # Explicit page requested: fetch just that page, 36 items per page
            fetch_page = functools.partial(self._fetch_page, pid, 36, series_id)
            entries = fetch_page(int(page) - 1)
        else:
            fetch_page = functools.partial(self._fetch_page, pid, self._PAGE_SIZE, series_id)
            entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)

        # A one-item request is used to obtain the playlist level metadata
        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
        playlist_title = self._get_playlist_title(playlist_data)
        return self.playlist_result(
            entries, pid, playlist_title, self._get_description(playlist_data))
1451
1452
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/iplayer/episodes/<pid>`` listings."""
    IE_NAME = 'bbc.co.uk:iplayer:episodes'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
    _PAGE_SIZE = 100
    _DESCRIPTION_KEY = 'synopsis'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
        },
        'playlist_mincount': 8,
    }, {
        # all seasons
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 10,
    }, {
        # explicit season
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 5,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 37,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 1,
    }]

    def _get_episode_image(self, episode):
        """Return the ``default`` variant of the episode's ``image``."""
        return self._get_default(episode, 'image')

    def _get_episode_field(self, episode, field):
        """Return the ``default`` variant of *field* on the episode."""
        return self._get_default(episode, field)

    @staticmethod
    def _get_elements(data):
        """Return the result list of an API response."""
        entities = data['entities']
        return entities['results']

    @staticmethod
    def _get_episode(element):
        """Return the ``episode`` member of *element*, or an empty dict."""
        episode = element.get('episode')
        return episode if episode else {}

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """POST a JSON query for programme *pid* and return its ``programme`` data.

        ``sliceId`` is only sent when *series_id* is truthy.
        """
        variables = {'id': pid, 'page': page, 'perPage': per_page}
        if series_id:
            variables['sliceId'] = series_id
        request_body = json.dumps({
            'id': '5692d93d5aac8d796a0305e895e61551',
            'variables': variables,
        }).encode('utf-8')
        response = self._download_json(
            'https://graph.ibl.api.bbc.co.uk/', pid,
            headers={'Content-Type': 'application/json'},
            data=request_body)
        return response['data']['programme']

    @staticmethod
    def _get_playlist_data(data):
        """The programme data already is the playlist data."""
        return data

    def _get_playlist_title(self, data):
        """Return the ``default`` variant of the playlist ``title``."""
        return self._get_default(data, 'title')
1540
1541
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/iplayer/group/<pid>`` listings."""
    IE_NAME = 'bbc.co.uk:iplayer:group'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
    _PAGE_SIZE = 200
    _DESCRIPTION_KEY = 'synopses'
    _TESTS = [{
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 47,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 11,
    }]

    def _get_episode_image(self, episode):
        """Return the ``standard`` variant of the episode's ``images``."""
        return self._get_default(episode, 'images', 'standard')

    def _get_episode_field(self, episode, field):
        """Return *field* straight from the episode dict."""
        return episode.get(field)

    @staticmethod
    def _get_elements(data):
        """Return the element list of an API response."""
        return data['elements']

    @staticmethod
    def _get_episode(element):
        """Each element already is the episode."""
        return element

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Fetch one page of the group's episodes; *series_id* is not used."""
        api_url = 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid
        paging = {
            'page': page,
            'per_page': per_page,
        }
        response = self._download_json(api_url, pid, query=paging)
        return response['group_episodes']

    @staticmethod
    def _get_playlist_data(data):
        """Return the ``group`` member holding the playlist metadata."""
        return data['group']

    def _get_playlist_title(self, data):
        """Return the playlist ``title``, if any."""
        return data.get('title')
ded7511a
S
1604
1605
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Programme listing pages: episodes, broadcasts and clips."""
    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Template for the programme page URL of each video id found
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    # Regex template locating video ids in the listing markup
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` taken from the page's OpenGraph tags.

        The title lookup is explicitly non-fatal.
        """
        og_title = self._og_search_title(webpage, fatal=False)
        og_description = self._og_search_description(webpage)
        return og_title, og_description