]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/bbc.py
[bbccouk] Skip unrecognized formats in media selector (#12701)
[yt-dlp.git] / youtube_dl / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
9afa1770 4import re
254e64a2 5import itertools
082c6c86 6
f13b1e7d 7from .common import InfoExtractor
8683b4d8 8from ..utils import (
9fb64c04 9 dict_get,
8683b4d8 10 ExtractorError,
9afa1770 11 float_or_none,
8683b4d8 12 int_or_none,
9afa1770
S
13 parse_duration,
14 parse_iso8601,
9fb64c04 15 try_get,
dab062fb 16 unescapeHTML,
8683b4d8 17)
36e6f62c
JMF
18from ..compat import (
19 compat_etree_fromstring,
20 compat_HTTPError,
254e64a2 21 compat_urlparse,
36e6f62c 22)
082c6c86 23
d12a1a47 24
f13b1e7d 25class BBCCoUkIE(InfoExtractor):
082c6c86 26 IE_NAME = 'bbc.co.uk'
2e3fd9ec 27 IE_DESC = 'BBC iPlayer'
22d7368d 28 _ID_REGEX = r'[pb][\da-z]{7}'
f20a11ed
S
29 _VALID_URL = r'''(?x)
30 https?://
31 (?:www\.)?bbc\.co\.uk/
32 (?:
33 programmes/(?!articles/)|
34 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
35 music/clips[/#]|
36 radio/player/
37 )
ded7511a 38 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
f20a11ed 39 ''' % _ID_REGEX
082c6c86 40
d12a1a47 41 _MEDIASELECTOR_URLS = [
26ccc68b
S
42 # Provides HQ HLS streams with even better quality than pc mediaset but fails
43 # with geolocation in some cases when it's not even geo restricted at all (e.g.
d781e293 44 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
d1c694ea 45 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
46 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
47 ]
a8b081a0 48
e6174ee9
S
49 _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
50 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
51
52 _NAMESPACES = (
53 _MEDIASELECTION_NS,
54 _EMP_PLAYLIST_NS,
55 )
56
2e3fd9ec
S
57 _TESTS = [
58 {
f2d0fc68 59 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 60 'info_dict': {
f2d0fc68 61 'id': 'b039d07m',
b1ea6802 62 'ext': 'flv',
679bacf0 63 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4',
c4914185 64 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
65 },
66 'params': {
b1ea6802 67 # rtmp download
2e3fd9ec
S
68 'skip_download': True,
69 }
082c6c86 70 },
2e3fd9ec
S
71 {
72 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
73 'info_dict': {
74 'id': 'b00yng1d',
75 'ext': 'flv',
76 'title': 'The Man in Black: Series 3: The Printed Name',
77 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
78 'duration': 1800,
79 },
80 'params': {
81 # rtmp download
82 'skip_download': True,
c7f0177f
S
83 },
84 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
85 },
86 {
87 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
88 'info_dict': {
89 'id': 'b00yng1d',
90 'ext': 'flv',
17968e44 91 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 92 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 93 'duration': 5100,
2e3fd9ec
S
94 },
95 'params': {
96 # rtmp download
97 'skip_download': True,
98 },
b1ea6802 99 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
100 },
101 {
102 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
103 'info_dict': {
104 'id': 'b03k3pb7',
105 'ext': 'flv',
106 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
107 'description': '2. Invasion',
108 'duration': 3600,
109 },
110 'params': {
111 # rtmp download
112 'skip_download': True,
113 },
b1ea6802 114 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
115 }, {
116 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
117 'info_dict': {
118 'id': 'b04v209v',
119 'ext': 'flv',
120 'title': 'Pete Tong, The Essential New Tune Special',
121 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
122 'duration': 10800,
123 },
124 'params': {
125 # rtmp download
126 'skip_download': True,
a3ef0e1c
YCH
127 },
128 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 129 }, {
5aa535c3 130 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
131 'note': 'Audio',
132 'info_dict': {
5aa535c3 133 'id': 'p022h44j',
b1ea6802 134 'ext': 'flv',
5aa535c3
S
135 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
136 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
137 'duration': 227,
c7e67594
S
138 },
139 'params': {
b1ea6802 140 # rtmp download
c7e67594
S
141 'skip_download': True,
142 }
143 }, {
144 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
145 'note': 'Video',
146 'info_dict': {
147 'id': 'p025c103',
b1ea6802 148 'ext': 'flv',
c7e67594
S
149 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
150 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
151 'duration': 226,
152 },
153 'params': {
b1ea6802 154 # rtmp download
c7e67594
S
155 'skip_download': True,
156 }
e68ae99a
S
157 }, {
158 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
159 'info_dict': {
160 'id': 'p02n76xf',
161 'ext': 'flv',
162 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
163 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
164 'duration': 3540,
165 },
166 'params': {
167 # rtmp download
168 'skip_download': True,
169 },
b1ea6802 170 'skip': 'geolocation',
25fa8d66
YCH
171 }, {
172 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
173 'info_dict': {
174 'id': 'b05zmgw1',
175 'ext': 'flv',
176 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
177 'title': 'Royal Academy Summer Exhibition',
178 'duration': 3540,
179 },
180 'params': {
181 # rtmp download
182 'skip_download': True,
183 },
b1ea6802 184 'skip': 'geolocation',
54914380
S
185 }, {
186 # iptv-all mediaset fails with geolocation however there is no geo restriction
187 # for this programme at all
5aa535c3 188 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 189 'info_dict': {
5aa535c3 190 'id': 'b06rkms3',
54914380 191 'ext': 'flv',
5aa535c3
S
192 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
193 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
194 },
195 'params': {
196 # rtmp download
197 'skip_download': True,
198 },
b1ea6802 199 'skip': 'Now it\'s really geo-restricted',
1ac6e794
S
200 }, {
201 # compact player (https://github.com/rg3/youtube-dl/issues/8147)
202 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
203 'info_dict': {
204 'id': 'p028bfkj',
b1ea6802 205 'ext': 'flv',
1ac6e794
S
206 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
207 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
208 },
209 'params': {
b1ea6802 210 # rtmp download
1ac6e794
S
211 'skip_download': True,
212 },
31763975
S
213 }, {
214 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
215 'only_matching': True,
c7e67594
S
216 }, {
217 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
218 'only_matching': True,
0692ef86
S
219 }, {
220 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
221 'only_matching': True,
f20a11ed
S
222 }, {
223 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
224 'only_matching': True,
ae6986fb 225 }
2e3fd9ec
S
226 ]
227
97eb9bd2
RA
228 _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
229
d12a1a47
S
class MediaSelectionError(Exception):
    """Error reported inside a media selection document; ``id`` is its error id."""

    def __init__(self, id):
        # Hand the id to the base class explicitly so it is carried in ``args``.
        Exception.__init__(self, id)
        self.id = id
233
2e3fd9ec
S
234 def _extract_asx_playlist(self, connection, programme_id):
235 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
236 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
237
2e3fd9ec 238 def _extract_items(self, playlist):
e6174ee9
S
239 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
240
241 def _findall_ns(self, element, xpath):
242 elements = []
243 for ns in self._NAMESPACES:
244 elements.extend(element.findall(xpath % ns))
245 return elements
2e3fd9ec
S
246
247 def _extract_medias(self, media_selection):
e6174ee9
S
248 error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
249 if error is None:
250 media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
c056efa2 251 if error is not None:
d12a1a47 252 raise BBCCoUkIE.MediaSelectionError(error.get('id'))
e6174ee9 253 return self._findall_ns(media_selection, './{%s}media')
2e3fd9ec
S
254
255 def _extract_connections(self, media):
e6174ee9 256 return self._findall_ns(media, './{%s}connection')
2e3fd9ec 257
f13b1e7d 258 def _get_subtitles(self, media, programme_id):
2e3fd9ec
S
259 subtitles = {}
260 for connection in self._extract_connections(media):
261 captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
262 lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
f13b1e7d
JMF
263 subtitles[lang] = [
264 {
265 'url': connection.get('href'),
266 'ext': 'ttml',
267 },
f13b1e7d 268 ]
2e3fd9ec 269 return subtitles
082c6c86 270
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Raise an expected ``ExtractorError`` naming this extractor and the error id."""
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
275
c056efa2 276 def _download_media_selector(self, programme_id):
d12a1a47
S
277 last_exception = None
278 for mediaselector_url in self._MEDIASELECTOR_URLS:
279 try:
280 return self._download_media_selector_url(
281 mediaselector_url % programme_id, programme_id)
282 except BBCCoUkIE.MediaSelectionError as e:
d781e293 283 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
d12a1a47
S
284 last_exception = e
285 continue
286 self._raise_extractor_error(e)
287 self._raise_extractor_error(last_exception)
9afa1770
S
288
289 def _download_media_selector_url(self, url, programme_id=None):
c056efa2
S
290 try:
291 media_selection = self._download_xml(
9afa1770 292 url, programme_id, 'Downloading media selection XML')
c056efa2 293 except ExtractorError as ee:
d781e293 294 if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404):
36e6f62c 295 media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))
2e3fd9ec 296 else:
c056efa2 297 raise
9afa1770 298 return self._process_media_selector(media_selection, programme_id)
082c6c86 299
def _process_media_selector(self, media_selection, programme_id):
    """Turn a media selection document into ``(formats, subtitles)``.

    Walks every ``media`` element: video/audio medias contribute one or more
    format dicts per connection, a captions media contributes the subtitles.
    ``subtitles`` stays ``None`` when no captions media is present.
    Propagates ``MediaSelectionError`` raised by ``_extract_medias``.
    """
    formats = []
    subtitles = None
    # hrefs already handled, so the same connection URL is not processed twice
    urls = []

    for media in self._extract_medias(media_selection):
        kind = media.get('kind')
        if kind in ('video', 'audio'):
            # Attributes shared by every connection of this media
            bitrate = int_or_none(media.get('bitrate'))
            encoding = media.get('encoding')
            service = media.get('service')
            width = int_or_none(media.get('width'))
            height = int_or_none(media.get('height'))
            file_size = int_or_none(media.get('media_file_size'))
            for connection in self._extract_connections(media):
                href = connection.get('href')
                if href in urls:
                    continue
                # Connections without an href are never recorded as seen
                if href:
                    urls.append(href)
                conn_kind = connection.get('kind')
                protocol = connection.get('protocol')
                supplier = connection.get('supplier')
                transfer_format = connection.get('transferFormat')
                # First non-empty of supplier / kind / protocol, prefixed by service
                format_id = supplier or conn_kind or protocol
                if service:
                    format_id = '%s_%s' % (service, format_id)
                # ASX playlist
                if supplier == 'asx':
                    for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
                        formats.append({
                            'url': ref,
                            'format_id': 'ref%s_%s' % (i, format_id),
                        })
                elif transfer_format == 'dash':
                    formats.extend(self._extract_mpd_formats(
                        href, programme_id, mpd_id=format_id, fatal=False))
                elif transfer_format == 'hls':
                    formats.extend(self._extract_m3u8_formats(
                        href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                        m3u8_id=format_id, fatal=False))
                    # For hrefs matching _USP_RE, also try the manifest at the
                    # rewritten '/<name>.ism/<name>.m3u8' URL and keep only its
                    # formats whose height is unknown or at most 720
                    if re.search(self._USP_RE, href):
                        usp_formats = self._extract_m3u8_formats(
                            re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href),
                            programme_id, ext='mp4', entry_protocol='m3u8_native',
                            m3u8_id=format_id, fatal=False)
                        for f in usp_formats:
                            if f.get('height') and f['height'] > 720:
                                continue
                            formats.append(f)
                elif transfer_format == 'hds':
                    formats.extend(self._extract_f4m_formats(
                        href, programme_id, f4m_id=format_id, fatal=False))
                else:
                    # No recognised transfer format: build a single format by hand.
                    # The bitrate suffix is only added when neither service nor
                    # supplier contributed to format_id
                    if not service and not supplier and bitrate:
                        format_id += '-%d' % bitrate
                    fmt = {
                        'format_id': format_id,
                        'filesize': file_size,
                    }
                    if kind == 'video':
                        fmt.update({
                            'width': width,
                            'height': height,
                            'vbr': bitrate,
                            'vcodec': encoding,
                        })
                    else:
                        fmt.update({
                            'abr': bitrate,
                            'acodec': encoding,
                            'vcodec': 'none',
                        })
                    if protocol in ('http', 'https'):
                        # Direct link
                        fmt.update({
                            'url': href,
                        })
                    elif protocol == 'rtmp':
                        # RTMP parameters are assembled from the connection attributes
                        application = connection.get('application', 'ondemand')
                        auth_string = connection.get('authString')
                        identifier = connection.get('identifier')
                        server = connection.get('server')
                        fmt.update({
                            'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                            'play_path': identifier,
                            'app': '%s?%s' % (application, auth_string),
                            'page_url': 'http://www.bbc.co.uk',
                            'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                            'rtmp_live': False,
                            'ext': 'flv',
                        })
                    else:
                        # Unrecognised protocol: skip this connection entirely
                        continue
                    formats.append(fmt)
        elif kind == 'captions':
            subtitles = self.extract_subtitles(media, programme_id)
    return formats, subtitles
2e3fd9ec 398
ae6986fb
S
def _download_playlist(self, playlist_id):
    """Resolve ``playlist_id`` to programme info via the JSON playlist.

    Returns a ``(programme_id, title, description, duration, formats,
    subtitles)`` tuple. When the JSON playlist has no
    ``defaultAvailableVersion``, or an ``ExtractorError`` caused by an HTTP 404
    is raised inside the ``try``, the legacy XML playlist is used instead.
    Any other ``ExtractorError`` propagates.
    """
    try:
        playlist = self._download_json(
            'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
            playlist_id, 'Downloading playlist JSON')

        version = playlist.get('defaultAvailableVersion')
        if version:
            smp_config = version['smpConfig']
            title = smp_config['title']
            description = smp_config['summary']
            for item in smp_config['items']:
                kind = item['kind']
                # Skip items that are neither TV nor radio programmes
                if kind != 'programme' and kind != 'radioProgramme':
                    continue
                programme_id = item.get('vpid')
                duration = int_or_none(item.get('duration'))
                formats, subtitles = self._download_media_selector(programme_id)
            # NOTE(review): nesting of this return relative to the loop is
            # reconstructed; it presumes at least one programme item was seen,
            # otherwise programme_id/duration/formats/subtitles are unbound
            return programme_id, title, description, duration, formats, subtitles
    except ExtractorError as ee:
        # Only an HTTP 404 cause is tolerated; it leads to the fallback below
        if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
            raise

    # fallback to legacy playlist
    return self._process_legacy_playlist(playlist_id)
424
425 def _process_legacy_playlist_url(self, url, display_id):
426 playlist = self._download_legacy_playlist_url(url, display_id)
427 return self._extract_from_legacy_playlist(playlist, display_id)
428
429 def _process_legacy_playlist(self, playlist_id):
430 return self._process_legacy_playlist_url(
431 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
432
433 def _download_legacy_playlist_url(self, url, playlist_id=None):
434 return self._download_xml(
435 url, playlist_id, 'Downloading legacy playlist XML')
ae6986fb 436
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme info from a legacy EMP playlist document.

    Returns a ``(programme_id, title, description, duration, formats,
    subtitles)`` tuple built from the playlist's programme items.

    Raises an expected ``ExtractorError`` when the playlist contains a
    ``noItems`` element; the message depends on its ``reason`` attribute.
    """
    no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First of the 'identifier' / 'group' attributes that looks like a pid
        for p in ('identifier', 'group'):
            value = element.get(p)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value

    def get_programme_id(item):
        # Prefer an id carried by the item itself. The result of this lookup
        # must be used: when it is dropped, only the mediator is ever consulted.
        programme_id = get_from_attributes(item)
        if programme_id:
            return programme_id
        mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
        if mediator is not None:
            return get_from_attributes(mediator)

    for item in self._extract_items(playlist):
        kind = item.get('kind')
        # Skip items that are neither TV nor radio programmes
        if kind != 'programme' and kind != 'radioProgramme':
            continue
        title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
        description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No programme id: the item itself is processed as a media
            # selection and the playlist id stands in as the programme id
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

    # NOTE(review): nesting of this return relative to the loop is
    # reconstructed; it presumes at least one programme item was seen
    return programme_id, title, description, duration, formats, subtitles
480
c056efa2
S
def _real_extract(self, url):
    """Extract a single programme from a bbc.co.uk programme/iPlayer page.

    The programme id (vpid) is read from the page's ``mediator.bind(...)``
    player JSON, or failing that from a ``"vpid"`` JSON fragment in the page.
    When neither yields a vpid, everything is resolved through the playlist
    for the id matched from the URL.
    """
    group_id = self._match_id(url)
    webpage = self._download_webpage(url, group_id, 'Downloading video page')

    duration = None
    programme_id = None

    player_json = self._search_regex(
        r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
        webpage, 'player', default=None)
    if player_json:
        player = self._parse_json(player_json, group_id).get('player', {})
        duration = int_or_none(player.get('duration'))
        programme_id = player.get('vpid')

    if not programme_id:
        vpid_pattern = r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX
        programme_id = self._search_regex(
            vpid_pattern, webpage, 'vpid', fatal=False, default=None)

    if not programme_id:
        # No vpid on the page: let the playlist provide all the metadata
        (programme_id, title, description,
         duration, formats, subtitles) = self._download_playlist(group_id)
    else:
        formats, subtitles = self._download_media_selector(programme_id)
        title = self._og_search_title(webpage, default=None)
        if not title:
            title = self._html_search_regex(
                (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
                 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'),
                webpage, 'title')
        description = self._search_regex(
            (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
             r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
            webpage, 'description', default=None) or self._html_search_meta(
                'description', webpage)

    self._sort_formats(formats)

    info = {
        'id': programme_id,
        'title': title,
        'description': description,
        'thumbnail': self._og_search_thumbnail(webpage, default=None),
        'duration': duration,
        'formats': formats,
        'subtitles': subtitles,
    }
    return info
10273d6e 527
528
9afa1770
S
529class BBCIE(BBCCoUkIE):
530 IE_NAME = 'bbc'
531 IE_DESC = 'BBC'
532 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 533
d12a1a47 534 _MEDIASELECTOR_URLS = [
55ebae26
S
535 # Provides HQ HLS streams but fails with geolocation in some cases when it's
536 # even not geo restricted at all
537 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
538 # Provides more formats, namely direct mp4 links, but fails on some videos with
539 # notukerror for non UK (?) users (e.g.
540 # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
541 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
542 # Provides fewer formats, but works everywhere for everybody (hopefully)
543 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
544 ]
10273d6e 545
546 _TESTS = [{
6a747190 547 # article with multiple videos embedded with data-playable containing vpids
10273d6e 548 'url': 'http://www.bbc.com/news/world-europe-32668511',
549 'info_dict': {
550 'id': 'world-europe-32668511',
551 'title': 'Russia stages massive WW2 parade despite Western boycott',
9afa1770 552 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 553 },
554 'playlist_count': 2,
a3bfddfa 555 }, {
6a747190 556 # article with multiple videos embedded with data-playable (more videos)
10273d6e 557 'url': 'http://www.bbc.com/news/business-28299555',
558 'info_dict': {
559 'id': 'business-28299555',
560 'title': 'Farnborough Airshow: Video highlights',
9afa1770 561 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 562 },
563 'playlist_count': 9,
9afa1770 564 'skip': 'Save time',
88ed52ae
S
565 }, {
566 # article with multiple videos embedded with `new SMP()`
6a747190 567 # broken
88ed52ae
S
568 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
569 'info_dict': {
570 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 571 'title': 'BUGGER',
88ed52ae
S
572 },
573 'playlist_count': 18,
a3bfddfa 574 }, {
6a747190 575 # single video embedded with data-playable containing vpid
10273d6e 576 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 577 'info_dict': {
578 'id': 'p02mprgb',
55ebae26 579 'ext': 'mp4',
10273d6e 580 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 581 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 582 'duration': 47,
9afa1770 583 'timestamp': 1427219242,
da92eeae 584 'upload_date': '20150324',
10273d6e 585 },
586 'params': {
9afa1770 587 # rtmp download
10273d6e 588 'skip_download': True,
589 }
a3bfddfa 590 }, {
6a747190
S
591 # article with single video embedded with data-playable containing XML playlist
592 # with direct video links as progressiveDownloadUrl (for now these are extracted)
593 # and playlist with f4m and m3u8 as streamingUrl
de939d89 594 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 595 'info_dict': {
9afa1770 596 'id': '150615_telabyad_kentin_cogu',
de939d89 597 'ext': 'mp4',
ad152e2d 598 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 599 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 600 'timestamp': 1434397334,
da92eeae 601 'upload_date': '20150615',
de939d89 602 },
603 'params': {
604 'skip_download': True,
605 }
c936d8cc 606 }, {
6a747190 607 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 608 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 609 'info_dict': {
9afa1770 610 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 611 'ext': 'mp4',
9afa1770 612 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 613 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 614 'timestamp': 1434713142,
da92eeae 615 'upload_date': '20150619',
de939d89 616 },
617 'params': {
618 'skip_download': True,
619 }
a346b1ff
S
620 }, {
621 # single video from video playlist embedded with vxp-playlist-data JSON
622 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
623 'info_dict': {
624 'id': 'p02w6qjc',
55ebae26 625 'ext': 'mp4',
a346b1ff
S
626 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
627 'duration': 56,
0bc4ee60 628 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
629 },
630 'params': {
631 'skip_download': True,
632 }
9afa1770
S
633 }, {
634 # single video story with digitalData
635 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
636 'info_dict': {
637 'id': 'p02q6gc4',
638 'ext': 'flv',
639 'title': 'Sri Lanka’s spicy secret',
640 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
641 'timestamp': 1437674293,
642 'upload_date': '20150723',
643 },
644 'params': {
645 # rtmp download
646 'skip_download': True,
647 }
648 }, {
649 # single video story without digitalData
650 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
651 'info_dict': {
652 'id': 'p018zqqg',
55ebae26 653 'ext': 'mp4',
9afa1770
S
654 'title': 'Hyundai Santa Fe Sport: Rock star',
655 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
656 'timestamp': 1415867444,
657 'upload_date': '20141113',
9afa1770
S
658 },
659 'params': {
660 # rtmp download
661 'skip_download': True,
662 }
9fb64c04
S
663 }, {
664 # single video embedded with Morph
665 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
666 'info_dict': {
667 'id': 'p041vhd0',
668 'ext': 'mp4',
669 'title': "Nigeria v Japan - Men's First Round",
670 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
671 'duration': 7980,
672 'uploader': 'BBC Sport',
673 'uploader_id': 'bbc_sport',
674 },
675 'params': {
676 # m3u8 download
677 'skip_download': True,
9fb64c04
S
678 },
679 'skip': 'Georestricted to UK',
9afa1770 680 }, {
6a747190 681 # single video with playlist.sxml URL in playlist param
9afa1770
S
682 'url': 'http://www.bbc.com/sport/0/football/33653409',
683 'info_dict': {
684 'id': 'p02xycnp',
55ebae26 685 'ext': 'mp4',
9afa1770 686 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 687 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
688 'duration': 140,
689 },
690 'params': {
691 # rtmp download
692 'skip_download': True,
693 }
b5d48cb1 694 }, {
6a747190 695 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
696 'url': 'http://www.bbc.com/sport/0/football/34475836',
697 'info_dict': {
698 'id': '34475836',
450b233c 699 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 700 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
701 },
702 'playlist_count': 3,
450b233c
S
703 }, {
704 # school report article with single video
705 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
706 'info_dict': {
707 'id': '35744779',
708 'title': 'School which breaks down barriers in Jerusalem',
709 },
710 'playlist_count': 1,
9afa1770
S
711 }, {
712 # single video with playlist URL from weather section
713 'url': 'http://www.bbc.com/weather/features/33601775',
714 'only_matching': True,
715 }, {
716 # custom redirection to www.bbc.com
717 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
718 'only_matching': True,
a1cf3e38
S
719 }, {
720 # single video article embedded with data-media-vpid
721 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
722 'only_matching': True,
10273d6e 723 }]
724
9afa1770
S
@classmethod
def suitable(cls, url):
    """Return whether this extractor should handle ``url``.

    URLs accepted by any of the extractors listed in ``EXCLUDE_IE`` are
    rejected here; everything else is decided by the inherited ``suitable``.
    """
    EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
    for ie in EXCLUDE_IE:
        if ie.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
730
731 def _extract_from_media_meta(self, media_meta, video_id):
732 # Direct links to media in media metadata (e.g.
733 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
734 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
735 source_files = media_meta.get('sourceFiles')
736 if source_files:
737 return [{
738 'url': f['url'],
739 'format_id': format_id,
740 'ext': f.get('encoding'),
741 'tbr': float_or_none(f.get('bitrate'), 1000),
742 'filesize': int_or_none(f.get('filesize')),
743 } for format_id, f in source_files.items() if f.get('url')], []
744
745 programme_id = media_meta.get('externalId')
746 if programme_id:
747 return self._download_media_selector(programme_id)
748
749 # Process playlist.sxml as legacy playlist
750 href = media_meta.get('href')
751 if href:
752 playlist = self._download_legacy_playlist_url(href)
753 _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
754 return formats, subtitles
755
756 return [], []
757
baf39a1a
S
758 def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
759 programme_id, title, description, duration, formats, subtitles = \
760 self._process_legacy_playlist_url(url, playlist_id)
761 self._sort_formats(formats)
762 return {
763 'id': programme_id,
764 'title': title,
765 'description': description,
766 'duration': duration,
767 'timestamp': timestamp,
768 'formats': formats,
769 'subtitles': subtitles,
770 }
771
10273d6e 772 def _real_extract(self, url):
9afa1770
S
773 playlist_id = self._match_id(url)
774
775 webpage = self._download_webpage(url, playlist_id)
776
522f6c06 777 json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
350e02d4 778 timestamp = json_ld_info.get('timestamp')
0e832c2c 779
350e02d4 780 playlist_title = json_ld_info.get('title')
0e832c2c
S
781 if not playlist_title:
782 playlist_title = self._og_search_title(
783 webpage, default=None) or self._html_search_regex(
784 r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
785 if playlist_title:
786 playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
787
788 playlist_description = json_ld_info.get(
789 'description') or self._og_search_description(webpage, default=None)
ae8bdfd1
S
790
791 if not timestamp:
792 timestamp = parse_iso8601(self._search_regex(
793 [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
794 r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
6f789365 795 r'"datePublished":\s*"([^"]+)'],
ae8bdfd1 796 webpage, 'date', default=None))
9afa1770 797
78f9d843
S
798 entries = []
799
de665713
S
800 # article with multiple videos embedded with playlist.sxml (e.g.
801 # http://www.bbc.com/sport/0/football/34475836)
802 playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
222e11d4 803 playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
de665713 804 if playlists:
baf39a1a
S
805 entries = [
806 self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
807 for playlist_url in playlists]
de939d89 808
78f9d843
S
809 # news article with multiple videos embedded with data-playable
810 data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
811 if data_playables:
812 for _, data_playable_json in data_playables:
813 data_playable = self._parse_json(
814 unescapeHTML(data_playable_json), playlist_id, fatal=False)
815 if not data_playable:
816 continue
baf39a1a
S
817 settings = data_playable.get('settings', {})
818 if settings:
78f9d843
S
819 # data-playable with video vpid in settings.playlistObject.items (e.g.
820 # http://www.bbc.com/news/world-us-canada-34473351)
baf39a1a
S
821 playlist_object = settings.get('playlistObject', {})
822 if playlist_object:
823 items = playlist_object.get('items')
824 if items and isinstance(items, list):
78f9d843
S
825 title = playlist_object['title']
826 description = playlist_object.get('summary')
baf39a1a
S
827 duration = int_or_none(items[0].get('duration'))
828 programme_id = items[0].get('vpid')
78f9d843
S
829 formats, subtitles = self._download_media_selector(programme_id)
830 self._sort_formats(formats)
831 entries.append({
832 'id': programme_id,
833 'title': title,
834 'description': description,
835 'timestamp': timestamp,
836 'duration': duration,
837 'formats': formats,
838 'subtitles': subtitles,
839 })
840 else:
841 # data-playable without vpid but with playlist.sxml URLs
842 # in otherSettings.playlist (e.g.
843 # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
844 playlist = data_playable.get('otherSettings', {}).get('playlist', {})
845 if playlist:
a7e5f274
RA
846 entry = None
847 for key in ('streaming', 'progressiveDownload'):
05087d1b
S
848 playlist_url = playlist.get('%sUrl' % key)
849 if not playlist_url:
850 continue
851 try:
a7e5f274
RA
852 info = self._extract_from_playlist_sxml(
853 playlist_url, playlist_id, timestamp)
854 if not entry:
855 entry = info
856 else:
857 entry['title'] = info['title']
858 entry['formats'].extend(info['formats'])
05087d1b
S
859 except Exception as e:
860 # Some playlist URL may fail with 500, at the same time
861 # the other one may work fine (e.g.
862 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
863 if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
864 continue
865 raise
a7e5f274
RA
866 if entry:
867 self._sort_formats(entry['formats'])
868 entries.append(entry)
78f9d843
S
869
870 if entries:
78f9d843
S
871 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
872
873 # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
874 programme_id = self._search_regex(
a1cf3e38 875 [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
22d7368d
S
876 r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
877 r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
78f9d843 878 webpage, 'vpid', default=None)
dab062fb 879
9afa1770
S
880 if programme_id:
881 formats, subtitles = self._download_media_selector(programme_id)
882 self._sort_formats(formats)
883 # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
884 digital_data = self._parse_json(
885 self._search_regex(
886 r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
887 programme_id, fatal=False)
888 page_info = digital_data.get('page', {}).get('pageInfo', {})
889 title = page_info.get('pageName') or self._og_search_title(webpage)
890 description = page_info.get('description') or self._og_search_description(webpage)
891 timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
892 return {
893 'id': programme_id,
894 'title': title,
895 'description': description,
896 'timestamp': timestamp,
897 'formats': formats,
898 'subtitles': subtitles,
899 }
a3bfddfa 900
9fb64c04
S
901 # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
902 # Several setPayload calls may be present, but the video
903 # always seems to be related to the first one
904 morph_payload = self._parse_json(
905 self._search_regex(
906 r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
907 webpage, 'morph payload', default='{}'),
908 playlist_id, fatal=False)
909 if morph_payload:
910 components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
911 for component in components:
912 if not isinstance(component, dict):
913 continue
914 lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
915 if not lead_media:
916 continue
917 identifiers = lead_media.get('identifiers')
918 if not identifiers or not isinstance(identifiers, dict):
919 continue
920 programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
921 if not programme_id:
922 continue
923 title = lead_media.get('title') or self._og_search_title(webpage)
924 formats, subtitles = self._download_media_selector(programme_id)
925 self._sort_formats(formats)
926 description = lead_media.get('summary')
927 uploader = lead_media.get('masterBrand')
928 uploader_id = lead_media.get('mid')
929 duration = None
930 duration_d = lead_media.get('duration')
931 if isinstance(duration_d, dict):
932 duration = parse_duration(dict_get(
933 duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
934 return {
935 'id': programme_id,
936 'title': title,
937 'description': description,
938 'duration': duration,
939 'uploader': uploader,
940 'uploader_id': uploader_id,
941 'formats': formats,
942 'subtitles': subtitles,
943 }
944
88ed52ae
S
945 def extract_all(pattern):
946 return list(filter(None, map(
947 lambda s: self._parse_json(s, playlist_id, fatal=False),
948 re.findall(pattern, webpage))))
949
950 # Multiple video article (e.g.
951 # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
22d7368d 952 EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
88ed52ae
S
953 entries = []
954 for match in extract_all(r'new\s+SMP\(({.+?})\)'):
955 embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
956 if embed_url and re.match(EMBED_URL, embed_url):
957 entries.append(embed_url)
958 entries.extend(re.findall(
959 r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
960 if entries:
961 return self.playlist_result(
aaa42cf0 962 [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
88ed52ae 963 playlist_id, playlist_title, playlist_description)
9afa1770
S
964
965 # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
88ed52ae 966 medias = extract_all(r"data-media-meta='({[^']+})'")
9afa1770
S
967
968 if not medias:
969 # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
a346b1ff
S
970 media_asset = self._search_regex(
971 r'mediaAssetPage\.init\(\s*({.+?}), "/',
972 webpage, 'media asset', default=None)
973 if media_asset:
974 media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
975 medias = []
976 for video in media_asset_page.get('videos', {}).values():
977 medias.extend(video.values())
978
979 if not medias:
980 # Multiple video playlist with single `now playing` entry (e.g.
981 # http://www.bbc.com/news/video_and_audio/must_see/33767813)
982 vxp_playlist = self._parse_json(
9afa1770 983 self._search_regex(
a346b1ff
S
984 r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
985 webpage, 'playlist data'),
9afa1770 986 playlist_id)
a346b1ff
S
987 playlist_medias = []
988 for item in vxp_playlist:
989 media = item.get('media')
990 if not media:
991 continue
992 playlist_medias.append(media)
993 # Download single video if found media with asset id matching the video id from URL
994 if item.get('advert', {}).get('assetId') == playlist_id:
995 medias = [media]
996 break
997 # Fallback to the whole playlist
998 if not medias:
999 medias = playlist_medias
9afa1770
S
1000
1001 entries = []
1002 for num, media_meta in enumerate(medias, start=1):
1003 formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
1004 if not formats:
1005 continue
10273d6e 1006 self._sort_formats(formats)
1007
9afa1770
S
1008 video_id = media_meta.get('externalId')
1009 if not video_id:
1010 video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
1011
1012 title = media_meta.get('caption')
1013 if not title:
1014 title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
1015
1016 duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
da92eeae 1017
9afa1770
S
1018 images = []
1019 for image in media_meta.get('images', {}).values():
1020 images.extend(image.values())
1021 if 'image' in media_meta:
1022 images.append(media_meta['image'])
1023
1024 thumbnails = [{
1025 'url': image.get('href'),
1026 'width': int_or_none(image.get('width')),
1027 'height': int_or_none(image.get('height')),
1028 } for image in images]
1029
1030 entries.append({
1031 'id': video_id,
10273d6e 1032 'title': title,
9afa1770 1033 'thumbnails': thumbnails,
10273d6e 1034 'duration': duration,
9afa1770 1035 'timestamp': timestamp,
10273d6e 1036 'formats': formats,
1037 'subtitles': subtitles,
a3bfddfa 1038 })
10273d6e 1039
9afa1770 1040 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1041
1042
class BBCCoUkArticleIE(InfoExtractor):
    # Extractor for bbc.co.uk/programmes/articles/<id> pages. The page is
    # turned into a playlist of the clip URLs it references.
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist of the clips referenced by the article page.

        The playlist id is the ``id`` group of ``_VALID_URL``; title and
        description come from the page's Open Graph metadata. Each entry is
        a URL result for a ``<div typeof="Clip" resource="...">`` element.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)

        # The description is optional metadata: only strip it when one was
        # actually found, otherwise ``None.strip()`` would abort extraction.
        description = self._og_search_description(webpage)
        if description:
            description = description.strip()

        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1071
1072
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    # Shared logic for paginated bbc.co.uk playlists. The methods below read
    # the following members, which this class does not define itself:
    #   _VIDEO_ID_TEMPLATE  regex template, %-formatted with BBCCoUkIE._ID_REGEX
    #   _URL_TEMPLATE       URL template, %-formatted with a matched video id
    #   _extract_title_and_description(webpage) -> (title, description)

    def _entries(self, webpage, url, playlist_id):
        """Yield a URL result for every video id found on the playlist pages.

        Starts with the already downloaded ``webpage`` and keeps following
        the "pagination next" link until there is none. If ``url`` carries
        an explicit ``page`` query parameter only that page is processed.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # The first page is already in hand, so numbering of the pages that
        # get downloaded inside the loop starts at 2.
        for page_num in itertools.count(2):
            for video_id in re.findall(
                    self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # Third argument is the progress note, fourth the error note;
            # both are strings (previously the bare page number was passed
            # as the error note).
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                'Downloading page %d' % page_num,
                'Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Download the playlist page and return a lazily evaluated playlist."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
ded7511a
S
1103
1104
class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
    # iPlayer "episodes" and "group" listing pages; items are identified by
    # their data-ip-id attribute and mapped to iPlayer episode URLs.
    IE_NAME = 'bbc.co.uk:iplayer:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
    _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
    _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
    _TESTS = [
        {
            'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
            'info_dict': {
                'id': 'b05rcz9v',
                'title': 'The Disappearance',
                'description': 'French thriller serial about a missing teenager.',
            },
            'playlist_mincount': 6,
            'skip': 'This programme is not currently available on BBC iPlayer',
        },
        {
            # Available for over a year unlike 30 days for most other programmes
            'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
            'info_dict': {
                'id': 'p02tcc32',
                'title': 'Bohemian Icons',
                'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
            },
            'playlist_mincount': 10,
        },
    ]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` scraped from the listing page.

        The title is the text of the first ``<h1>``, the description the text
        of the ``<p class="subtitle">`` element. Both lookups are non-fatal.
        """
        heading_re = r'<h1>([^<]+)</h1>'
        subtitle_re = r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>'
        return (
            self._search_regex(heading_re, webpage, 'title', fatal=False),
            self._search_regex(
                subtitle_re, webpage, 'description', fatal=False, group='value'),
        )
1136
1137
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    # /programmes/<pid>/(episodes|broadcasts|clips) listing pages; items are
    # identified by their data-pid attribute and mapped to programme URLs.
    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [
        {
            'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
            'info_dict': {
                'id': 'b05rcz9v',
                'title': 'The Disappearance - Clips - BBC Four',
                'description': 'French thriller serial about a missing teenager.',
            },
            'playlist_mincount': 7,
        },
        {
            # multipage playlist, explicit page
            'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
            'info_dict': {
                'id': 'b00mfl7n',
                'title': 'Frozen Planet - Clips - BBC One',
                'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
            },
            'playlist_mincount': 24,
        },
        {
            # multipage playlist, all pages
            'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
            'info_dict': {
                'id': 'b00mfl7n',
                'title': 'Frozen Planet - Clips - BBC One',
                'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
            },
            'playlist_mincount': 142,
        },
        {
            'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
            'only_matching': True,
        },
        {
            'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
            'only_matching': True,
        },
        {
            'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
            'only_matching': True,
        },
    ]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` from the page's Open Graph metadata."""
        return (
            self._og_search_title(webpage, fatal=False),
            self._og_search_description(webpage),
        )