]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/bbc.py
[chirbit] Fix extraction (Closes #10296)
[yt-dlp.git] / youtube_dl / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
9afa1770 4import re
082c6c86 5
f13b1e7d 6from .common import InfoExtractor
8683b4d8 7from ..utils import (
9fb64c04 8 dict_get,
8683b4d8 9 ExtractorError,
9afa1770 10 float_or_none,
8683b4d8 11 int_or_none,
9afa1770
S
12 parse_duration,
13 parse_iso8601,
9fb64c04 14 try_get,
dab062fb 15 unescapeHTML,
8683b4d8 16)
36e6f62c
JMF
17from ..compat import (
18 compat_etree_fromstring,
19 compat_HTTPError,
20)
082c6c86 21
d12a1a47 22
class BBCCoUkIE(InfoExtractor):
    """Extractor for bbc.co.uk programme, iPlayer episode/playlist, music clip
    and radio player URLs (see ``_VALID_URL``)."""

    IE_NAME = 'bbc.co.uk'
    IE_DESC = 'BBC iPlayer'
    # Pattern of a BBC programme/version ID, e.g. ``b039d07m`` or ``p022h44j``.
    _ID_REGEX = r'[pb][\da-z]{7}'
    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?bbc\.co\.uk/
                        (?:
                            programmes/(?!articles/)|
                            iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
                            music/clips[/#]|
                            radio/player/
                        )
                        (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
                    ''' % _ID_REGEX

    # Media selector endpoints, tried in order by _download_media_selector.
    _MEDIASELECTOR_URLS = [
        # Provides HQ HLS streams with even better quality than pc mediaset but fails
        # with geolocation in some cases when it's even not geo restricted at all (e.g.
        # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
    ]

    _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
    _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'

    # XML namespaces searched by _findall_ns.
    _NAMESPACES = (
        _MEDIASELECTION_NS,
        _EMP_PLAYLIST_NS,
    )

    _TESTS = [
        {
            'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
            'info_dict': {
                'id': 'b039d07m',
                'ext': 'flv',
                'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4',
                'description': 'The Canadian poet and songwriter reflects on his musical career.',
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
            'info_dict': {
                'id': 'b00yng1d',
                'ext': 'flv',
                'title': 'The Man in Black: Series 3: The Printed Name',
                'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
                'duration': 1800,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Episode is no longer available on BBC iPlayer Radio',
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
            'info_dict': {
                'id': 'b00yng1d',
                'ext': 'flv',
                'title': 'The Voice UK: Series 3: Blind Auditions 5',
                'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
                'duration': 5100,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
            'info_dict': {
                'id': 'b03k3pb7',
                'ext': 'flv',
                'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
                'description': '2. Invasion',
                'duration': 3600,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
        }, {
            'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
            'info_dict': {
                'id': 'b04v209v',
                'ext': 'flv',
                'title': 'Pete Tong, The Essential New Tune Special',
                'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
                'duration': 10800,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Episode is no longer available on BBC iPlayer Radio',
        }, {
            'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
            'note': 'Audio',
            'info_dict': {
                'id': 'p022h44j',
                'ext': 'flv',
                'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
                'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
                'duration': 227,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        }, {
            'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
            'note': 'Video',
            'info_dict': {
                'id': 'p025c103',
                'ext': 'flv',
                'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
                'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
                'duration': 226,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            }
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
            'info_dict': {
                'id': 'p02n76xf',
                'ext': 'flv',
                'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
                'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
                'duration': 3540,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'geolocation',
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
            'info_dict': {
                'id': 'b05zmgw1',
                'ext': 'flv',
                'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
                'title': 'Royal Academy Summer Exhibition',
                'duration': 3540,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'geolocation',
        }, {
            # iptv-all mediaset fails with geolocation however there is no geo restriction
            # for this programme at all
            'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
            'info_dict': {
                'id': 'b06rkms3',
                'ext': 'flv',
                'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
                'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Now it\'s really geo-restricted',
        }, {
            # compact player (https://github.com/rg3/youtube-dl/issues/8147)
            'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
            'info_dict': {
                'id': 'p028bfkj',
                'ext': 'flv',
                'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
                'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
            'only_matching': True,
        }, {
            'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
            'only_matching': True,
        }
    ]

    class MediaSelectionError(Exception):
        """Raised when a media selection document contains an ``error`` element.

        ``id`` holds the value of that element's ``id`` attribute.
        """

        def __init__(self, id):
            # Initialise the base class so str()/args carry the error id too.
            Exception.__init__(self, id)
            self.id = id

    def _extract_asx_playlist(self, connection, programme_id):
        """Download the ASX playlist referenced by ``connection`` and return
        the list of ``href`` values of its ``Entry/ref`` elements."""
        asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
        return [ref.get('href') for ref in asx.findall('./Entry/ref')]

    def _extract_items(self, playlist):
        """Return the direct ``item`` children (EMP playlist namespace) of ``playlist``."""
        return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)

    def _findall_ns(self, element, xpath):
        """Run ``xpath`` once per namespace in ``_NAMESPACES`` and return all matches.

        ``xpath`` must contain a single ``%s`` placeholder for the namespace URI.
        """
        elements = []
        for ns in self._NAMESPACES:
            elements.extend(element.findall(xpath % ns))
        return elements

    def _extract_medias(self, media_selection):
        """Return the ``media`` elements of a media selection document.

        Raises ``MediaSelectionError`` if the document has an ``error`` element
        in either the media selection or the EMP playlist namespace.
        """
        error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
        if error is None:
            # Fall back to the EMP playlist namespace; the result must be
            # kept, otherwise errors reported in that namespace are ignored.
            error = media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
        if error is not None:
            raise BBCCoUkIE.MediaSelectionError(error.get('id'))
        return self._findall_ns(media_selection, './{%s}media')

    def _extract_connections(self, media):
        """Return the ``connection`` children of ``media`` in any known namespace."""
        return self._findall_ns(media, './{%s}connection')

    def _get_subtitles(self, media, programme_id):
        """Build the subtitles dict for a captions ``media`` element.

        Each connection's captions document is downloaded to read its
        ``xml:lang`` attribute (defaulting to ``'en'``); the connection URL is
        registered under that language as a TTML subtitle.
        """
        subtitles = {}
        for connection in self._extract_connections(media):
            captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
            lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
            subtitles[lang] = [
                {
                    'url': connection.get('href'),
                    'ext': 'ttml',
                },
            ]
        return subtitles

    def _raise_extractor_error(self, media_selection_error):
        """Re-raise a ``MediaSelectionError`` as an expected ``ExtractorError``."""
        raise ExtractorError(
            '%s returned error: %s' % (self.IE_NAME, media_selection_error.id),
            expected=True)

    def _download_media_selector(self, programme_id):
        """Try each URL in ``_MEDIASELECTOR_URLS`` and return ``(formats, subtitles)``
        from the first one that succeeds.

        ``notukerror``, ``geolocation`` and ``selectionunavailable`` errors make
        the next URL be tried; any other media selection error is raised as an
        ``ExtractorError`` immediately. If every URL fails, the last error is raised.
        """
        last_exception = None
        for mediaselector_url in self._MEDIASELECTOR_URLS:
            try:
                return self._download_media_selector_url(
                    mediaselector_url % programme_id, programme_id)
            except BBCCoUkIE.MediaSelectionError as e:
                if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
                    last_exception = e
                    continue
                self._raise_extractor_error(e)
        self._raise_extractor_error(last_exception)

    def _download_media_selector_url(self, url, programme_id=None):
        """Download the media selection XML at ``url`` and process it.

        When the download fails with HTTP 403 or 404, the response body is
        parsed as the media selection document instead, so that an error
        element in it can be reported; other failures are re-raised.
        """
        try:
            media_selection = self._download_xml(
                url, programme_id, 'Downloading media selection XML')
        except ExtractorError as ee:
            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404):
                media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))
            else:
                raise
        return self._process_media_selector(media_selection, programme_id)

    def _process_media_selector(self, media_selection, programme_id):
        """Convert a media selection document into ``(formats, subtitles)``.

        ``subtitles`` is ``None`` when the document has no captions media.
        """
        formats = []
        subtitles = None
        # hrefs already processed, used to skip duplicate connections
        urls = []

        for media in self._extract_medias(media_selection):
            kind = media.get('kind')
            if kind in ('video', 'audio'):
                bitrate = int_or_none(media.get('bitrate'))
                encoding = media.get('encoding')
                service = media.get('service')
                width = int_or_none(media.get('width'))
                height = int_or_none(media.get('height'))
                file_size = int_or_none(media.get('media_file_size'))
                for connection in self._extract_connections(media):
                    href = connection.get('href')
                    if href in urls:
                        continue
                    if href:
                        urls.append(href)
                    conn_kind = connection.get('kind')
                    protocol = connection.get('protocol')
                    supplier = connection.get('supplier')
                    transfer_format = connection.get('transferFormat')
                    format_id = supplier or conn_kind or protocol
                    if service:
                        format_id = '%s_%s' % (service, format_id)
                    # ASX playlist
                    if supplier == 'asx':
                        for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
                            formats.append({
                                'url': ref,
                                'format_id': 'ref%s_%s' % (i, format_id),
                            })
                    elif transfer_format == 'dash':
                        formats.extend(self._extract_mpd_formats(
                            href, programme_id, mpd_id=format_id, fatal=False))
                    elif transfer_format == 'hls':
                        formats.extend(self._extract_m3u8_formats(
                            href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                            m3u8_id=format_id, fatal=False))
                    elif transfer_format == 'hds':
                        formats.extend(self._extract_f4m_formats(
                            href, programme_id, f4m_id=format_id, fatal=False))
                    else:
                        if not service and not supplier and bitrate:
                            format_id += '-%d' % bitrate
                        fmt = {
                            'format_id': format_id,
                            'filesize': file_size,
                        }
                        if kind == 'video':
                            fmt.update({
                                'width': width,
                                'height': height,
                                'vbr': bitrate,
                                'vcodec': encoding,
                            })
                        else:
                            fmt.update({
                                'abr': bitrate,
                                'acodec': encoding,
                                'vcodec': 'none',
                            })
                        if protocol == 'http':
                            # Direct link
                            fmt.update({
                                'url': href,
                            })
                        elif protocol == 'rtmp':
                            application = connection.get('application', 'ondemand')
                            auth_string = connection.get('authString')
                            identifier = connection.get('identifier')
                            server = connection.get('server')
                            fmt.update({
                                'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                                'play_path': identifier,
                                'app': '%s?%s' % (application, auth_string),
                                'page_url': 'http://www.bbc.co.uk',
                                'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                                'rtmp_live': False,
                                'ext': 'flv',
                            })
                        formats.append(fmt)
            elif kind == 'captions':
                subtitles = self.extract_subtitles(media, programme_id)
        return formats, subtitles

    def _download_playlist(self, playlist_id):
        """Return ``(programme_id, title, description, duration, formats, subtitles)``
        for ``playlist_id``.

        The JSON playlist is tried first; if it answers with HTTP 404, or has
        no usable programme item, the legacy XML playlist is used instead.
        """
        try:
            playlist = self._download_json(
                'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
                playlist_id, 'Downloading playlist JSON')

            version = playlist.get('defaultAvailableVersion')
            if version:
                smp_config = version['smpConfig']
                title = smp_config['title']
                description = smp_config['summary']
                for item in smp_config['items']:
                    kind = item['kind']
                    if kind != 'programme' and kind != 'radioProgramme':
                        continue
                    programme_id = item.get('vpid')
                    duration = int_or_none(item.get('duration'))
                    formats, subtitles = self._download_media_selector(programme_id)
                return programme_id, title, description, duration, formats, subtitles
        except ExtractorError as ee:
            if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
                raise

        # fallback to legacy playlist
        return self._process_legacy_playlist(playlist_id)

    def _process_legacy_playlist_url(self, url, display_id):
        """Download the legacy XML playlist at ``url`` and extract its media info."""
        playlist = self._download_legacy_playlist_url(url, display_id)
        return self._extract_from_legacy_playlist(playlist, display_id)

    def _process_legacy_playlist(self, playlist_id):
        """Process the legacy iPlayer playlist belonging to ``playlist_id``."""
        return self._process_legacy_playlist_url(
            'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)

    def _download_legacy_playlist_url(self, url, playlist_id=None):
        """Download and return the legacy playlist XML document at ``url``."""
        return self._download_xml(
            url, playlist_id, 'Downloading legacy playlist XML')

    def _extract_from_legacy_playlist(self, playlist, playlist_id):
        """Extract ``(programme_id, title, description, duration, formats, subtitles)``
        from a legacy XML playlist.

        Raises an expected ``ExtractorError`` when the playlist carries a
        ``noItems`` element, and an ``ExtractorError`` when it contains no item
        of kind ``programme`` or ``radioProgramme``. When several such items
        exist, the data of the last one is returned.
        """
        no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
        if no_items is not None:
            reason = no_items.get('reason')
            if reason == 'preAvailability':
                msg = 'Episode %s is not yet available' % playlist_id
            elif reason == 'postAvailability':
                msg = 'Episode %s is no longer available' % playlist_id
            elif reason == 'noMedia':
                msg = 'Episode %s is not currently available' % playlist_id
            else:
                msg = 'Episode %s is not available: %s' % (playlist_id, reason)
            raise ExtractorError(msg, expected=True)

        def get_from_attributes(element):
            # First attribute whose value looks like a programme ID, else None.
            for p in ('identifier', 'group'):
                value = element.get(p)
                if value and re.match(r'^%s$' % self._ID_REGEX, value):
                    return value

        def get_programme_id(item):
            # Prefer the item's own attributes, then its mediator child.
            programme_id = get_from_attributes(item)
            if programme_id:
                return programme_id
            mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
            if mediator is not None:
                return get_from_attributes(mediator)

        result = None
        for item in self._extract_items(playlist):
            kind = item.get('kind')
            if kind != 'programme' and kind != 'radioProgramme':
                continue
            title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
            description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
            description = description_el.text if description_el is not None else None

            programme_id = get_programme_id(item)
            duration = int_or_none(item.get('duration'))

            if programme_id:
                formats, subtitles = self._download_media_selector(programme_id)
            else:
                # No programme ID: treat the item itself as the media selection
                formats, subtitles = self._process_media_selector(item, playlist_id)
                programme_id = playlist_id

            result = (programme_id, title, description, duration, formats, subtitles)

        if result is None:
            # Previously this fell through to an UnboundLocalError.
            raise ExtractorError(
                'No playable items found in playlist %s' % playlist_id)

        return result

    def _real_extract(self, url):
        """Extract the info dict for a single bbc.co.uk programme page."""
        group_id = self._match_id(url)

        webpage = self._download_webpage(url, group_id, 'Downloading video page')

        programme_id = None
        duration = None

        tviplayer = self._search_regex(
            r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
            webpage, 'player', default=None)

        if tviplayer:
            player = self._parse_json(tviplayer, group_id).get('player', {})
            duration = int_or_none(player.get('duration'))
            programme_id = player.get('vpid')

        if not programme_id:
            programme_id = self._search_regex(
                r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
            title = self._og_search_title(webpage, default=None) or self._html_search_regex(
                (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
                 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
            description = self._search_regex(
                (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
                 r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
                webpage, 'description', default=None)
            if not description:
                description = self._html_search_meta('description', webpage)
        else:
            # No vpid on the page: resolve everything through the playlist
            programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)

        self._sort_formats(formats)

        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'thumbnail': self._og_search_thumbnail(webpage, default=None),
            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
        }
10273d6e 512
513
9afa1770
S
514class BBCIE(BBCCoUkIE):
515 IE_NAME = 'bbc'
516 IE_DESC = 'BBC'
517 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 518
d12a1a47 519 _MEDIASELECTOR_URLS = [
55ebae26
S
520 # Provides HQ HLS streams but fails with geolocation in some cases when it's
521 # even not geo restricted at all
522 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
523 # Provides more formats, namely direct mp4 links, but fails on some videos with
524 # notukerror for non UK (?) users (e.g.
525 # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
526 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
527 # Provides fewer formats, but works everywhere for everybody (hopefully)
528 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
529 ]
10273d6e 530
531 _TESTS = [{
6a747190 532 # article with multiple videos embedded with data-playable containing vpids
10273d6e 533 'url': 'http://www.bbc.com/news/world-europe-32668511',
534 'info_dict': {
535 'id': 'world-europe-32668511',
536 'title': 'Russia stages massive WW2 parade despite Western boycott',
9afa1770 537 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 538 },
539 'playlist_count': 2,
a3bfddfa 540 }, {
6a747190 541 # article with multiple videos embedded with data-playable (more videos)
10273d6e 542 'url': 'http://www.bbc.com/news/business-28299555',
543 'info_dict': {
544 'id': 'business-28299555',
545 'title': 'Farnborough Airshow: Video highlights',
9afa1770 546 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 547 },
548 'playlist_count': 9,
9afa1770 549 'skip': 'Save time',
88ed52ae
S
550 }, {
551 # article with multiple videos embedded with `new SMP()`
6a747190 552 # broken
88ed52ae
S
553 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
554 'info_dict': {
555 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 556 'title': 'BUGGER',
88ed52ae
S
557 },
558 'playlist_count': 18,
a3bfddfa 559 }, {
6a747190 560 # single video embedded with data-playable containing vpid
10273d6e 561 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 562 'info_dict': {
563 'id': 'p02mprgb',
55ebae26 564 'ext': 'mp4',
10273d6e 565 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 566 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 567 'duration': 47,
9afa1770 568 'timestamp': 1427219242,
da92eeae 569 'upload_date': '20150324',
10273d6e 570 },
571 'params': {
9afa1770 572 # rtmp download
10273d6e 573 'skip_download': True,
574 }
a3bfddfa 575 }, {
6a747190
S
576 # article with single video embedded with data-playable containing XML playlist
577 # with direct video links as progressiveDownloadUrl (for now these are extracted)
578 # and playlist with f4m and m3u8 as streamingUrl
de939d89 579 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 580 'info_dict': {
9afa1770 581 'id': '150615_telabyad_kentin_cogu',
de939d89 582 'ext': 'mp4',
ad152e2d 583 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 584 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 585 'timestamp': 1434397334,
da92eeae 586 'upload_date': '20150615',
de939d89 587 },
588 'params': {
589 'skip_download': True,
590 }
c936d8cc 591 }, {
6a747190 592 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 593 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 594 'info_dict': {
9afa1770 595 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 596 'ext': 'mp4',
9afa1770 597 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 598 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 599 'timestamp': 1434713142,
da92eeae 600 'upload_date': '20150619',
de939d89 601 },
602 'params': {
603 'skip_download': True,
604 }
a346b1ff
S
605 }, {
606 # single video from video playlist embedded with vxp-playlist-data JSON
607 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
608 'info_dict': {
609 'id': 'p02w6qjc',
55ebae26 610 'ext': 'mp4',
a346b1ff
S
611 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
612 'duration': 56,
0bc4ee60 613 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
614 },
615 'params': {
616 'skip_download': True,
617 }
9afa1770
S
618 }, {
619 # single video story with digitalData
620 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
621 'info_dict': {
622 'id': 'p02q6gc4',
623 'ext': 'flv',
624 'title': 'Sri Lanka’s spicy secret',
625 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
626 'timestamp': 1437674293,
627 'upload_date': '20150723',
628 },
629 'params': {
630 # rtmp download
631 'skip_download': True,
632 }
633 }, {
634 # single video story without digitalData
635 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
636 'info_dict': {
637 'id': 'p018zqqg',
55ebae26 638 'ext': 'mp4',
9afa1770
S
639 'title': 'Hyundai Santa Fe Sport: Rock star',
640 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
641 'timestamp': 1415867444,
642 'upload_date': '20141113',
9afa1770
S
643 },
644 'params': {
645 # rtmp download
646 'skip_download': True,
647 }
9fb64c04
S
648 }, {
649 # single video embedded with Morph
650 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
651 'info_dict': {
652 'id': 'p041vhd0',
653 'ext': 'mp4',
654 'title': "Nigeria v Japan - Men's First Round",
655 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
656 'duration': 7980,
657 'uploader': 'BBC Sport',
658 'uploader_id': 'bbc_sport',
659 },
660 'params': {
661 # m3u8 download
662 'skip_download': True,
9fb64c04
S
663 },
664 'skip': 'Georestricted to UK',
9afa1770 665 }, {
6a747190 666 # single video with playlist.sxml URL in playlist param
9afa1770
S
667 'url': 'http://www.bbc.com/sport/0/football/33653409',
668 'info_dict': {
669 'id': 'p02xycnp',
55ebae26 670 'ext': 'mp4',
9afa1770 671 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 672 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
673 'duration': 140,
674 },
675 'params': {
676 # rtmp download
677 'skip_download': True,
678 }
b5d48cb1 679 }, {
6a747190 680 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
681 'url': 'http://www.bbc.com/sport/0/football/34475836',
682 'info_dict': {
683 'id': '34475836',
450b233c 684 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 685 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
686 },
687 'playlist_count': 3,
450b233c
S
688 }, {
689 # school report article with single video
690 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
691 'info_dict': {
692 'id': '35744779',
693 'title': 'School which breaks down barriers in Jerusalem',
694 },
695 'playlist_count': 1,
9afa1770
S
696 }, {
697 # single video with playlist URL from weather section
698 'url': 'http://www.bbc.com/weather/features/33601775',
699 'only_matching': True,
700 }, {
701 # custom redirection to www.bbc.com
702 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
703 'only_matching': True,
a1cf3e38
S
704 }, {
705 # single video article embedded with data-media-vpid
706 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
707 'only_matching': True,
10273d6e 708 }]
709
9afa1770
S
@classmethod
def suitable(cls, url):
    """Return whether this generic BBC extractor should handle ``url``.

    URLs accepted by any of the more specific BBC extractors listed below
    are rejected here; otherwise the decision is left to the parent class.
    """
    more_specific = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
    for ie in more_specific:
        if ie.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
715
def _extract_from_media_meta(self, media_meta, video_id):
    """Return ``(formats, subtitles)`` for one media metadata dict.

    Sources are tried in this order: direct links under ``sourceFiles``,
    the media selector for ``externalId``, then the legacy playlist at
    ``href``. If none is present, two empty lists are returned.
    """
    # Direct links to media in media metadata (e.g.
    # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
    # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
    source_files = media_meta.get('sourceFiles')
    if source_files:
        direct_formats = []
        for format_id, f in source_files.items():
            if not f.get('url'):
                continue
            direct_formats.append({
                'url': f['url'],
                'format_id': format_id,
                'ext': f.get('encoding'),
                'tbr': float_or_none(f.get('bitrate'), 1000),
                'filesize': int_or_none(f.get('filesize')),
            })
        return direct_formats, []

    programme_id = media_meta.get('externalId')
    if programme_id:
        return self._download_media_selector(programme_id)

    # Process playlist.sxml as legacy playlist
    href = media_meta.get('href')
    if not href:
        return [], []

    playlist = self._download_legacy_playlist_url(href)
    extracted = self._extract_from_legacy_playlist(playlist, video_id)
    # Only the last two fields (formats, subtitles) of the 6-tuple are needed
    _, _, _, _, formats, subtitles = extracted
    return formats, subtitles
742
baf39a1a
S
def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
    """Build an info dict from the legacy playlist (playlist.sxml) at ``url``.

    The formats are sorted, and ``timestamp`` is passed through to the
    result unchanged.
    """
    extracted = self._process_legacy_playlist_url(url, playlist_id)
    programme_id, title, description, duration, formats, subtitles = extracted
    self._sort_formats(formats)

    info = {'id': programme_id}
    info['title'] = title
    info['description'] = description
    info['duration'] = duration
    info['timestamp'] = timestamp
    info['formats'] = formats
    info['subtitles'] = subtitles
    return info
756
10273d6e 757 def _real_extract(self, url):
9afa1770
S
758 playlist_id = self._match_id(url)
759
760 webpage = self._download_webpage(url, playlist_id)
761
522f6c06 762 json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
350e02d4 763 timestamp = json_ld_info.get('timestamp')
0e832c2c 764
350e02d4 765 playlist_title = json_ld_info.get('title')
0e832c2c
S
766 if not playlist_title:
767 playlist_title = self._og_search_title(
768 webpage, default=None) or self._html_search_regex(
769 r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
770 if playlist_title:
771 playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
772
773 playlist_description = json_ld_info.get(
774 'description') or self._og_search_description(webpage, default=None)
ae8bdfd1
S
775
776 if not timestamp:
777 timestamp = parse_iso8601(self._search_regex(
778 [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
779 r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
6f789365 780 r'"datePublished":\s*"([^"]+)'],
ae8bdfd1 781 webpage, 'date', default=None))
9afa1770 782
78f9d843
S
783 entries = []
784
de665713
S
785 # article with multiple videos embedded with playlist.sxml (e.g.
786 # http://www.bbc.com/sport/0/football/34475836)
787 playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
222e11d4 788 playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
de665713 789 if playlists:
baf39a1a
S
790 entries = [
791 self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
792 for playlist_url in playlists]
de939d89 793
78f9d843
S
794 # news article with multiple videos embedded with data-playable
795 data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
796 if data_playables:
797 for _, data_playable_json in data_playables:
798 data_playable = self._parse_json(
799 unescapeHTML(data_playable_json), playlist_id, fatal=False)
800 if not data_playable:
801 continue
baf39a1a
S
802 settings = data_playable.get('settings', {})
803 if settings:
78f9d843
S
804 # data-playable with video vpid in settings.playlistObject.items (e.g.
805 # http://www.bbc.com/news/world-us-canada-34473351)
baf39a1a
S
806 playlist_object = settings.get('playlistObject', {})
807 if playlist_object:
808 items = playlist_object.get('items')
809 if items and isinstance(items, list):
78f9d843
S
810 title = playlist_object['title']
811 description = playlist_object.get('summary')
baf39a1a
S
812 duration = int_or_none(items[0].get('duration'))
813 programme_id = items[0].get('vpid')
78f9d843
S
814 formats, subtitles = self._download_media_selector(programme_id)
815 self._sort_formats(formats)
816 entries.append({
817 'id': programme_id,
818 'title': title,
819 'description': description,
820 'timestamp': timestamp,
821 'duration': duration,
822 'formats': formats,
823 'subtitles': subtitles,
824 })
825 else:
826 # data-playable without vpid but with playlist.sxml URLs
827 # in otherSettings.playlist (e.g.
828 # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
829 playlist = data_playable.get('otherSettings', {}).get('playlist', {})
830 if playlist:
a7e5f274
RA
831 entry = None
832 for key in ('streaming', 'progressiveDownload'):
05087d1b
S
833 playlist_url = playlist.get('%sUrl' % key)
834 if not playlist_url:
835 continue
836 try:
a7e5f274
RA
837 info = self._extract_from_playlist_sxml(
838 playlist_url, playlist_id, timestamp)
839 if not entry:
840 entry = info
841 else:
842 entry['title'] = info['title']
843 entry['formats'].extend(info['formats'])
05087d1b
S
844 except Exception as e:
845 # Some playlist URL may fail with 500, at the same time
846 # the other one may work fine (e.g.
847 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
848 if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
849 continue
850 raise
a7e5f274
RA
851 if entry:
852 self._sort_formats(entry['formats'])
853 entries.append(entry)
78f9d843
S
854
855 if entries:
78f9d843
S
856 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
857
858 # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
859 programme_id = self._search_regex(
a1cf3e38 860 [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
22d7368d
S
861 r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
862 r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
78f9d843 863 webpage, 'vpid', default=None)
dab062fb 864
9afa1770
S
865 if programme_id:
866 formats, subtitles = self._download_media_selector(programme_id)
867 self._sort_formats(formats)
868 # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
869 digital_data = self._parse_json(
870 self._search_regex(
871 r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
872 programme_id, fatal=False)
873 page_info = digital_data.get('page', {}).get('pageInfo', {})
874 title = page_info.get('pageName') or self._og_search_title(webpage)
875 description = page_info.get('description') or self._og_search_description(webpage)
876 timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
877 return {
878 'id': programme_id,
879 'title': title,
880 'description': description,
881 'timestamp': timestamp,
882 'formats': formats,
883 'subtitles': subtitles,
884 }
a3bfddfa 885
9fb64c04
S
886 # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
887 # Several setPayload calls may be present, but the video
888 # always seems to be related to the first one
889 morph_payload = self._parse_json(
890 self._search_regex(
891 r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
892 webpage, 'morph payload', default='{}'),
893 playlist_id, fatal=False)
894 if morph_payload:
895 components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
896 for component in components:
897 if not isinstance(component, dict):
898 continue
899 lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
900 if not lead_media:
901 continue
902 identifiers = lead_media.get('identifiers')
903 if not identifiers or not isinstance(identifiers, dict):
904 continue
905 programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
906 if not programme_id:
907 continue
908 title = lead_media.get('title') or self._og_search_title(webpage)
909 formats, subtitles = self._download_media_selector(programme_id)
910 self._sort_formats(formats)
911 description = lead_media.get('summary')
912 uploader = lead_media.get('masterBrand')
913 uploader_id = lead_media.get('mid')
914 duration = None
915 duration_d = lead_media.get('duration')
916 if isinstance(duration_d, dict):
917 duration = parse_duration(dict_get(
918 duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
919 return {
920 'id': programme_id,
921 'title': title,
922 'description': description,
923 'duration': duration,
924 'uploader': uploader,
925 'uploader_id': uploader_id,
926 'formats': formats,
927 'subtitles': subtitles,
928 }
929
88ed52ae
S
930 def extract_all(pattern):
931 return list(filter(None, map(
932 lambda s: self._parse_json(s, playlist_id, fatal=False),
933 re.findall(pattern, webpage))))
934
935 # Multiple video article (e.g.
936 # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
22d7368d 937 EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
88ed52ae
S
938 entries = []
939 for match in extract_all(r'new\s+SMP\(({.+?})\)'):
940 embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
941 if embed_url and re.match(EMBED_URL, embed_url):
942 entries.append(embed_url)
943 entries.extend(re.findall(
944 r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
945 if entries:
946 return self.playlist_result(
aaa42cf0 947 [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
88ed52ae 948 playlist_id, playlist_title, playlist_description)
9afa1770
S
949
950 # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
88ed52ae 951 medias = extract_all(r"data-media-meta='({[^']+})'")
9afa1770
S
952
953 if not medias:
954 # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
a346b1ff
S
955 media_asset = self._search_regex(
956 r'mediaAssetPage\.init\(\s*({.+?}), "/',
957 webpage, 'media asset', default=None)
958 if media_asset:
959 media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
960 medias = []
961 for video in media_asset_page.get('videos', {}).values():
962 medias.extend(video.values())
963
964 if not medias:
965 # Multiple video playlist with single `now playing` entry (e.g.
966 # http://www.bbc.com/news/video_and_audio/must_see/33767813)
967 vxp_playlist = self._parse_json(
9afa1770 968 self._search_regex(
a346b1ff
S
969 r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
970 webpage, 'playlist data'),
9afa1770 971 playlist_id)
a346b1ff
S
972 playlist_medias = []
973 for item in vxp_playlist:
974 media = item.get('media')
975 if not media:
976 continue
977 playlist_medias.append(media)
978 # Download single video if found media with asset id matching the video id from URL
979 if item.get('advert', {}).get('assetId') == playlist_id:
980 medias = [media]
981 break
982 # Fallback to the whole playlist
983 if not medias:
984 medias = playlist_medias
9afa1770
S
985
986 entries = []
987 for num, media_meta in enumerate(medias, start=1):
988 formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
989 if not formats:
990 continue
10273d6e 991 self._sort_formats(formats)
992
9afa1770
S
993 video_id = media_meta.get('externalId')
994 if not video_id:
995 video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
996
997 title = media_meta.get('caption')
998 if not title:
999 title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
1000
1001 duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
da92eeae 1002
9afa1770
S
1003 images = []
1004 for image in media_meta.get('images', {}).values():
1005 images.extend(image.values())
1006 if 'image' in media_meta:
1007 images.append(media_meta['image'])
1008
1009 thumbnails = [{
1010 'url': image.get('href'),
1011 'width': int_or_none(image.get('width')),
1012 'height': int_or_none(image.get('height')),
1013 } for image in images]
1014
1015 entries.append({
1016 'id': video_id,
10273d6e 1017 'title': title,
9afa1770 1018 'thumbnails': thumbnails,
10273d6e 1019 'duration': duration,
9afa1770 1020 'timestamp': timestamp,
10273d6e 1021 'formats': formats,
1022 'subtitles': subtitles,
a3bfddfa 1023 })
10273d6e 1024
9afa1770 1025 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1026
1027
class BBCCoUkArticleIE(InfoExtractor):
    """Turn a bbc.co.uk programme article into a playlist of its embedded clips."""

    # The dots are escaped so that only the literal bbc.co.uk host matches
    # (an unescaped "." matches any character), and the "www." prefix is
    # optional, consistent with the other bbc.co.uk extractors in this file.
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Download the article page and return a playlist of its clips.

        The playlist id is the article id captured by ``_VALID_URL``; title
        and description come from the page's Open Graph metadata, and every
        ``<div typeof="Clip" resource="...">`` becomes one URL entry.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # Guard against a missing og:description: the lookup may yield None
        # (presumably non-fatal in InfoExtractor - confirm), in which case
        # calling .strip() on it would raise AttributeError.
        description = self._og_search_description(webpage)
        if description:
            description = description.strip()

        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1056
1057
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Common playlist extraction for bbc.co.uk listing pages.

    The method below reads ``self._URL_TEMPLATE`` and
    ``self._VIDEO_ID_TEMPLATE`` and calls
    ``self._extract_title_and_description``; none of them is defined here,
    so concrete subclasses have to supply them.
    """

    def _real_extract(self, url):
        """Return a playlist with one URL entry per video id found on the page."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        # Plug the shared programme id regex into the subclass' id pattern.
        video_id_pattern = self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX

        entries = []
        for video_id in re.findall(video_id_pattern, webpage):
            video_url = self._URL_TEMPLATE % video_id
            entries.append(self.url_result(video_url, BBCCoUkIE.ie_key()))

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(entries, playlist_id, title, description)
1072
1073
class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for iPlayer ``episodes`` and ``group`` pages."""

    IE_NAME = 'bbc.co.uk:iplayer:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
    # Each matched id is substituted into this URL to build a playlist entry.
    _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
    # The %s placeholder is filled with BBCCoUkIE._ID_REGEX by the base class.
    _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 6,
        'skip': 'This programme is not currently available on BBC iPlayer',
    }, {
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` scraped from the page markup.

        The title is the text of the first ``<h1>`` and the description the
        text of the ``<p class="subtitle">`` element; both lookups are
        non-fatal.
        """
        heading = self._search_regex(
            r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
        subtitle = self._search_regex(
            r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
            webpage, 'description', fatal=False, group='value')
        return heading, subtitle
1105
1106
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for programme ``episodes``/``broadcasts``/``clips`` pages."""

    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Each matched id is substituted into this URL to build a playlist entry.
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    # The %s placeholder is filled with BBCCoUkIE._ID_REGEX by the base class.
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` taken from the Open Graph metadata."""
        og_title = self._og_search_title(webpage, fatal=False)
        og_description = self._og_search_description(webpage)
        return og_title, og_description