]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/bbc.py
[bbc] Add support for morph embeds (Closes #10239)
[yt-dlp.git] / youtube_dl / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
9afa1770 4import re
082c6c86 5
f13b1e7d 6from .common import InfoExtractor
8683b4d8 7from ..utils import (
9fb64c04 8 dict_get,
8683b4d8 9 ExtractorError,
9afa1770 10 float_or_none,
8683b4d8 11 int_or_none,
9afa1770
S
12 parse_duration,
13 parse_iso8601,
9fb64c04 14 try_get,
dab062fb 15 unescapeHTML,
8683b4d8 16)
36e6f62c
JMF
17from ..compat import (
18 compat_etree_fromstring,
19 compat_HTTPError,
20)
082c6c86 21
d12a1a47 22
f13b1e7d 23class BBCCoUkIE(InfoExtractor):
082c6c86 24 IE_NAME = 'bbc.co.uk'
2e3fd9ec 25 IE_DESC = 'BBC iPlayer'
22d7368d 26 _ID_REGEX = r'[pb][\da-z]{7}'
f20a11ed
S
27 _VALID_URL = r'''(?x)
28 https?://
29 (?:www\.)?bbc\.co\.uk/
30 (?:
31 programmes/(?!articles/)|
32 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
33 music/clips[/#]|
34 radio/player/
35 )
ded7511a 36 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
f20a11ed 37 ''' % _ID_REGEX
082c6c86 38
d12a1a47 39 _MEDIASELECTOR_URLS = [
26ccc68b
S
40 # Provides HQ HLS streams with even better quality than pc mediaset but fails
41 # with geolocation in some cases when it's even not geo restricted at all (e.g.
d781e293 42 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
d1c694ea 43 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
44 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
45 ]
a8b081a0 46
e6174ee9
S
47 _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
48 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
49
50 _NAMESPACES = (
51 _MEDIASELECTION_NS,
52 _EMP_PLAYLIST_NS,
53 )
54
2e3fd9ec
S
55 _TESTS = [
56 {
f2d0fc68 57 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 58 'info_dict': {
f2d0fc68 59 'id': 'b039d07m',
b1ea6802 60 'ext': 'flv',
679bacf0 61 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4',
c4914185 62 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
63 },
64 'params': {
b1ea6802 65 # rtmp download
2e3fd9ec
S
66 'skip_download': True,
67 }
082c6c86 68 },
2e3fd9ec
S
69 {
70 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
71 'info_dict': {
72 'id': 'b00yng1d',
73 'ext': 'flv',
74 'title': 'The Man in Black: Series 3: The Printed Name',
75 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
76 'duration': 1800,
77 },
78 'params': {
79 # rtmp download
80 'skip_download': True,
c7f0177f
S
81 },
82 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
83 },
84 {
85 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
86 'info_dict': {
87 'id': 'b00yng1d',
88 'ext': 'flv',
17968e44 89 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 90 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 91 'duration': 5100,
2e3fd9ec
S
92 },
93 'params': {
94 # rtmp download
95 'skip_download': True,
96 },
b1ea6802 97 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
98 },
99 {
100 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
101 'info_dict': {
102 'id': 'b03k3pb7',
103 'ext': 'flv',
104 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
105 'description': '2. Invasion',
106 'duration': 3600,
107 },
108 'params': {
109 # rtmp download
110 'skip_download': True,
111 },
b1ea6802 112 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
113 }, {
114 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
115 'info_dict': {
116 'id': 'b04v209v',
117 'ext': 'flv',
118 'title': 'Pete Tong, The Essential New Tune Special',
119 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
120 'duration': 10800,
121 },
122 'params': {
123 # rtmp download
124 'skip_download': True,
a3ef0e1c
YCH
125 },
126 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 127 }, {
5aa535c3 128 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
129 'note': 'Audio',
130 'info_dict': {
5aa535c3 131 'id': 'p022h44j',
b1ea6802 132 'ext': 'flv',
5aa535c3
S
133 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
134 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
135 'duration': 227,
c7e67594
S
136 },
137 'params': {
b1ea6802 138 # rtmp download
c7e67594
S
139 'skip_download': True,
140 }
141 }, {
142 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
143 'note': 'Video',
144 'info_dict': {
145 'id': 'p025c103',
b1ea6802 146 'ext': 'flv',
c7e67594
S
147 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
148 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
149 'duration': 226,
150 },
151 'params': {
b1ea6802 152 # rtmp download
c7e67594
S
153 'skip_download': True,
154 }
e68ae99a
S
155 }, {
156 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
157 'info_dict': {
158 'id': 'p02n76xf',
159 'ext': 'flv',
160 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
161 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
162 'duration': 3540,
163 },
164 'params': {
165 # rtmp download
166 'skip_download': True,
167 },
b1ea6802 168 'skip': 'geolocation',
25fa8d66
YCH
169 }, {
170 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
171 'info_dict': {
172 'id': 'b05zmgw1',
173 'ext': 'flv',
174 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
175 'title': 'Royal Academy Summer Exhibition',
176 'duration': 3540,
177 },
178 'params': {
179 # rtmp download
180 'skip_download': True,
181 },
b1ea6802 182 'skip': 'geolocation',
54914380
S
183 }, {
184 # iptv-all mediaset fails with geolocation however there is no geo restriction
185 # for this programme at all
5aa535c3 186 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 187 'info_dict': {
5aa535c3 188 'id': 'b06rkms3',
54914380 189 'ext': 'flv',
5aa535c3
S
190 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
191 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
192 },
193 'params': {
194 # rtmp download
195 'skip_download': True,
196 },
b1ea6802 197 'skip': 'Now it\'s really geo-restricted',
1ac6e794
S
198 }, {
199 # compact player (https://github.com/rg3/youtube-dl/issues/8147)
200 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
201 'info_dict': {
202 'id': 'p028bfkj',
b1ea6802 203 'ext': 'flv',
1ac6e794
S
204 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
205 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
206 },
207 'params': {
b1ea6802 208 # rtmp download
1ac6e794
S
209 'skip_download': True,
210 },
31763975
S
211 }, {
212 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
213 'only_matching': True,
c7e67594
S
214 }, {
215 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
216 'only_matching': True,
0692ef86
S
217 }, {
218 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
219 'only_matching': True,
f20a11ed
S
220 }, {
221 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
222 'only_matching': True,
ae6986fb 223 }
2e3fd9ec
S
224 ]
225
d12a1a47
S
class MediaSelectionError(Exception):
    """Signals that a media selection document contained an error element.

    The identifier handed to the constructor is exposed as ``id``.
    """

    def __init__(self, id):
        self.id = id
229
2e3fd9ec
S
def _extract_asx_playlist(self, connection, programme_id):
    """Download the ASX playlist a connection points to and list its refs.

    The playlist URL is the connection's ``href``.  Returns the ``href``
    attribute of every ``Entry/ref`` element, in document order.
    """
    playlist_url = connection.get('href')
    asx = self._download_xml(playlist_url, programme_id, 'Downloading ASX playlist')
    hrefs = []
    for ref in asx.findall('./Entry/ref'):
        hrefs.append(ref.get('href'))
    return hrefs
233
def _extract_items(self, playlist):
    """Return the direct ``item`` children of *playlist* (EMP playlist namespace)."""
    item_path = './{%s}item' % self._EMP_PLAYLIST_NS
    return playlist.findall(item_path)
236
def _findall_ns(self, element, xpath):
    """Collect matches of *xpath* under *element* for every known namespace.

    *xpath* carries one ``%s`` placeholder that is filled with each entry
    of ``_NAMESPACES`` in turn; the per-namespace results are concatenated
    in that order.
    """
    return [
        match
        for ns in self._NAMESPACES
        for match in element.findall(xpath % ns)]
2e3fd9ec
S
242
def _extract_medias(self, media_selection):
    """Return the ``media`` children of a media selection document.

    An ``error`` child is looked up first in the media selection namespace
    and then in the EMP playlist namespace.

    Raises:
        MediaSelectionError: if an ``error`` element is found in either
            namespace; it carries the element's ``id`` attribute.
    """
    error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
    if error is None:
        # Fall back to the EMP playlist namespace.  The result has to be
        # kept, otherwise errors reported there would go unnoticed.
        error = media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
    if error is not None:
        raise self.MediaSelectionError(error.get('id'))
    return self._findall_ns(media_selection, './{%s}media')
2e3fd9ec
S
250
def _extract_connections(self, media):
    """Return the ``connection`` children of *media* across all known namespaces."""
    connection_xpath = './{%s}connection'
    return self._findall_ns(media, connection_xpath)
2e3fd9ec 253
def _get_subtitles(self, media, programme_id):
    """Build the subtitles mapping for a captions *media* element.

    Each connection's captions document is downloaded to read its
    ``xml:lang`` attribute (``'en'`` when absent).  Returns a dict mapping
    each language to a one-element list with the TTML URL; a later
    connection with the same language replaces the earlier entry.
    """
    xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'
    by_language = {}
    for connection in self._extract_connections(media):
        captions_url = connection.get('href')
        captions = self._download_xml(captions_url, programme_id, 'Downloading captions')
        language = captions.get(xml_lang, 'en')
        by_language[language] = [{'url': captions_url, 'ext': 'ttml'}]
    return by_language
082c6c86 266
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Raise an expected ``ExtractorError`` for a media selection failure.

    The message names the extractor (``IE_NAME``) and the error's ``id``.
    """
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
271
def _download_media_selector(self, programme_id):
    """Query each media selector endpoint in turn and return the first result.

    A ``MediaSelectionError`` whose id is ``notukerror``, ``geolocation``
    or ``selectionunavailable`` makes the next endpoint be tried; any other
    id is reported immediately.  When every endpoint fails that way, the
    last such error is reported.
    """
    retryable_ids = ('notukerror', 'geolocation', 'selectionunavailable')
    last_exception = None
    for url_template in self._MEDIASELECTOR_URLS:
        try:
            return self._download_media_selector_url(
                url_template % programme_id, programme_id)
        except BBCCoUkIE.MediaSelectionError as e:
            if e.id in retryable_ids:
                last_exception = e
            else:
                self._raise_extractor_error(e)
    self._raise_extractor_error(last_exception)
9afa1770
S
284
def _download_media_selector_url(self, url, programme_id=None):
    """Fetch one media selector URL and process the returned document.

    When the download fails with an HTTP 403 or 404, the body of that error
    response is parsed as the media selection XML instead; any other
    ``ExtractorError`` propagates unchanged.
    """
    try:
        media_selection = self._download_xml(
            url, programme_id, 'Downloading media selection XML')
    except ExtractorError as ee:
        body_is_usable = (
            isinstance(ee.cause, compat_HTTPError)
            and ee.cause.code in (403, 404))
        if not body_is_usable:
            raise
        media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))
    return self._process_media_selector(media_selection, programme_id)
082c6c86 295
def _process_media_selector(self, media_selection, programme_id):
    """Turn a media selection document into formats and subtitles.

    ``video`` and ``audio`` media contribute formats, one or more per
    connection depending on the connection type (ASX playlist, DASH, HLS,
    HDS, or a plain HTTP/RTMP connection).  ``captions`` media are passed
    to ``extract_subtitles``.  Other kinds are ignored.

    Returns a ``(formats, subtitles)`` tuple; ``subtitles`` stays ``None``
    when the document has no captions media.
    """
    formats = []
    subtitles = None
    seen_hrefs = []

    for media in self._extract_medias(media_selection):
        kind = media.get('kind')
        if kind == 'captions':
            subtitles = self.extract_subtitles(media, programme_id)
            continue
        if kind not in ('video', 'audio'):
            continue

        bitrate = int_or_none(media.get('bitrate'))
        encoding = media.get('encoding')
        service = media.get('service')
        width = int_or_none(media.get('width'))
        height = int_or_none(media.get('height'))
        file_size = int_or_none(media.get('media_file_size'))

        for connection in self._extract_connections(media):
            href = connection.get('href')
            # A URL that was already handled is not processed twice.
            if href in seen_hrefs:
                continue
            if href:
                seen_hrefs.append(href)

            conn_kind = connection.get('kind')
            protocol = connection.get('protocol')
            supplier = connection.get('supplier')
            transfer_format = connection.get('transferFormat')

            format_id = supplier or conn_kind or protocol
            if service:
                format_id = '%s_%s' % (service, format_id)

            if supplier == 'asx':
                # ASX playlist: one format per referenced URL
                refs = self._extract_asx_playlist(connection, programme_id)
                formats.extend({
                    'url': ref,
                    'format_id': 'ref%s_%s' % (i, format_id),
                } for i, ref in enumerate(refs))
                continue
            if transfer_format == 'dash':
                formats.extend(self._extract_mpd_formats(
                    href, programme_id, mpd_id=format_id, fatal=False))
                continue
            if transfer_format == 'hls':
                formats.extend(self._extract_m3u8_formats(
                    href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                    m3u8_id=format_id, fatal=False))
                continue
            if transfer_format == 'hds':
                formats.extend(self._extract_f4m_formats(
                    href, programme_id, f4m_id=format_id, fatal=False))
                continue

            # Plain connection: describe a single format by hand.
            if not (service or supplier) and bitrate:
                format_id += '-%d' % bitrate
            fmt = {
                'format_id': format_id,
                'filesize': file_size,
            }
            if kind == 'video':
                fmt['width'] = width
                fmt['height'] = height
                fmt['vbr'] = bitrate
                fmt['vcodec'] = encoding
            else:
                fmt['abr'] = bitrate
                fmt['acodec'] = encoding
                fmt['vcodec'] = 'none'
            if protocol == 'http':
                # Direct link
                fmt['url'] = href
            elif protocol == 'rtmp':
                application = connection.get('application', 'ondemand')
                auth_string = connection.get('authString')
                identifier = connection.get('identifier')
                server = connection.get('server')
                fmt.update({
                    'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                    'play_path': identifier,
                    'app': '%s?%s' % (application, auth_string),
                    'page_url': 'http://www.bbc.co.uk',
                    'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                    'rtmp_live': False,
                    'ext': 'flv',
                })
            formats.append(fmt)

    return formats, subtitles
2e3fd9ec 383
ae6986fb
S
def _download_playlist(self, playlist_id):
    """Resolve a programme through its ``playlist.json`` document.

    When the JSON has a ``defaultAvailableVersion``, its ``smpConfig``
    items of kind ``programme`` or ``radioProgramme`` are run through the
    media selector and a
    ``(programme_id, title, description, duration, formats, subtitles)``
    tuple is returned.  When there is no such version, or the request
    failed with an HTTP 404, the legacy XML playlist is used instead.
    Any other ``ExtractorError`` propagates.
    """
    playlist_json_url = 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id
    try:
        playlist = self._download_json(
            playlist_json_url, playlist_id, 'Downloading playlist JSON')

        version = playlist.get('defaultAvailableVersion')
        if version:
            smp_config = version['smpConfig']
            title = smp_config['title']
            description = smp_config['summary']
            for item in smp_config['items']:
                if item['kind'] not in ('programme', 'radioProgramme'):
                    continue
                programme_id = item.get('vpid')
                duration = int_or_none(item.get('duration'))
                formats, subtitles = self._download_media_selector(programme_id)
            return programme_id, title, description, duration, formats, subtitles
    except ExtractorError as ee:
        not_found = isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404
        if not not_found:
            raise

    # fallback to legacy playlist
    return self._process_legacy_playlist(playlist_id)
409
def _process_legacy_playlist_url(self, url, display_id):
    """Download the legacy playlist at *url* and extract its programme data."""
    return self._extract_from_legacy_playlist(
        self._download_legacy_playlist_url(url, display_id), display_id)
413
def _process_legacy_playlist(self, playlist_id):
    """Process the legacy iPlayer playlist that belongs to *playlist_id*."""
    legacy_url = 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id
    return self._process_legacy_playlist_url(legacy_url, playlist_id)
417
def _download_legacy_playlist_url(self, url, playlist_id=None):
    """Download and return the legacy playlist XML found at *url*."""
    note = 'Downloading legacy playlist XML'
    return self._download_xml(url, playlist_id, note)
ae6986fb 421
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme data from a legacy (EMP) playlist document.

    Raises an expected ``ExtractorError`` when the playlist has a
    ``noItems`` element, with a message chosen by its ``reason``.
    Otherwise the items of kind ``programme`` or ``radioProgramme`` are
    processed: when a programme id is found the media selector is queried,
    else the item itself is treated as the media selection and
    *playlist_id* is used as the id.

    Returns ``(programme_id, title, description, duration, formats,
    subtitles)``.
    """
    emp_ns = self._EMP_PLAYLIST_NS

    no_items = playlist.find('./{%s}noItems' % emp_ns)
    if no_items is not None:
        reason = no_items.get('reason')
        known_reasons = {
            'preAvailability': 'Episode %s is not yet available',
            'postAvailability': 'Episode %s is no longer available',
            'noMedia': 'Episode %s is not currently available',
        }
        template = known_reasons.get(reason)
        if template is not None:
            msg = template % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First of ``identifier``/``group`` that looks like a programme id.
        for attr in ('identifier', 'group'):
            value = element.get(attr)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value

    def get_programme_id(item):
        # NOTE(review): the value found on the item itself is discarded, so
        # only the mediator's attributes decide the result - confirm whether
        # that is intended.
        get_from_attributes(item)
        mediator = item.find('./{%s}mediator' % emp_ns)
        if mediator is None:
            return None
        return get_from_attributes(mediator)

    for item in self._extract_items(playlist):
        if item.get('kind') not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % emp_ns).text
        description_el = playlist.find('./{%s}summary' % emp_ns)
        description = None if description_el is None else description_el.text

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No programme id: the item is itself the media selection.
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

    return programme_id, title, description, duration, formats, subtitles
465
c056efa2
S
def _real_extract(self, url):
    """Extract a single programme from a bbc.co.uk programme/iPlayer page.

    The version pid is taken from the page's ``mediator.bind(...)`` player
    JSON or, failing that, from a ``"vpid"`` entry in the page.  With a
    vpid, the media selector supplies formats and subtitles, and title and
    description are scraped from the page.  Without one, everything comes
    from the playlist of the id matched in the URL.
    """
    group_id = self._match_id(url)

    webpage = self._download_webpage(url, group_id, 'Downloading video page')

    programme_id = None
    duration = None

    tviplayer = self._search_regex(
        r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
        webpage, 'player', default=None)
    if tviplayer:
        player = self._parse_json(tviplayer, group_id).get('player', {})
        duration = int_or_none(player.get('duration'))
        programme_id = player.get('vpid')

    programme_id = programme_id or self._search_regex(
        r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)

    if not programme_id:
        (programme_id, title, description,
         duration, formats, subtitles) = self._download_playlist(group_id)
    else:
        formats, subtitles = self._download_media_selector(programme_id)
        title = self._og_search_title(webpage, default=None)
        if not title:
            title = self._html_search_regex(
                (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
                 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
        description = self._search_regex(
            (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
             r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
            webpage, 'description', default=None)
        if not description:
            description = self._html_search_meta('description', webpage)

    self._sort_formats(formats)

    thumbnail = self._og_search_thumbnail(webpage, default=None)
    return {
        'id': programme_id,
        'title': title,
        'description': description,
        'thumbnail': thumbnail,
        'duration': duration,
        'formats': formats,
        'subtitles': subtitles,
    }
10273d6e 512
513
9afa1770
S
514class BBCIE(BBCCoUkIE):
515 IE_NAME = 'bbc'
516 IE_DESC = 'BBC'
517 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 518
d12a1a47 519 _MEDIASELECTOR_URLS = [
55ebae26
S
520 # Provides HQ HLS streams but fails with geolocation in some cases when it's
521 # even not geo restricted at all
522 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
523 # Provides more formats, namely direct mp4 links, but fails on some videos with
524 # notukerror for non UK (?) users (e.g.
525 # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
526 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
527 # Provides fewer formats, but works everywhere for everybody (hopefully)
528 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
529 ]
10273d6e 530
531 _TESTS = [{
6a747190 532 # article with multiple videos embedded with data-playable containing vpids
10273d6e 533 'url': 'http://www.bbc.com/news/world-europe-32668511',
534 'info_dict': {
535 'id': 'world-europe-32668511',
536 'title': 'Russia stages massive WW2 parade despite Western boycott',
9afa1770 537 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 538 },
539 'playlist_count': 2,
a3bfddfa 540 }, {
6a747190 541 # article with multiple videos embedded with data-playable (more videos)
10273d6e 542 'url': 'http://www.bbc.com/news/business-28299555',
543 'info_dict': {
544 'id': 'business-28299555',
545 'title': 'Farnborough Airshow: Video highlights',
9afa1770 546 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 547 },
548 'playlist_count': 9,
9afa1770 549 'skip': 'Save time',
88ed52ae
S
550 }, {
551 # article with multiple videos embedded with `new SMP()`
6a747190 552 # broken
88ed52ae
S
553 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
554 'info_dict': {
555 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 556 'title': 'BUGGER',
88ed52ae
S
557 },
558 'playlist_count': 18,
a3bfddfa 559 }, {
6a747190 560 # single video embedded with data-playable containing vpid
10273d6e 561 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 562 'info_dict': {
563 'id': 'p02mprgb',
55ebae26 564 'ext': 'mp4',
10273d6e 565 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 566 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 567 'duration': 47,
9afa1770 568 'timestamp': 1427219242,
da92eeae 569 'upload_date': '20150324',
10273d6e 570 },
571 'params': {
9afa1770 572 # rtmp download
10273d6e 573 'skip_download': True,
574 }
a3bfddfa 575 }, {
6a747190
S
576 # article with single video embedded with data-playable containing XML playlist
577 # with direct video links as progressiveDownloadUrl (for now these are extracted)
578 # and playlist with f4m and m3u8 as streamingUrl
de939d89 579 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 580 'info_dict': {
9afa1770 581 'id': '150615_telabyad_kentin_cogu',
de939d89 582 'ext': 'mp4',
ad152e2d 583 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 584 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 585 'timestamp': 1434397334,
da92eeae 586 'upload_date': '20150615',
de939d89 587 },
588 'params': {
589 'skip_download': True,
590 }
c936d8cc 591 }, {
6a747190 592 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 593 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 594 'info_dict': {
9afa1770 595 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 596 'ext': 'mp4',
9afa1770 597 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 598 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 599 'timestamp': 1434713142,
da92eeae 600 'upload_date': '20150619',
de939d89 601 },
602 'params': {
603 'skip_download': True,
604 }
a346b1ff
S
605 }, {
606 # single video from video playlist embedded with vxp-playlist-data JSON
607 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
608 'info_dict': {
609 'id': 'p02w6qjc',
55ebae26 610 'ext': 'mp4',
a346b1ff
S
611 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
612 'duration': 56,
0bc4ee60 613 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
614 },
615 'params': {
616 'skip_download': True,
617 }
9afa1770
S
618 }, {
619 # single video story with digitalData
620 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
621 'info_dict': {
622 'id': 'p02q6gc4',
623 'ext': 'flv',
624 'title': 'Sri Lanka’s spicy secret',
625 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
626 'timestamp': 1437674293,
627 'upload_date': '20150723',
628 },
629 'params': {
630 # rtmp download
631 'skip_download': True,
632 }
633 }, {
634 # single video story without digitalData
635 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
636 'info_dict': {
637 'id': 'p018zqqg',
55ebae26 638 'ext': 'mp4',
9afa1770
S
639 'title': 'Hyundai Santa Fe Sport: Rock star',
640 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
641 'timestamp': 1415867444,
642 'upload_date': '20141113',
9afa1770
S
643 },
644 'params': {
645 # rtmp download
646 'skip_download': True,
647 }
9fb64c04
S
648 }, {
649 # single video embedded with Morph
650 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
651 'info_dict': {
652 'id': 'p041vhd0',
653 'ext': 'mp4',
654 'title': "Nigeria v Japan - Men's First Round",
655 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
656 'duration': 7980,
657 'uploader': 'BBC Sport',
658 'uploader_id': 'bbc_sport',
659 },
660 'params': {
661 # m3u8 download
662 'skip_download': True,
663 'proxy': '5.101.173.158:8080',
664 },
665 'skip': 'Georestricted to UK',
9afa1770 666 }, {
6a747190 667 # single video with playlist.sxml URL in playlist param
9afa1770
S
668 'url': 'http://www.bbc.com/sport/0/football/33653409',
669 'info_dict': {
670 'id': 'p02xycnp',
55ebae26 671 'ext': 'mp4',
9afa1770 672 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 673 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
674 'duration': 140,
675 },
676 'params': {
677 # rtmp download
678 'skip_download': True,
679 }
b5d48cb1 680 }, {
6a747190 681 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
682 'url': 'http://www.bbc.com/sport/0/football/34475836',
683 'info_dict': {
684 'id': '34475836',
450b233c 685 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 686 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
687 },
688 'playlist_count': 3,
450b233c
S
689 }, {
690 # school report article with single video
691 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
692 'info_dict': {
693 'id': '35744779',
694 'title': 'School which breaks down barriers in Jerusalem',
695 },
696 'playlist_count': 1,
9afa1770
S
697 }, {
698 # single video with playlist URL from weather section
699 'url': 'http://www.bbc.com/weather/features/33601775',
700 'only_matching': True,
701 }, {
702 # custom redirection to www.bbc.com
703 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
704 'only_matching': True,
a1cf3e38
S
705 }, {
706 # single video article embedded with data-media-vpid
707 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
708 'only_matching': True,
10273d6e 709 }]
710
9afa1770
S
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    URLs accepted by one of the more specific bbc.co.uk extractors are
    rejected here; everything else is decided by the inherited check.
    """
    excluded_extractors = (
        BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
    for ie in excluded_extractors:
        if ie.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
716
def _extract_from_media_meta(self, media_meta, video_id):
    """Derive ``(formats, subtitles)`` from a media metadata dict.

    Sources are tried in this order: direct links in ``sourceFiles``, the
    media selector for ``externalId``, then the legacy playlist referenced
    by ``href``.  Returns ``([], [])`` when none of them is present.
    """
    # Direct links to media in media metadata (e.g.
    # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
    # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
    source_files = media_meta.get('sourceFiles')
    if source_files:
        direct_formats = []
        for format_id, f in source_files.items():
            if not f.get('url'):
                continue
            direct_formats.append({
                'url': f['url'],
                'format_id': format_id,
                'ext': f.get('encoding'),
                'tbr': float_or_none(f.get('bitrate'), 1000),
                'filesize': int_or_none(f.get('filesize')),
            })
        return direct_formats, []

    programme_id = media_meta.get('externalId')
    if programme_id:
        return self._download_media_selector(programme_id)

    # Process playlist.sxml as legacy playlist
    href = media_meta.get('href')
    if not href:
        return [], []
    playlist = self._download_legacy_playlist_url(href)
    extracted = self._extract_from_legacy_playlist(playlist, video_id)
    _, _, _, _, formats, subtitles = extracted
    return formats, subtitles
743
baf39a1a
S
def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
    """Build an info dict from the legacy playlist (playlist.sxml) at *url*.

    The formats are sorted before being returned; *timestamp* is passed
    through unchanged.
    """
    extracted = self._process_legacy_playlist_url(url, playlist_id)
    programme_id, title, description, duration, formats, subtitles = extracted
    self._sort_formats(formats)
    info = {
        'id': programme_id,
        'title': title,
        'description': description,
        'duration': duration,
        'timestamp': timestamp,
        'formats': formats,
        'subtitles': subtitles,
    }
    return info
757
10273d6e 758 def _real_extract(self, url):
9afa1770
S
759 playlist_id = self._match_id(url)
760
761 webpage = self._download_webpage(url, playlist_id)
762
350e02d4
YCH
763 json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)
764 timestamp = json_ld_info.get('timestamp')
0e832c2c 765
350e02d4 766 playlist_title = json_ld_info.get('title')
0e832c2c
S
767 if not playlist_title:
768 playlist_title = self._og_search_title(
769 webpage, default=None) or self._html_search_regex(
770 r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
771 if playlist_title:
772 playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
773
774 playlist_description = json_ld_info.get(
775 'description') or self._og_search_description(webpage, default=None)
ae8bdfd1
S
776
777 if not timestamp:
778 timestamp = parse_iso8601(self._search_regex(
779 [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
780 r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
6f789365 781 r'"datePublished":\s*"([^"]+)'],
ae8bdfd1 782 webpage, 'date', default=None))
9afa1770 783
78f9d843
S
784 entries = []
785
de665713
S
786 # article with multiple videos embedded with playlist.sxml (e.g.
787 # http://www.bbc.com/sport/0/football/34475836)
788 playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
222e11d4 789 playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
de665713 790 if playlists:
baf39a1a
S
791 entries = [
792 self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
793 for playlist_url in playlists]
de939d89 794
78f9d843
S
795 # news article with multiple videos embedded with data-playable
796 data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
797 if data_playables:
798 for _, data_playable_json in data_playables:
799 data_playable = self._parse_json(
800 unescapeHTML(data_playable_json), playlist_id, fatal=False)
801 if not data_playable:
802 continue
baf39a1a
S
803 settings = data_playable.get('settings', {})
804 if settings:
78f9d843
S
805 # data-playable with video vpid in settings.playlistObject.items (e.g.
806 # http://www.bbc.com/news/world-us-canada-34473351)
baf39a1a
S
807 playlist_object = settings.get('playlistObject', {})
808 if playlist_object:
809 items = playlist_object.get('items')
810 if items and isinstance(items, list):
78f9d843
S
811 title = playlist_object['title']
812 description = playlist_object.get('summary')
baf39a1a
S
813 duration = int_or_none(items[0].get('duration'))
814 programme_id = items[0].get('vpid')
78f9d843
S
815 formats, subtitles = self._download_media_selector(programme_id)
816 self._sort_formats(formats)
817 entries.append({
818 'id': programme_id,
819 'title': title,
820 'description': description,
821 'timestamp': timestamp,
822 'duration': duration,
823 'formats': formats,
824 'subtitles': subtitles,
825 })
826 else:
827 # data-playable without vpid but with playlist.sxml URLs
828 # in otherSettings.playlist (e.g.
829 # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
830 playlist = data_playable.get('otherSettings', {}).get('playlist', {})
831 if playlist:
a7e5f274
RA
832 entry = None
833 for key in ('streaming', 'progressiveDownload'):
05087d1b
S
834 playlist_url = playlist.get('%sUrl' % key)
835 if not playlist_url:
836 continue
837 try:
a7e5f274
RA
838 info = self._extract_from_playlist_sxml(
839 playlist_url, playlist_id, timestamp)
840 if not entry:
841 entry = info
842 else:
843 entry['title'] = info['title']
844 entry['formats'].extend(info['formats'])
05087d1b
S
845 except Exception as e:
846 # Some playlist URL may fail with 500, at the same time
847 # the other one may work fine (e.g.
848 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
849 if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
850 continue
851 raise
a7e5f274
RA
852 if entry:
853 self._sort_formats(entry['formats'])
854 entries.append(entry)
78f9d843
S
855
856 if entries:
78f9d843
S
857 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
858
859 # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
860 programme_id = self._search_regex(
a1cf3e38 861 [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
22d7368d
S
862 r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
863 r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
78f9d843 864 webpage, 'vpid', default=None)
dab062fb 865
9afa1770
S
866 if programme_id:
867 formats, subtitles = self._download_media_selector(programme_id)
868 self._sort_formats(formats)
869 # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
870 digital_data = self._parse_json(
871 self._search_regex(
872 r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
873 programme_id, fatal=False)
874 page_info = digital_data.get('page', {}).get('pageInfo', {})
875 title = page_info.get('pageName') or self._og_search_title(webpage)
876 description = page_info.get('description') or self._og_search_description(webpage)
877 timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
878 return {
879 'id': programme_id,
880 'title': title,
881 'description': description,
882 'timestamp': timestamp,
883 'formats': formats,
884 'subtitles': subtitles,
885 }
a3bfddfa 886
9fb64c04
S
887 # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
888 # Several setPayload calls may be present, but the video
889 # always seems to be related to the first one
890 morph_payload = self._parse_json(
891 self._search_regex(
892 r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
893 webpage, 'morph payload', default='{}'),
894 playlist_id, fatal=False)
895 if morph_payload:
896 components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
897 for component in components:
898 if not isinstance(component, dict):
899 continue
900 lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
901 if not lead_media:
902 continue
903 identifiers = lead_media.get('identifiers')
904 if not identifiers or not isinstance(identifiers, dict):
905 continue
906 programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
907 if not programme_id:
908 continue
909 title = lead_media.get('title') or self._og_search_title(webpage)
910 formats, subtitles = self._download_media_selector(programme_id)
911 self._sort_formats(formats)
912 description = lead_media.get('summary')
913 uploader = lead_media.get('masterBrand')
914 uploader_id = lead_media.get('mid')
915 duration = None
916 duration_d = lead_media.get('duration')
917 if isinstance(duration_d, dict):
918 duration = parse_duration(dict_get(
919 duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
920 return {
921 'id': programme_id,
922 'title': title,
923 'description': description,
924 'duration': duration,
925 'uploader': uploader,
926 'uploader_id': uploader_id,
927 'formats': formats,
928 'subtitles': subtitles,
929 }
930
88ed52ae
S
931 def extract_all(pattern):
932 return list(filter(None, map(
933 lambda s: self._parse_json(s, playlist_id, fatal=False),
934 re.findall(pattern, webpage))))
935
936 # Multiple video article (e.g.
937 # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
22d7368d 938 EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
88ed52ae
S
939 entries = []
940 for match in extract_all(r'new\s+SMP\(({.+?})\)'):
941 embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
942 if embed_url and re.match(EMBED_URL, embed_url):
943 entries.append(embed_url)
944 entries.extend(re.findall(
945 r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
946 if entries:
947 return self.playlist_result(
948 [self.url_result(entry, 'BBCCoUk') for entry in entries],
949 playlist_id, playlist_title, playlist_description)
9afa1770
S
950
951 # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
88ed52ae 952 medias = extract_all(r"data-media-meta='({[^']+})'")
9afa1770
S
953
954 if not medias:
955 # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
a346b1ff
S
956 media_asset = self._search_regex(
957 r'mediaAssetPage\.init\(\s*({.+?}), "/',
958 webpage, 'media asset', default=None)
959 if media_asset:
960 media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
961 medias = []
962 for video in media_asset_page.get('videos', {}).values():
963 medias.extend(video.values())
964
965 if not medias:
966 # Multiple video playlist with single `now playing` entry (e.g.
967 # http://www.bbc.com/news/video_and_audio/must_see/33767813)
968 vxp_playlist = self._parse_json(
9afa1770 969 self._search_regex(
a346b1ff
S
970 r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
971 webpage, 'playlist data'),
9afa1770 972 playlist_id)
a346b1ff
S
973 playlist_medias = []
974 for item in vxp_playlist:
975 media = item.get('media')
976 if not media:
977 continue
978 playlist_medias.append(media)
979 # Download single video if found media with asset id matching the video id from URL
980 if item.get('advert', {}).get('assetId') == playlist_id:
981 medias = [media]
982 break
983 # Fallback to the whole playlist
984 if not medias:
985 medias = playlist_medias
9afa1770
S
986
987 entries = []
988 for num, media_meta in enumerate(medias, start=1):
989 formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
990 if not formats:
991 continue
10273d6e 992 self._sort_formats(formats)
993
9afa1770
S
994 video_id = media_meta.get('externalId')
995 if not video_id:
996 video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
997
998 title = media_meta.get('caption')
999 if not title:
1000 title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
1001
1002 duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
da92eeae 1003
9afa1770
S
1004 images = []
1005 for image in media_meta.get('images', {}).values():
1006 images.extend(image.values())
1007 if 'image' in media_meta:
1008 images.append(media_meta['image'])
1009
1010 thumbnails = [{
1011 'url': image.get('href'),
1012 'width': int_or_none(image.get('width')),
1013 'height': int_or_none(image.get('height')),
1014 } for image in images]
1015
1016 entries.append({
1017 'id': video_id,
10273d6e 1018 'title': title,
9afa1770 1019 'thumbnails': thumbnails,
10273d6e 1020 'duration': duration,
9afa1770 1021 'timestamp': timestamp,
10273d6e 1022 'formats': formats,
1023 'subtitles': subtitles,
a3bfddfa 1024 })
10273d6e 1025
9afa1770 1026 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1027
1028
class BBCCoUkArticleIE(InfoExtractor):
    """Turn a bbc.co.uk programme article into a playlist of its clips."""

    # Dots are escaped so that they only match a literal '.', and the
    # 'www.' prefix is optional, matching the other bbc.co.uk extractors
    # in this file.
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Download the article page and return a playlist result.

        The playlist id is the article id captured by ``_VALID_URL``; each
        entry is a URL result for the ``resource`` attribute of a
        ``<div typeof="Clip">`` element found in the page.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)

        # The description lookup may come back as None when the page has no
        # og:description; only strip an actual string instead of crashing
        # with AttributeError on a missing optional field.
        description = self._og_search_description(webpage)
        if description is not None:
            description = description.strip()

        entries = [
            self.url_result(programme_url)
            for programme_url in re.findall(
                r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1057
1058
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared playlist logic for bbc.co.uk listing pages.

    Subclasses are expected to provide ``_URL_TEMPLATE``,
    ``_VIDEO_ID_TEMPLATE`` and ``_extract_title_and_description``.
    """

    def _real_extract(self, url):
        """Build a playlist from every video id found in the listing page."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        # The subclass template is specialised with the programme id pattern
        # shared with the single-video extractor.
        video_id_pattern = self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX

        entries = []
        for video_id in re.findall(video_id_pattern, webpage):
            video_url = self._URL_TEMPLATE % video_id
            entries.append(self.url_result(video_url, BBCCoUkIE.ie_key()))

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(entries, playlist_id, title, description)
1073
1074
class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for iPlayer ``episodes`` and ``group`` pages."""

    IE_NAME = 'bbc.co.uk:iplayer:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
    # Each id found in the page is expanded into an iPlayer episode URL.
    _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
    _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 6,
        'skip': 'This programme is not currently available on BBC iPlayer',
    }, {
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` scraped from the page markup.

        The title is read from an ``<h1>`` element and the description from
        a ``<p>`` element with class ``subtitle``; both lookups are
        non-fatal.
        """
        heading = self._search_regex(
            r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
        subtitle = self._search_regex(
            r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
            webpage, 'description', group='value', fatal=False)
        return heading, subtitle
1106
1107
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for programme ``episodes``/``broadcasts``/``clips`` pages."""

    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Each id found in the page is expanded into a programme URL.
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` taken from the page's Open Graph data."""
        return (
            self._og_search_title(webpage, fatal=False),
            self._og_search_description(webpage),
        )