]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/bbc.py
[videa] Improve and simplify (closes #8181, closes #11133)
[yt-dlp.git] / youtube_dl / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
9afa1770 4import re
254e64a2 5import itertools
082c6c86 6
f13b1e7d 7from .common import InfoExtractor
8683b4d8 8from ..utils import (
9fb64c04 9 dict_get,
8683b4d8 10 ExtractorError,
9afa1770 11 float_or_none,
8683b4d8 12 int_or_none,
9afa1770
S
13 parse_duration,
14 parse_iso8601,
9fb64c04 15 try_get,
dab062fb 16 unescapeHTML,
8683b4d8 17)
36e6f62c
JMF
18from ..compat import (
19 compat_etree_fromstring,
20 compat_HTTPError,
254e64a2 21 compat_urlparse,
36e6f62c 22)
082c6c86 23
d12a1a47 24
f13b1e7d 25class BBCCoUkIE(InfoExtractor):
082c6c86 26 IE_NAME = 'bbc.co.uk'
2e3fd9ec 27 IE_DESC = 'BBC iPlayer'
22d7368d 28 _ID_REGEX = r'[pb][\da-z]{7}'
f20a11ed
S
29 _VALID_URL = r'''(?x)
30 https?://
31 (?:www\.)?bbc\.co\.uk/
32 (?:
33 programmes/(?!articles/)|
34 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
35 music/clips[/#]|
36 radio/player/
37 )
ded7511a 38 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
f20a11ed 39 ''' % _ID_REGEX
082c6c86 40
d12a1a47 41 _MEDIASELECTOR_URLS = [
26ccc68b
S
42 # Provides HQ HLS streams with even better quality than the pc mediaset but fails
43 # with geolocation in some cases even when the programme is not geo restricted at all (e.g.
d781e293 44 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
d1c694ea 45 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
46 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
47 ]
a8b081a0 48
e6174ee9
S
49 _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
50 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
51
52 _NAMESPACES = (
53 _MEDIASELECTION_NS,
54 _EMP_PLAYLIST_NS,
55 )
56
2e3fd9ec
S
57 _TESTS = [
58 {
f2d0fc68 59 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 60 'info_dict': {
f2d0fc68 61 'id': 'b039d07m',
b1ea6802 62 'ext': 'flv',
679bacf0 63 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4',
c4914185 64 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
65 },
66 'params': {
b1ea6802 67 # rtmp download
2e3fd9ec
S
68 'skip_download': True,
69 }
082c6c86 70 },
2e3fd9ec
S
71 {
72 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
73 'info_dict': {
74 'id': 'b00yng1d',
75 'ext': 'flv',
76 'title': 'The Man in Black: Series 3: The Printed Name',
77 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
78 'duration': 1800,
79 },
80 'params': {
81 # rtmp download
82 'skip_download': True,
c7f0177f
S
83 },
84 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
85 },
86 {
87 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
88 'info_dict': {
89 'id': 'b00yng1d',
90 'ext': 'flv',
17968e44 91 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 92 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 93 'duration': 5100,
2e3fd9ec
S
94 },
95 'params': {
96 # rtmp download
97 'skip_download': True,
98 },
b1ea6802 99 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
100 },
101 {
102 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
103 'info_dict': {
104 'id': 'b03k3pb7',
105 'ext': 'flv',
106 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
107 'description': '2. Invasion',
108 'duration': 3600,
109 },
110 'params': {
111 # rtmp download
112 'skip_download': True,
113 },
b1ea6802 114 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
115 }, {
116 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
117 'info_dict': {
118 'id': 'b04v209v',
119 'ext': 'flv',
120 'title': 'Pete Tong, The Essential New Tune Special',
121 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
122 'duration': 10800,
123 },
124 'params': {
125 # rtmp download
126 'skip_download': True,
a3ef0e1c
YCH
127 },
128 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 129 }, {
5aa535c3 130 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
131 'note': 'Audio',
132 'info_dict': {
5aa535c3 133 'id': 'p022h44j',
b1ea6802 134 'ext': 'flv',
5aa535c3
S
135 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
136 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
137 'duration': 227,
c7e67594
S
138 },
139 'params': {
b1ea6802 140 # rtmp download
c7e67594
S
141 'skip_download': True,
142 }
143 }, {
144 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
145 'note': 'Video',
146 'info_dict': {
147 'id': 'p025c103',
b1ea6802 148 'ext': 'flv',
c7e67594
S
149 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
150 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
151 'duration': 226,
152 },
153 'params': {
b1ea6802 154 # rtmp download
c7e67594
S
155 'skip_download': True,
156 }
e68ae99a
S
157 }, {
158 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
159 'info_dict': {
160 'id': 'p02n76xf',
161 'ext': 'flv',
162 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
163 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
164 'duration': 3540,
165 },
166 'params': {
167 # rtmp download
168 'skip_download': True,
169 },
b1ea6802 170 'skip': 'geolocation',
25fa8d66
YCH
171 }, {
172 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
173 'info_dict': {
174 'id': 'b05zmgw1',
175 'ext': 'flv',
176 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
177 'title': 'Royal Academy Summer Exhibition',
178 'duration': 3540,
179 },
180 'params': {
181 # rtmp download
182 'skip_download': True,
183 },
b1ea6802 184 'skip': 'geolocation',
54914380
S
185 }, {
186 # iptv-all mediaset fails with geolocation however there is no geo restriction
187 # for this programme at all
5aa535c3 188 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 189 'info_dict': {
5aa535c3 190 'id': 'b06rkms3',
54914380 191 'ext': 'flv',
5aa535c3
S
192 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
193 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
194 },
195 'params': {
196 # rtmp download
197 'skip_download': True,
198 },
b1ea6802 199 'skip': 'Now it\'s really geo-restricted',
1ac6e794
S
200 }, {
201 # compact player (https://github.com/rg3/youtube-dl/issues/8147)
202 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
203 'info_dict': {
204 'id': 'p028bfkj',
b1ea6802 205 'ext': 'flv',
1ac6e794
S
206 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
207 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
208 },
209 'params': {
b1ea6802 210 # rtmp download
1ac6e794
S
211 'skip_download': True,
212 },
31763975
S
213 }, {
214 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
215 'only_matching': True,
c7e67594
S
216 }, {
217 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
218 'only_matching': True,
0692ef86
S
219 }, {
220 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
221 'only_matching': True,
f20a11ed
S
222 }, {
223 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
224 'only_matching': True,
ae6986fb 225 }
2e3fd9ec
S
226 ]
227
d12a1a47
S
class MediaSelectionError(Exception):
    """Signals that a media selection document contained an error element.

    ``id`` holds the error code that was passed in by the raiser (the value
    of the error element's ``id`` attribute).
    """

    def __init__(self, id):
        # Hand the code to Exception as well, so that str()/repr() and the
        # traceback of an uncaught error show it instead of an empty message.
        Exception.__init__(self, id)
        self.id = id
231
2e3fd9ec
S
def _extract_asx_playlist(self, connection, programme_id):
    """Download the ASX playlist behind *connection* and list its references.

    The playlist URL is the connection's ``href`` attribute; the result is
    the ``href`` of every ``./Entry/ref`` element, in document order.
    """
    asx_doc = self._download_xml(
        connection.get('href'), programme_id, 'Downloading ASX playlist')
    hrefs = []
    for ref_el in asx_doc.findall('./Entry/ref'):
        hrefs.append(ref_el.get('href'))
    return hrefs
235
def _extract_items(self, playlist):
    """Return the direct ``item`` children (EMP playlist namespace) of *playlist*."""
    item_xpath = './{%s}item' % self._EMP_PLAYLIST_NS
    return playlist.findall(item_xpath)
238
def _findall_ns(self, element, xpath):
    """Run *xpath* against *element* once per known namespace.

    *xpath* must contain a single ``%s`` placeholder that is filled with each
    entry of ``_NAMESPACES`` in turn; the matches are concatenated in that
    namespace order.
    """
    return [
        match
        for ns in self._NAMESPACES
        for match in element.findall(xpath % ns)
    ]
2e3fd9ec
S
244
def _extract_medias(self, media_selection):
    """Return the ``media`` children of a media selection document.

    Before looking for media, the document is checked for a direct ``error``
    child in the media selection namespace and then in the EMP playlist
    namespace.  If one is found, MediaSelectionError is raised with the
    element's ``id`` attribute.
    """
    error = None
    for ns in (self._MEDIASELECTION_NS, self._EMP_PLAYLIST_NS):
        # The result of the second lookup used to be thrown away, so an
        # error element in the EMP playlist namespace was never reported.
        error = media_selection.find('./{%s}error' % ns)
        if error is not None:
            break
    if error is not None:
        raise BBCCoUkIE.MediaSelectionError(error.get('id'))
    return self._findall_ns(media_selection, './{%s}media')
2e3fd9ec
S
252
def _extract_connections(self, media):
    """Return the ``connection`` children of *media* across all known namespaces."""
    connection_xpath = './{%s}connection'
    return self._findall_ns(media, connection_xpath)
2e3fd9ec 255
def _get_subtitles(self, media, programme_id):
    """Build the subtitles mapping for a captions *media* element.

    Each connection's caption document is downloaded only to read its
    ``xml:lang`` attribute (falling back to ``'en'``).  The mapping goes from
    that language to a one-element list holding the caption URL with the
    ``ttml`` extension; a later connection with the same language replaces
    an earlier one.
    """
    lang_attr = '{http://www.w3.org/XML/1998/namespace}lang'
    by_lang = {}
    for connection in self._extract_connections(media):
        captions_url = connection.get('href')
        captions_doc = self._download_xml(
            captions_url, programme_id, 'Downloading captions')
        language = captions_doc.get(lang_attr, 'en')
        by_lang[language] = [{
            'url': captions_url,
            'ext': 'ttml',
        }]
    return by_lang
082c6c86 268
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Raise an expected ExtractorError naming this extractor and the error id."""
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
273
def _download_media_selector(self, programme_id):
    """Try each media selector URL in order and return the first result.

    A MediaSelectionError whose id is one of the retryable codes moves on to
    the next URL; any other id is turned into an ExtractorError right away.
    If every URL failed with a retryable code, the last such error is
    reported.
    """
    retryable_ids = ('notukerror', 'geolocation', 'selectionunavailable')
    last_error = None
    for url_template in self._MEDIASELECTOR_URLS:
        selector_url = url_template % programme_id
        try:
            return self._download_media_selector_url(selector_url, programme_id)
        except BBCCoUkIE.MediaSelectionError as err:
            if err.id not in retryable_ids:
                self._raise_extractor_error(err)
            else:
                last_error = err
    self._raise_extractor_error(last_error)
9afa1770
S
286
def _download_media_selector_url(self, url, programme_id=None):
    """Fetch one media selection document and process it.

    When the download fails with an HTTP 403 or 404, the response body is
    parsed as the media selection XML instead; any other failure is
    re-raised unchanged.
    """
    try:
        selection_doc = self._download_xml(
            url, programme_id, 'Downloading media selection XML')
    except ExtractorError as err:
        http_error = err.cause
        body_is_usable = (
            isinstance(http_error, compat_HTTPError)
            and http_error.code in (403, 404))
        if not body_is_usable:
            raise
        selection_doc = compat_etree_fromstring(http_error.read().decode('utf-8'))
    return self._process_media_selector(selection_doc, programme_id)
082c6c86 297
def _process_media_selector(self, media_selection, programme_id):
    """Turn a media selection document into ``(formats, subtitles)``.

    ``video``/``audio`` media contribute one or more formats per connection;
    ``captions`` media set the subtitles (the last captions element wins).
    ``subtitles`` stays ``None`` when the document has no captions media.
    """
    formats = []
    subtitles = None
    seen_hrefs = []

    for media in self._extract_medias(media_selection):
        kind = media.get('kind')
        if kind == 'captions':
            subtitles = self.extract_subtitles(media, programme_id)
            continue
        if kind not in ('video', 'audio'):
            continue

        bitrate = int_or_none(media.get('bitrate'))
        encoding = media.get('encoding')
        service = media.get('service')
        width = int_or_none(media.get('width'))
        height = int_or_none(media.get('height'))
        file_size = int_or_none(media.get('media_file_size'))

        for connection in self._extract_connections(media):
            href = connection.get('href')
            # The same URL may be offered by several media/connections;
            # only the first occurrence is used.
            if href in seen_hrefs:
                continue
            if href:
                seen_hrefs.append(href)

            conn_kind = connection.get('kind')
            protocol = connection.get('protocol')
            supplier = connection.get('supplier')
            transfer_format = connection.get('transferFormat')

            format_id = supplier or conn_kind or protocol
            if service:
                format_id = '%s_%s' % (service, format_id)

            if supplier == 'asx':
                # ASX playlist: one format per referenced URL
                asx_refs = self._extract_asx_playlist(connection, programme_id)
                for index, ref in enumerate(asx_refs):
                    formats.append({
                        'url': ref,
                        'format_id': 'ref%s_%s' % (index, format_id),
                    })
                continue
            if transfer_format == 'dash':
                formats.extend(self._extract_mpd_formats(
                    href, programme_id, mpd_id=format_id, fatal=False))
                continue
            if transfer_format == 'hls':
                formats.extend(self._extract_m3u8_formats(
                    href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                    m3u8_id=format_id, fatal=False))
                continue
            if transfer_format == 'hds':
                formats.extend(self._extract_f4m_formats(
                    href, programme_id, f4m_id=format_id, fatal=False))
                continue

            # Plain connection described directly by the XML attributes
            if not (service or supplier) and bitrate:
                format_id += '-%d' % bitrate
            if kind == 'video':
                fmt = {
                    'format_id': format_id,
                    'filesize': file_size,
                    'width': width,
                    'height': height,
                    'vbr': bitrate,
                    'vcodec': encoding,
                }
            else:
                fmt = {
                    'format_id': format_id,
                    'filesize': file_size,
                    'abr': bitrate,
                    'acodec': encoding,
                    'vcodec': 'none',
                }
            if protocol == 'http':
                # Direct link
                fmt['url'] = href
            elif protocol == 'rtmp':
                application = connection.get('application', 'ondemand')
                auth_string = connection.get('authString')
                identifier = connection.get('identifier')
                server = connection.get('server')
                fmt.update({
                    'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                    'play_path': identifier,
                    'app': '%s?%s' % (application, auth_string),
                    'page_url': 'http://www.bbc.co.uk',
                    'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                    'rtmp_live': False,
                    'ext': 'flv',
                })
            formats.append(fmt)

    return formats, subtitles
2e3fd9ec 385
ae6986fb
S
def _download_playlist(self, playlist_id):
    """Resolve a programme through its ``playlist.json`` document.

    Returns ``(programme_id, title, description, duration, formats,
    subtitles)``.  When the JSON has no ``defaultAvailableVersion``, or the
    request fails with an HTTP 404, the legacy XML playlist is used instead.
    """
    wanted_kinds = ('programme', 'radioProgramme')
    try:
        playlist = self._download_json(
            'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
            playlist_id, 'Downloading playlist JSON')

        version = playlist.get('defaultAvailableVersion')
        if version:
            smp_config = version['smpConfig']
            title = smp_config['title']
            description = smp_config['summary']
            # Every matching item is resolved; the values of the last one
            # are what gets returned.
            for item in smp_config['items']:
                if item['kind'] not in wanted_kinds:
                    continue
                programme_id = item.get('vpid')
                duration = int_or_none(item.get('duration'))
                formats, subtitles = self._download_media_selector(programme_id)
            return programme_id, title, description, duration, formats, subtitles
    except ExtractorError as ee:
        is_not_found = isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404
        if not is_not_found:
            raise

    # fallback to legacy playlist
    return self._process_legacy_playlist(playlist_id)
411
def _process_legacy_playlist_url(self, url, display_id):
    """Download the legacy playlist at *url* and extract its programme info."""
    legacy_playlist = self._download_legacy_playlist_url(url, display_id)
    programme_info = self._extract_from_legacy_playlist(legacy_playlist, display_id)
    return programme_info
415
def _process_legacy_playlist(self, playlist_id):
    """Process the legacy iPlayer playlist that belongs to *playlist_id*."""
    legacy_url = 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id
    return self._process_legacy_playlist_url(legacy_url, playlist_id)
419
def _download_legacy_playlist_url(self, url, playlist_id=None):
    """Download and return the legacy playlist XML found at *url*."""
    note = 'Downloading legacy playlist XML'
    return self._download_xml(url, playlist_id, note)
ae6986fb 423
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme data from a legacy (EMP) playlist document.

    Raises an expected ExtractorError when the playlist carries a
    ``noItems`` element.  Otherwise every ``programme``/``radioProgramme``
    item is processed and the values of the last one are returned as
    ``(programme_id, title, description, duration, formats, subtitles)``.
    """
    emp_ns = self._EMP_PLAYLIST_NS

    no_items = playlist.find('./{%s}noItems' % emp_ns)
    if no_items is not None:
        reason = no_items.get('reason')
        known_reasons = {
            'preAvailability': 'Episode %s is not yet available',
            'postAvailability': 'Episode %s is no longer available',
            'noMedia': 'Episode %s is not currently available',
        }
        if reason in known_reasons:
            msg = known_reasons[reason] % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def id_from_attributes(element):
        # First attribute that looks like a BBC pid wins; None otherwise.
        for attr_name in ('identifier', 'group'):
            candidate = element.get(attr_name)
            if candidate and re.match(r'^[pb][\da-z]{7}$', candidate):
                return candidate

    def find_programme_id(item):
        # NOTE(review): the lookup on the item itself is performed but its
        # result is discarded, so only the mediator's attributes decide the
        # id.  This looks unintended - confirm before changing, since it
        # alters which code path handles the item.
        id_from_attributes(item)
        mediator = item.find('./{%s}mediator' % emp_ns)
        if mediator is not None:
            return id_from_attributes(mediator)

    for item in self._extract_items(playlist):
        if item.get('kind') not in ('programme', 'radioProgramme'):
            continue

        title = playlist.find('./{%s}title' % emp_ns).text
        summary_el = playlist.find('./{%s}summary' % emp_ns)
        description = None if summary_el is None else summary_el.text

        programme_id = find_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No usable id: the item itself is treated as the media selection
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

    return programme_id, title, description, duration, formats, subtitles
467
c056efa2
S
def _real_extract(self, url):
    """Extract a single programme from a bbc.co.uk page.

    The programme id (vpid) is looked up in the page's player configuration
    first and then anywhere in the page; if neither yields one, the
    playlist for the id taken from the URL is used instead.
    """
    group_id = self._match_id(url)
    webpage = self._download_webpage(url, group_id, 'Downloading video page')

    programme_id = None
    duration = None

    player_json = self._search_regex(
        r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
        webpage, 'player', default=None)
    if player_json:
        player = self._parse_json(player_json, group_id).get('player', {})
        duration = int_or_none(player.get('duration'))
        programme_id = player.get('vpid')

    if not programme_id:
        programme_id = self._search_regex(
            r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)

    if not programme_id:
        (programme_id, title, description,
         duration, formats, subtitles) = self._download_playlist(group_id)
    else:
        formats, subtitles = self._download_media_selector(programme_id)
        title_patterns = (
            r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
            r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>')
        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
            title_patterns, webpage, 'title')
        description_patterns = (
            r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
            r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>')
        description = self._search_regex(
            description_patterns, webpage, 'description', default=None)
        if not description:
            description = self._html_search_meta('description', webpage)

    self._sort_formats(formats)

    info = {
        'id': programme_id,
        'title': title,
        'description': description,
        'thumbnail': self._og_search_thumbnail(webpage, default=None),
        'duration': duration,
        'formats': formats,
        'subtitles': subtitles,
    }
    return info
10273d6e 514
515
9afa1770
S
516class BBCIE(BBCCoUkIE):
517 IE_NAME = 'bbc'
518 IE_DESC = 'BBC'
519 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 520
d12a1a47 521 _MEDIASELECTOR_URLS = [
55ebae26
S
522 # Provides HQ HLS streams but fails with geolocation in some cases even when the
523 # content is not geo restricted at all
524 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
525 # Provides more formats, namely direct mp4 links, but fails on some videos with
526 # notukerror for non UK (?) users (e.g.
527 # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
528 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
529 # Provides fewer formats, but works everywhere for everybody (hopefully)
530 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
531 ]
10273d6e 532
533 _TESTS = [{
6a747190 534 # article with multiple videos embedded with data-playable containing vpids
10273d6e 535 'url': 'http://www.bbc.com/news/world-europe-32668511',
536 'info_dict': {
537 'id': 'world-europe-32668511',
538 'title': 'Russia stages massive WW2 parade despite Western boycott',
9afa1770 539 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 540 },
541 'playlist_count': 2,
a3bfddfa 542 }, {
6a747190 543 # article with multiple videos embedded with data-playable (more videos)
10273d6e 544 'url': 'http://www.bbc.com/news/business-28299555',
545 'info_dict': {
546 'id': 'business-28299555',
547 'title': 'Farnborough Airshow: Video highlights',
9afa1770 548 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 549 },
550 'playlist_count': 9,
9afa1770 551 'skip': 'Save time',
88ed52ae
S
552 }, {
553 # article with multiple videos embedded with `new SMP()`
6a747190 554 # broken
88ed52ae
S
555 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
556 'info_dict': {
557 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 558 'title': 'BUGGER',
88ed52ae
S
559 },
560 'playlist_count': 18,
a3bfddfa 561 }, {
6a747190 562 # single video embedded with data-playable containing vpid
10273d6e 563 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 564 'info_dict': {
565 'id': 'p02mprgb',
55ebae26 566 'ext': 'mp4',
10273d6e 567 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 568 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 569 'duration': 47,
9afa1770 570 'timestamp': 1427219242,
da92eeae 571 'upload_date': '20150324',
10273d6e 572 },
573 'params': {
9afa1770 574 # rtmp download
10273d6e 575 'skip_download': True,
576 }
a3bfddfa 577 }, {
6a747190
S
578 # article with single video embedded with data-playable containing XML playlist
579 # with direct video links as progressiveDownloadUrl (for now these are extracted)
580 # and playlist with f4m and m3u8 as streamingUrl
de939d89 581 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 582 'info_dict': {
9afa1770 583 'id': '150615_telabyad_kentin_cogu',
de939d89 584 'ext': 'mp4',
ad152e2d 585 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 586 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 587 'timestamp': 1434397334,
da92eeae 588 'upload_date': '20150615',
de939d89 589 },
590 'params': {
591 'skip_download': True,
592 }
c936d8cc 593 }, {
6a747190 594 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 595 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 596 'info_dict': {
9afa1770 597 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 598 'ext': 'mp4',
9afa1770 599 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 600 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 601 'timestamp': 1434713142,
da92eeae 602 'upload_date': '20150619',
de939d89 603 },
604 'params': {
605 'skip_download': True,
606 }
a346b1ff
S
607 }, {
608 # single video from video playlist embedded with vxp-playlist-data JSON
609 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
610 'info_dict': {
611 'id': 'p02w6qjc',
55ebae26 612 'ext': 'mp4',
a346b1ff
S
613 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
614 'duration': 56,
0bc4ee60 615 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
616 },
617 'params': {
618 'skip_download': True,
619 }
9afa1770
S
620 }, {
621 # single video story with digitalData
622 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
623 'info_dict': {
624 'id': 'p02q6gc4',
625 'ext': 'flv',
626 'title': 'Sri Lanka’s spicy secret',
627 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
628 'timestamp': 1437674293,
629 'upload_date': '20150723',
630 },
631 'params': {
632 # rtmp download
633 'skip_download': True,
634 }
635 }, {
636 # single video story without digitalData
637 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
638 'info_dict': {
639 'id': 'p018zqqg',
55ebae26 640 'ext': 'mp4',
9afa1770
S
641 'title': 'Hyundai Santa Fe Sport: Rock star',
642 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
643 'timestamp': 1415867444,
644 'upload_date': '20141113',
9afa1770
S
645 },
646 'params': {
647 # rtmp download
648 'skip_download': True,
649 }
9fb64c04
S
650 }, {
651 # single video embedded with Morph
652 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
653 'info_dict': {
654 'id': 'p041vhd0',
655 'ext': 'mp4',
656 'title': "Nigeria v Japan - Men's First Round",
657 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
658 'duration': 7980,
659 'uploader': 'BBC Sport',
660 'uploader_id': 'bbc_sport',
661 },
662 'params': {
663 # m3u8 download
664 'skip_download': True,
9fb64c04
S
665 },
666 'skip': 'Georestricted to UK',
9afa1770 667 }, {
6a747190 668 # single video with playlist.sxml URL in playlist param
9afa1770
S
669 'url': 'http://www.bbc.com/sport/0/football/33653409',
670 'info_dict': {
671 'id': 'p02xycnp',
55ebae26 672 'ext': 'mp4',
9afa1770 673 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 674 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
675 'duration': 140,
676 },
677 'params': {
678 # rtmp download
679 'skip_download': True,
680 }
b5d48cb1 681 }, {
6a747190 682 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
683 'url': 'http://www.bbc.com/sport/0/football/34475836',
684 'info_dict': {
685 'id': '34475836',
450b233c 686 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 687 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
688 },
689 'playlist_count': 3,
450b233c
S
690 }, {
691 # school report article with single video
692 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
693 'info_dict': {
694 'id': '35744779',
695 'title': 'School which breaks down barriers in Jerusalem',
696 },
697 'playlist_count': 1,
9afa1770
S
698 }, {
699 # single video with playlist URL from weather section
700 'url': 'http://www.bbc.com/weather/features/33601775',
701 'only_matching': True,
702 }, {
703 # custom redirection to www.bbc.com
704 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
705 'only_matching': True,
a1cf3e38
S
706 }, {
707 # single video article embedded with data-media-vpid
708 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
709 'only_matching': True,
10273d6e 710 }]
711
9afa1770
S
@classmethod
def suitable(cls, url):
    """Accept *url* only if none of the more specific BBC extractors does."""
    more_specific = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
    for extractor in more_specific:
        if extractor.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
717
def _extract_from_media_meta(self, media_meta, video_id):
    """Return ``(formats, subtitles)`` for one media metadata dict.

    Sources are tried in this order: direct files under ``sourceFiles``,
    the media selector for ``externalId``, then the legacy playlist behind
    ``href``.  ``([], [])`` is returned when none of them is present.
    """
    # Direct links to media in media metadata (e.g.
    # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
    # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
    source_files = media_meta.get('sourceFiles')
    if source_files:
        direct_formats = []
        for format_id, source in source_files.items():
            if not source.get('url'):
                continue
            direct_formats.append({
                'url': source['url'],
                'format_id': format_id,
                'ext': source.get('encoding'),
                'tbr': float_or_none(source.get('bitrate'), 1000),
                'filesize': int_or_none(source.get('filesize')),
            })
        return direct_formats, []

    programme_id = media_meta.get('externalId')
    if programme_id:
        return self._download_media_selector(programme_id)

    # Process playlist.sxml as legacy playlist
    href = media_meta.get('href')
    if not href:
        return [], []
    playlist = self._download_legacy_playlist_url(href)
    _pid, _title, _description, _duration, formats, subtitles = (
        self._extract_from_legacy_playlist(playlist, video_id))
    return formats, subtitles
744
baf39a1a
S
def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
    """Build an info dict from the legacy playlist (playlist.sxml) at *url*.

    The formats are sorted before being returned; *timestamp* is passed
    through unchanged.
    """
    legacy_info = self._process_legacy_playlist_url(url, playlist_id)
    programme_id, title, description, duration, formats, subtitles = legacy_info
    self._sort_formats(formats)
    entry = {
        'id': programme_id,
        'title': title,
        'description': description,
        'duration': duration,
        'timestamp': timestamp,
        'formats': formats,
        'subtitles': subtitles,
    }
    return entry
758
10273d6e 759 def _real_extract(self, url):
9afa1770
S
760 playlist_id = self._match_id(url)
761
762 webpage = self._download_webpage(url, playlist_id)
763
522f6c06 764 json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
350e02d4 765 timestamp = json_ld_info.get('timestamp')
0e832c2c 766
350e02d4 767 playlist_title = json_ld_info.get('title')
0e832c2c
S
768 if not playlist_title:
769 playlist_title = self._og_search_title(
770 webpage, default=None) or self._html_search_regex(
771 r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
772 if playlist_title:
773 playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
774
775 playlist_description = json_ld_info.get(
776 'description') or self._og_search_description(webpage, default=None)
ae8bdfd1
S
777
778 if not timestamp:
779 timestamp = parse_iso8601(self._search_regex(
780 [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
781 r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
6f789365 782 r'"datePublished":\s*"([^"]+)'],
ae8bdfd1 783 webpage, 'date', default=None))
9afa1770 784
78f9d843
S
785 entries = []
786
de665713
S
787 # article with multiple videos embedded with playlist.sxml (e.g.
788 # http://www.bbc.com/sport/0/football/34475836)
789 playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
222e11d4 790 playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
de665713 791 if playlists:
baf39a1a
S
792 entries = [
793 self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
794 for playlist_url in playlists]
de939d89 795
78f9d843
S
796 # news article with multiple videos embedded with data-playable
797 data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
798 if data_playables:
799 for _, data_playable_json in data_playables:
800 data_playable = self._parse_json(
801 unescapeHTML(data_playable_json), playlist_id, fatal=False)
802 if not data_playable:
803 continue
baf39a1a
S
804 settings = data_playable.get('settings', {})
805 if settings:
78f9d843
S
806 # data-playable with video vpid in settings.playlistObject.items (e.g.
807 # http://www.bbc.com/news/world-us-canada-34473351)
baf39a1a
S
808 playlist_object = settings.get('playlistObject', {})
809 if playlist_object:
810 items = playlist_object.get('items')
811 if items and isinstance(items, list):
78f9d843
S
812 title = playlist_object['title']
813 description = playlist_object.get('summary')
baf39a1a
S
814 duration = int_or_none(items[0].get('duration'))
815 programme_id = items[0].get('vpid')
78f9d843
S
816 formats, subtitles = self._download_media_selector(programme_id)
817 self._sort_formats(formats)
818 entries.append({
819 'id': programme_id,
820 'title': title,
821 'description': description,
822 'timestamp': timestamp,
823 'duration': duration,
824 'formats': formats,
825 'subtitles': subtitles,
826 })
827 else:
828 # data-playable without vpid but with a playlist.sxml URLs
829 # in otherSettings.playlist (e.g.
830 # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
831 playlist = data_playable.get('otherSettings', {}).get('playlist', {})
832 if playlist:
a7e5f274
RA
833 entry = None
834 for key in ('streaming', 'progressiveDownload'):
05087d1b
S
835 playlist_url = playlist.get('%sUrl' % key)
836 if not playlist_url:
837 continue
838 try:
a7e5f274
RA
839 info = self._extract_from_playlist_sxml(
840 playlist_url, playlist_id, timestamp)
841 if not entry:
842 entry = info
843 else:
844 entry['title'] = info['title']
845 entry['formats'].extend(info['formats'])
05087d1b
S
846 except Exception as e:
847 # Some playlist URL may fail with 500, at the same time
848 # the other one may work fine (e.g.
849 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
850 if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
851 continue
852 raise
a7e5f274
RA
853 if entry:
854 self._sort_formats(entry['formats'])
855 entries.append(entry)
78f9d843
S
856
857 if entries:
78f9d843
S
858 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
859
860 # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
861 programme_id = self._search_regex(
a1cf3e38 862 [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
22d7368d
S
863 r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
864 r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
78f9d843 865 webpage, 'vpid', default=None)
dab062fb 866
9afa1770
S
867 if programme_id:
868 formats, subtitles = self._download_media_selector(programme_id)
869 self._sort_formats(formats)
870 # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
871 digital_data = self._parse_json(
872 self._search_regex(
873 r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
874 programme_id, fatal=False)
875 page_info = digital_data.get('page', {}).get('pageInfo', {})
876 title = page_info.get('pageName') or self._og_search_title(webpage)
877 description = page_info.get('description') or self._og_search_description(webpage)
878 timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
879 return {
880 'id': programme_id,
881 'title': title,
882 'description': description,
883 'timestamp': timestamp,
884 'formats': formats,
885 'subtitles': subtitles,
886 }
a3bfddfa 887
9fb64c04
S
888 # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
889 # There are several setPayload calls may be present but the video
890 # seems to be always related to the first one
891 morph_payload = self._parse_json(
892 self._search_regex(
893 r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
894 webpage, 'morph payload', default='{}'),
895 playlist_id, fatal=False)
896 if morph_payload:
897 components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
898 for component in components:
899 if not isinstance(component, dict):
900 continue
901 lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
902 if not lead_media:
903 continue
904 identifiers = lead_media.get('identifiers')
905 if not identifiers or not isinstance(identifiers, dict):
906 continue
907 programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
908 if not programme_id:
909 continue
910 title = lead_media.get('title') or self._og_search_title(webpage)
911 formats, subtitles = self._download_media_selector(programme_id)
912 self._sort_formats(formats)
913 description = lead_media.get('summary')
914 uploader = lead_media.get('masterBrand')
915 uploader_id = lead_media.get('mid')
916 duration = None
917 duration_d = lead_media.get('duration')
918 if isinstance(duration_d, dict):
919 duration = parse_duration(dict_get(
920 duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
921 return {
922 'id': programme_id,
923 'title': title,
924 'description': description,
925 'duration': duration,
926 'uploader': uploader,
927 'uploader_id': uploader_id,
928 'formats': formats,
929 'subtitles': subtitles,
930 }
931
88ed52ae
S
932 def extract_all(pattern):
933 return list(filter(None, map(
934 lambda s: self._parse_json(s, playlist_id, fatal=False),
935 re.findall(pattern, webpage))))
936
937 # Multiple video article (e.g.
938 # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
22d7368d 939 EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
88ed52ae
S
940 entries = []
941 for match in extract_all(r'new\s+SMP\(({.+?})\)'):
942 embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
943 if embed_url and re.match(EMBED_URL, embed_url):
944 entries.append(embed_url)
945 entries.extend(re.findall(
946 r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
947 if entries:
948 return self.playlist_result(
aaa42cf0 949 [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
88ed52ae 950 playlist_id, playlist_title, playlist_description)
9afa1770
S
951
952 # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
88ed52ae 953 medias = extract_all(r"data-media-meta='({[^']+})'")
9afa1770
S
954
955 if not medias:
956 # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
a346b1ff
S
957 media_asset = self._search_regex(
958 r'mediaAssetPage\.init\(\s*({.+?}), "/',
959 webpage, 'media asset', default=None)
960 if media_asset:
961 media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
962 medias = []
963 for video in media_asset_page.get('videos', {}).values():
964 medias.extend(video.values())
965
966 if not medias:
967 # Multiple video playlist with single `now playing` entry (e.g.
968 # http://www.bbc.com/news/video_and_audio/must_see/33767813)
969 vxp_playlist = self._parse_json(
9afa1770 970 self._search_regex(
a346b1ff
S
971 r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
972 webpage, 'playlist data'),
9afa1770 973 playlist_id)
a346b1ff
S
974 playlist_medias = []
975 for item in vxp_playlist:
976 media = item.get('media')
977 if not media:
978 continue
979 playlist_medias.append(media)
980 # Download single video if found media with asset id matching the video id from URL
981 if item.get('advert', {}).get('assetId') == playlist_id:
982 medias = [media]
983 break
984 # Fallback to the whole playlist
985 if not medias:
986 medias = playlist_medias
9afa1770
S
987
988 entries = []
989 for num, media_meta in enumerate(medias, start=1):
990 formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
991 if not formats:
992 continue
10273d6e 993 self._sort_formats(formats)
994
9afa1770
S
995 video_id = media_meta.get('externalId')
996 if not video_id:
997 video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
998
999 title = media_meta.get('caption')
1000 if not title:
1001 title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
1002
1003 duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
da92eeae 1004
9afa1770
S
1005 images = []
1006 for image in media_meta.get('images', {}).values():
1007 images.extend(image.values())
1008 if 'image' in media_meta:
1009 images.append(media_meta['image'])
1010
1011 thumbnails = [{
1012 'url': image.get('href'),
1013 'width': int_or_none(image.get('width')),
1014 'height': int_or_none(image.get('height')),
1015 } for image in images]
1016
1017 entries.append({
1018 'id': video_id,
10273d6e 1019 'title': title,
9afa1770 1020 'thumbnails': thumbnails,
10273d6e 1021 'duration': duration,
9afa1770 1022 'timestamp': timestamp,
10273d6e 1023 'formats': formats,
1024 'subtitles': subtitles,
a3bfddfa 1025 })
10273d6e 1026
9afa1770 1027 return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1028
1029
class BBCCoUkArticleIE(InfoExtractor):
    """Extract the clips embedded in a bbc.co.uk programme article.

    The article page is turned into a playlist whose entries are URL
    results, one for every ``typeof="Clip"`` element found in the page.
    """

    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist result for the article at *url*.

        The playlist id is the article id captured by ``_VALID_URL``; title
        and description come from the page's Open Graph metadata.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)

        # The description is optional metadata. When the page carries no
        # og:description the lookup yields nothing, so only strip a value
        # that was actually found instead of calling .strip() on None.
        # NOTE(review): relies on _og_search_description being non-fatal;
        # confirm against InfoExtractor in common.py.
        description = self._og_search_description(webpage)
        if description:
            description = description.strip()

        # Each clip is referenced through the `resource` attribute of a
        # <div typeof="Clip"> element; delegate every URL to whichever
        # extractor matches it.
        entries = [
            self.url_result(programme_url)
            for programme_url in re.findall(
                r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1058
1059
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared implementation for paginated bbc.co.uk playlist pages.

    Subclasses are expected to provide:
      * ``_URL_TEMPLATE`` - ``%s`` template producing a video URL from an id
      * ``_VIDEO_ID_TEMPLATE`` - ``%s`` regex template (filled with
        ``BBCCoUkIE._ID_REGEX``) capturing video ids in the page markup
      * ``_extract_title_and_description(webpage)`` - returns a
        ``(title, description)`` pair
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a URL result for every video id found on the playlist pages.

        Starts with the already downloaded *webpage* and follows the
        "next" pagination link until none is left. When *url* carries an
        explicit ``page`` query parameter only that single page is processed.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # The id pattern does not change between pages; build it once.
        video_id_re = self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX
        # The first page is already in hand, so numbering of the pages that
        # still have to be downloaded starts at 2.
        for page_num in itertools.count(2):
            for video_id in re.findall(video_id_re, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # Pass the notes by keyword: the fourth positional parameter is
            # the error note, and handing it the bare page number produced
            # an uninformative failure message.
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                note='Downloading page %d' % page_num,
                errnote='Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Return a playlist result covering all entries of the playlist."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
ded7511a
S
1090
1091
class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for iPlayer ``episodes`` and ``group`` pages."""

    IE_NAME = 'bbc.co.uk:iplayer:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
    # Episode URL built from each id captured by _VIDEO_ID_TEMPLATE.
    _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
    _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 6,
        'skip': 'This programme is not currently available on BBC iPlayer',
    }, {
        # This group stays available for over a year, whereas most other
        # programmes are only available for 30 days
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` scraped from the page markup.

        The title is the text of the first plain ``<h1>``; the description
        is the text of the ``<p class="subtitle">`` element. Both lookups
        are non-fatal.
        """
        return (
            self._search_regex(
                r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False),
            self._search_regex(
                r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
                webpage, 'description', fatal=False, group='value'),
        )
1123
1124
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for programme episodes/broadcasts/clips listings."""

    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Programme URL built from each id captured by _VIDEO_ID_TEMPLATE.
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # playlist spanning several pages, one page requested explicitly
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # playlist spanning several pages, every page followed
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` from the page's Open Graph tags.

        The title lookup is explicitly non-fatal.
        """
        og_title = self._og_search_title(webpage, fatal=False)
        og_description = self._og_search_description(webpage)
        return og_title, og_description