]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/bbc.py
[iwara] Fix download URLs (closes #17026)
[yt-dlp.git] / youtube_dl / extractor / bbc.py
CommitLineData
9afa1770 1# coding: utf-8
082c6c86
S
2from __future__ import unicode_literals
3
9afa1770 4import re
254e64a2 5import itertools
082c6c86 6
f13b1e7d 7from .common import InfoExtractor
8683b4d8 8from ..utils import (
97067db2 9 clean_html,
9fb64c04 10 dict_get,
8683b4d8 11 ExtractorError,
9afa1770 12 float_or_none,
97067db2 13 get_element_by_class,
8683b4d8 14 int_or_none,
6d155707 15 js_to_json,
9afa1770
S
16 parse_duration,
17 parse_iso8601,
9fb64c04 18 try_get,
dab062fb 19 unescapeHTML,
97067db2
S
20 urlencode_postdata,
21 urljoin,
8683b4d8 22)
36e6f62c 23from ..compat import (
36e6f62c 24 compat_HTTPError,
254e64a2 25 compat_urlparse,
36e6f62c 26)
082c6c86 27
d12a1a47 28
f13b1e7d 29class BBCCoUkIE(InfoExtractor):
082c6c86 30 IE_NAME = 'bbc.co.uk'
2e3fd9ec 31 IE_DESC = 'BBC iPlayer'
53647dfd 32 _ID_REGEX = r'[pbw][\da-z]{7}'
f20a11ed
S
33 _VALID_URL = r'''(?x)
34 https?://
35 (?:www\.)?bbc\.co\.uk/
36 (?:
37 programmes/(?!articles/)|
38 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
72d256c4 39 music/(?:clips|audiovideo/popular)[/#]|
d3d45e0a
S
40 radio/player/|
41 events/[^/]+/play/[^/]+/
f20a11ed 42 )
ded7511a 43 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
f20a11ed 44 ''' % _ID_REGEX
082c6c86 45
97067db2
S
46 _LOGIN_URL = 'https://account.bbc.com/signin'
47 _NETRC_MACHINE = 'bbc'
48
d12a1a47 49 _MEDIASELECTOR_URLS = [
26ccc68b
S
50 # Provides HQ HLS streams with even better quality than pc mediaset but fails
51 # with geolocation in some cases when it's even not geo restricted at all (e.g.
d781e293 52 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
d1c694ea 53 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
54 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
55 ]
a8b081a0 56
e6174ee9
S
57 _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
58 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
59
60 _NAMESPACES = (
61 _MEDIASELECTION_NS,
62 _EMP_PLAYLIST_NS,
63 )
64
2e3fd9ec
S
65 _TESTS = [
66 {
f2d0fc68 67 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
2e3fd9ec 68 'info_dict': {
f2d0fc68 69 'id': 'b039d07m',
b1ea6802 70 'ext': 'flv',
679bacf0 71 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4',
c4914185 72 'description': 'The Canadian poet and songwriter reflects on his musical career.',
2e3fd9ec
S
73 },
74 'params': {
b1ea6802 75 # rtmp download
2e3fd9ec
S
76 'skip_download': True,
77 }
082c6c86 78 },
2e3fd9ec
S
79 {
80 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
81 'info_dict': {
82 'id': 'b00yng1d',
83 'ext': 'flv',
84 'title': 'The Man in Black: Series 3: The Printed Name',
85 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
86 'duration': 1800,
87 },
88 'params': {
89 # rtmp download
90 'skip_download': True,
c7f0177f
S
91 },
92 'skip': 'Episode is no longer available on BBC iPlayer Radio',
2e3fd9ec
S
93 },
94 {
95 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
96 'info_dict': {
97 'id': 'b00yng1d',
98 'ext': 'flv',
17968e44 99 'title': 'The Voice UK: Series 3: Blind Auditions 5',
611c1dd9 100 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
17968e44 101 'duration': 5100,
2e3fd9ec
S
102 },
103 'params': {
104 # rtmp download
105 'skip_download': True,
106 },
b1ea6802 107 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
c056efa2
S
108 },
109 {
110 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
111 'info_dict': {
112 'id': 'b03k3pb7',
113 'ext': 'flv',
114 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
115 'description': '2. Invasion',
116 'duration': 3600,
117 },
118 'params': {
119 # rtmp download
120 'skip_download': True,
121 },
b1ea6802 122 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
ae6986fb
S
123 }, {
124 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
125 'info_dict': {
126 'id': 'b04v209v',
127 'ext': 'flv',
128 'title': 'Pete Tong, The Essential New Tune Special',
129 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
130 'duration': 10800,
131 },
132 'params': {
133 # rtmp download
134 'skip_download': True,
a3ef0e1c
YCH
135 },
136 'skip': 'Episode is no longer available on BBC iPlayer Radio',
c7e67594 137 }, {
5aa535c3 138 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
c7e67594
S
139 'note': 'Audio',
140 'info_dict': {
5aa535c3 141 'id': 'p022h44j',
b1ea6802 142 'ext': 'flv',
5aa535c3
S
143 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
144 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
145 'duration': 227,
c7e67594
S
146 },
147 'params': {
b1ea6802 148 # rtmp download
c7e67594
S
149 'skip_download': True,
150 }
151 }, {
152 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
153 'note': 'Video',
154 'info_dict': {
155 'id': 'p025c103',
b1ea6802 156 'ext': 'flv',
c7e67594
S
157 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
158 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
159 'duration': 226,
160 },
161 'params': {
b1ea6802 162 # rtmp download
c7e67594
S
163 'skip_download': True,
164 }
e68ae99a
S
165 }, {
166 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
167 'info_dict': {
168 'id': 'p02n76xf',
169 'ext': 'flv',
170 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
171 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
172 'duration': 3540,
173 },
174 'params': {
175 # rtmp download
176 'skip_download': True,
177 },
b1ea6802 178 'skip': 'geolocation',
25fa8d66
YCH
179 }, {
180 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
181 'info_dict': {
182 'id': 'b05zmgw1',
183 'ext': 'flv',
184 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
185 'title': 'Royal Academy Summer Exhibition',
186 'duration': 3540,
187 },
188 'params': {
189 # rtmp download
190 'skip_download': True,
191 },
b1ea6802 192 'skip': 'geolocation',
54914380
S
193 }, {
194 # iptv-all mediaset fails with geolocation however there is no geo restriction
195 # for this programme at all
5aa535c3 196 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
54914380 197 'info_dict': {
5aa535c3 198 'id': 'b06rkms3',
54914380 199 'ext': 'flv',
5aa535c3
S
200 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
201 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
54914380
S
202 },
203 'params': {
204 # rtmp download
205 'skip_download': True,
206 },
b1ea6802 207 'skip': 'Now it\'s really geo-restricted',
1ac6e794
S
208 }, {
209 # compact player (https://github.com/rg3/youtube-dl/issues/8147)
210 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
211 'info_dict': {
212 'id': 'p028bfkj',
b1ea6802 213 'ext': 'flv',
1ac6e794
S
214 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
215 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
216 },
217 'params': {
b1ea6802 218 # rtmp download
1ac6e794
S
219 'skip_download': True,
220 },
31763975
S
221 }, {
222 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
223 'only_matching': True,
c7e67594
S
224 }, {
225 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
226 'only_matching': True,
0692ef86
S
227 }, {
228 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
229 'only_matching': True,
f20a11ed
S
230 }, {
231 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
232 'only_matching': True,
72d256c4
S
233 }, {
234 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
235 'only_matching': True,
53647dfd
S
236 }, {
237 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
238 'only_matching': True,
72d256c4 239 }]
2e3fd9ec 240
97eb9bd2
RA
241 _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
242
97067db2
S
def _login(self):
    """Sign in to the BBC account service when credentials are available.

    Returns without doing anything when no username is configured.
    Raises ExtractorError when, after posting the form, the final URL still
    contains the sign-in URL (i.e. the sign-in did not go through).
    """
    username, password = self._get_login_info()
    if username is None:
        return

    signin_page = self._download_webpage(
        self._LOGIN_URL, None, 'Downloading signin page')

    # Start from the hidden inputs of the sign-in page, then add credentials.
    form_data = self._hidden_inputs(signin_page)
    form_data['username'] = username
    form_data['password'] = password

    # The form action may be relative; fall back to the sign-in URL itself.
    form_action = self._search_regex(
        r'<form[^>]+action=(["\'])(?P<url>.+?)\1', signin_page,
        'post url', default=self._LOGIN_URL, group='url')
    post_url = urljoin(self._LOGIN_URL, form_action)

    response, urlh = self._download_webpage_handle(
        post_url, None, 'Logging in', data=urlencode_postdata(form_data),
        headers={'Referer': self._LOGIN_URL})

    if self._LOGIN_URL not in urlh.geturl():
        return

    # Still on the sign-in page: surface the page's own message if it has one.
    error = clean_html(get_element_by_class('form-message', response))
    if error:
        raise ExtractorError('Unable to login: %s' % error, expected=True)
    raise ExtractorError('Unable to log in')
272
273 def _real_initialize(self):
274 self._login()
275
d12a1a47
S
class MediaSelectionError(Exception):
    """Error reported by a media selection document.

    The error id is available as ``self.id`` and is also forwarded to
    ``Exception`` so that ``str(e)`` / ``e.args`` are not empty when the
    error is logged or re-raised.
    """

    def __init__(self, id):
        # Explicit base call (not super()) keeps this valid for a nested
        # class on both Python 2 and 3.
        Exception.__init__(self, id)
        self.id = id
279
2e3fd9ec
S
280 def _extract_asx_playlist(self, connection, programme_id):
281 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
282 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
283
2e3fd9ec 284 def _extract_items(self, playlist):
e6174ee9
S
285 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
286
287 def _findall_ns(self, element, xpath):
288 elements = []
289 for ns in self._NAMESPACES:
290 elements.extend(element.findall(xpath % ns))
291 return elements
2e3fd9ec
S
292
293 def _extract_medias(self, media_selection):
e6174ee9
S
294 error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
295 if error is None:
296 media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
c056efa2 297 if error is not None:
d12a1a47 298 raise BBCCoUkIE.MediaSelectionError(error.get('id'))
e6174ee9 299 return self._findall_ns(media_selection, './{%s}media')
2e3fd9ec
S
300
301 def _extract_connections(self, media):
e6174ee9 302 return self._findall_ns(media, './{%s}connection')
2e3fd9ec 303
f13b1e7d 304 def _get_subtitles(self, media, programme_id):
2e3fd9ec
S
305 subtitles = {}
306 for connection in self._extract_connections(media):
307 captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
308 lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
f13b1e7d
JMF
309 subtitles[lang] = [
310 {
311 'url': connection.get('href'),
312 'ext': 'ttml',
313 },
f13b1e7d 314 ]
2e3fd9ec 315 return subtitles
082c6c86 316
d12a1a47
S
def _raise_extractor_error(self, media_selection_error):
    """Re-raise a media selection error as an expected ExtractorError.

    The message names this extractor (``IE_NAME``) and the error id.
    """
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
321
def _download_media_selector(self, programme_id):
    """Try each media selector URL in turn and return the first success.

    A selector failing with one of the ids below is skipped in favour of
    the next URL; any other media selection error is raised immediately.
    If every selector fails, the last skipped error is raised.
    """
    skippable_ids = ('notukerror', 'geolocation', 'selectionunavailable')
    last_exception = None
    for url_template in self._MEDIASELECTOR_URLS:
        try:
            return self._download_media_selector_url(
                url_template % programme_id, programme_id)
        except BBCCoUkIE.MediaSelectionError as e:
            if e.id in skippable_ids:
                last_exception = e
            else:
                self._raise_extractor_error(e)
    self._raise_extractor_error(last_exception)
9afa1770
S
334
335 def _download_media_selector_url(self, url, programme_id=None):
9283d4ea
S
336 media_selection = self._download_xml(
337 url, programme_id, 'Downloading media selection XML',
338 expected_status=(403, 404))
9afa1770 339 return self._process_media_selector(media_selection, programme_id)
082c6c86 340
def _process_media_selector(self, media_selection, programme_id):
    """Convert a media selection document into formats and subtitles.

    Returns a ``(formats, subtitles)`` tuple. ``subtitles`` stays ``None``
    unless a media of kind ``captions`` is present. Media of kinds other
    than ``video``, ``audio`` and ``captions`` are ignored.
    """
    formats = []
    subtitles = None
    seen_hrefs = []

    for media in self._extract_medias(media_selection):
        kind = media.get('kind')
        if kind == 'captions':
            subtitles = self.extract_subtitles(media, programme_id)
            continue
        if kind not in ('video', 'audio'):
            continue

        bitrate = int_or_none(media.get('bitrate'))
        encoding = media.get('encoding')
        service = media.get('service')
        width = int_or_none(media.get('width'))
        height = int_or_none(media.get('height'))
        file_size = int_or_none(media.get('media_file_size'))

        for connection in self._extract_connections(media):
            href = connection.get('href')
            # A given href is processed only once across all media entries.
            if href in seen_hrefs:
                continue
            if href:
                seen_hrefs.append(href)

            conn_kind = connection.get('kind')
            protocol = connection.get('protocol')
            supplier = connection.get('supplier')
            transfer_format = connection.get('transferFormat')

            format_id = supplier or conn_kind or protocol
            if service:
                format_id = '%s_%s' % (service, format_id)

            if supplier == 'asx':
                # ASX playlist: one format per referenced entry.
                asx_refs = self._extract_asx_playlist(connection, programme_id)
                for index, ref in enumerate(asx_refs):
                    formats.append({
                        'url': ref,
                        'format_id': 'ref%s_%s' % (index, format_id),
                    })
                continue

            if transfer_format == 'dash':
                formats.extend(self._extract_mpd_formats(
                    href, programme_id, mpd_id=format_id, fatal=False))
                continue

            if transfer_format == 'hls':
                formats.extend(self._extract_m3u8_formats(
                    href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                    m3u8_id=format_id, fatal=False))
                if re.search(self._USP_RE, href):
                    # Also try the rewritten .ism/.m3u8 manifest, keeping
                    # only variants that are not taller than 720.
                    usp_url = re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href)
                    usp_formats = self._extract_m3u8_formats(
                        usp_url, programme_id, ext='mp4',
                        entry_protocol='m3u8_native',
                        m3u8_id=format_id, fatal=False)
                    formats.extend(
                        f for f in usp_formats
                        if not (f.get('height') and f['height'] > 720))
                continue

            if transfer_format == 'hds':
                formats.extend(self._extract_f4m_formats(
                    href, programme_id, f4m_id=format_id, fatal=False))
                continue

            # Anything else: a single format described by the media itself.
            if not service and not supplier and bitrate:
                format_id += '-%d' % bitrate
            fmt = {
                'format_id': format_id,
                'filesize': file_size,
            }
            if kind == 'video':
                fmt['width'] = width
                fmt['height'] = height
                fmt['tbr'] = bitrate
                fmt['vcodec'] = encoding
            else:
                fmt['abr'] = bitrate
                fmt['acodec'] = encoding
                fmt['vcodec'] = 'none'

            if protocol in ('http', 'https'):
                # Direct link
                fmt['url'] = href
            elif protocol == 'rtmp':
                application = connection.get('application', 'ondemand')
                auth_string = connection.get('authString')
                identifier = connection.get('identifier')
                server = connection.get('server')
                fmt.update({
                    'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                    'play_path': identifier,
                    'app': '%s?%s' % (application, auth_string),
                    'page_url': 'http://www.bbc.co.uk',
                    'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                    'rtmp_live': False,
                    'ext': 'flv',
                })
            else:
                # Unsupported protocol: drop this connection.
                continue
            formats.append(fmt)

    return formats, subtitles
2e3fd9ec 439
ae6986fb
S
def _download_playlist(self, playlist_id):
    """Resolve a programme through playlist.json, else the legacy playlist.

    Returns ``(programme_id, title, description, duration, formats,
    subtitles)``. The legacy XML playlist is used when the JSON has no
    ``defaultAvailableVersion`` or when the JSON request fails with
    HTTP 404; any other ExtractorError is re-raised.
    """
    playlist_url = 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id
    try:
        playlist = self._download_json(
            playlist_url, playlist_id, 'Downloading playlist JSON')

        version = playlist.get('defaultAvailableVersion')
        if version:
            smp_config = version['smpConfig']
            title = smp_config['title']
            description = smp_config['summary']
            for item in smp_config['items']:
                if item['kind'] not in ('programme', 'radioProgramme'):
                    continue
                programme_id = item.get('vpid')
                duration = int_or_none(item.get('duration'))
                formats, subtitles = self._download_media_selector(programme_id)
            # NOTE(review): the return is placed after the loop, so the last
            # matching item wins - confirm against the pristine file.
            return programme_id, title, description, duration, formats, subtitles
    except ExtractorError as ee:
        cause = ee.cause
        not_found = isinstance(cause, compat_HTTPError) and cause.code == 404
        if not not_found:
            raise

    # fallback to legacy playlist
    return self._process_legacy_playlist(playlist_id)
465
466 def _process_legacy_playlist_url(self, url, display_id):
467 playlist = self._download_legacy_playlist_url(url, display_id)
468 return self._extract_from_legacy_playlist(playlist, display_id)
469
470 def _process_legacy_playlist(self, playlist_id):
471 return self._process_legacy_playlist_url(
472 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
473
474 def _download_legacy_playlist_url(self, url, playlist_id=None):
475 return self._download_xml(
476 url, playlist_id, 'Downloading legacy playlist XML')
ae6986fb 477
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme info from a legacy (EMP) playlist document.

    Returns ``(programme_id, title, description, duration, formats,
    subtitles)``. Raises an expected ExtractorError when the playlist has
    a ``noItems`` element, with a message based on its ``reason``.
    """
    emp_ns = self._EMP_PLAYLIST_NS

    no_items = playlist.find('./{%s}noItems' % emp_ns)
    if no_items is not None:
        reason = no_items.get('reason')
        known_reasons = {
            'preAvailability': 'Episode %s is not yet available',
            'postAvailability': 'Episode %s is no longer available',
            'noMedia': 'Episode %s is not currently available',
        }
        if reason in known_reasons:
            msg = known_reasons[reason] % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def pid_from_attributes(element):
        # First of 'identifier' / 'group' that looks like a programme id.
        for attr in ('identifier', 'group'):
            value = element.get(attr)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value

    def get_programme_id(item):
        # NOTE(review): the id found on the item itself is not used - only
        # the mediator's id is returned. Looks unintended; confirm before
        # changing, since it decides which branch below handles the item.
        pid_from_attributes(item)
        mediator = item.find('./{%s}mediator' % emp_ns)
        if mediator is not None:
            return pid_from_attributes(mediator)

    for item in self._extract_items(playlist):
        if item.get('kind') not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % emp_ns).text
        summary_el = playlist.find('./{%s}summary' % emp_ns)
        description = None if summary_el is None else summary_el.text

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No programme id: treat the item itself as the media selection
            # and fall back to the playlist id.
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

    # NOTE(review): returned after the loop, so the last matching item wins -
    # confirm against the pristine file.
    return programme_id, title, description, duration, formats, subtitles
521
c056efa2
S
def _real_extract(self, url):
    """Extract a single programme from a bbc.co.uk page.

    The vpid is looked up in the page's ``mediator.bind(...)`` player JSON,
    then in any ``"vpid": "..."`` occurrence. When none is found, the
    playlist for the id taken from the URL is used instead.
    Raises an expected ExtractorError when the page shows an SMP message.
    """
    group_id = self._match_id(url)
    webpage = self._download_webpage(url, group_id, 'Downloading video page')

    page_error = self._search_regex(
        r'<div\b[^>]+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<',
        webpage, 'error', default=None)
    if page_error:
        raise ExtractorError(page_error, expected=True)

    programme_id = None
    duration = None

    tviplayer = self._search_regex(
        r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
        webpage, 'player', default=None)
    if tviplayer:
        player = self._parse_json(tviplayer, group_id).get('player', {})
        duration = int_or_none(player.get('duration'))
        programme_id = player.get('vpid')

    if not programme_id:
        vpid_re = r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX
        programme_id = self._search_regex(
            vpid_re, webpage, 'vpid', fatal=False, default=None)

    if not programme_id:
        (programme_id, title, description,
         duration, formats, subtitles) = self._download_playlist(group_id)
    else:
        formats, subtitles = self._download_media_selector(programme_id)
        title = self._og_search_title(webpage, default=None)
        if not title:
            title = self._html_search_regex(
                (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
                 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'),
                webpage, 'title')
        description = self._search_regex(
            (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
             r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
            webpage, 'description', default=None)
        if not description:
            description = self._html_search_meta('description', webpage)

    self._sort_formats(formats)

    info = {
        'id': programme_id,
        'title': title,
        'description': description,
        'thumbnail': self._og_search_thumbnail(webpage, default=None),
        'duration': duration,
        'formats': formats,
        'subtitles': subtitles,
    }
    return info
10273d6e 574
575
9afa1770
S
576class BBCIE(BBCCoUkIE):
577 IE_NAME = 'bbc'
578 IE_DESC = 'BBC'
579 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
10273d6e 580
d12a1a47 581 _MEDIASELECTOR_URLS = [
55ebae26
S
582 # Provides HQ HLS streams but fails with geolocation in some cases when it's
583 # even not geo restricted at all
584 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
d12a1a47
S
585 # Provides more formats, namely direct mp4 links, but fails on some videos with
586 # notukerror for non UK (?) users (e.g.
587 # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
588 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
589 # Provides fewer formats, but works everywhere for everybody (hopefully)
590 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
591 ]
10273d6e 592
593 _TESTS = [{
6a747190 594 # article with multiple videos embedded with data-playable containing vpids
10273d6e 595 'url': 'http://www.bbc.com/news/world-europe-32668511',
596 'info_dict': {
597 'id': 'world-europe-32668511',
598 'title': 'Russia stages massive WW2 parade despite Western boycott',
9afa1770 599 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
10273d6e 600 },
601 'playlist_count': 2,
a3bfddfa 602 }, {
6a747190 603 # article with multiple videos embedded with data-playable (more videos)
10273d6e 604 'url': 'http://www.bbc.com/news/business-28299555',
605 'info_dict': {
606 'id': 'business-28299555',
607 'title': 'Farnborough Airshow: Video highlights',
9afa1770 608 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
10273d6e 609 },
610 'playlist_count': 9,
9afa1770 611 'skip': 'Save time',
88ed52ae
S
612 }, {
613 # article with multiple videos embedded with `new SMP()`
6a747190 614 # broken
88ed52ae
S
615 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
616 'info_dict': {
617 'id': '3662a707-0af9-3149-963f-47bea720b460',
b7d7674f 618 'title': 'BUGGER',
88ed52ae
S
619 },
620 'playlist_count': 18,
a3bfddfa 621 }, {
6a747190 622 # single video embedded with data-playable containing vpid
10273d6e 623 'url': 'http://www.bbc.com/news/world-europe-32041533',
10273d6e 624 'info_dict': {
625 'id': 'p02mprgb',
55ebae26 626 'ext': 'mp4',
10273d6e 627 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
55ebae26 628 'description': 'md5:2868290467291b37feda7863f7a83f54',
10273d6e 629 'duration': 47,
9afa1770 630 'timestamp': 1427219242,
da92eeae 631 'upload_date': '20150324',
10273d6e 632 },
633 'params': {
9afa1770 634 # rtmp download
10273d6e 635 'skip_download': True,
636 }
a3bfddfa 637 }, {
6a747190
S
638 # article with single video embedded with data-playable containing XML playlist
639 # with direct video links as progressiveDownloadUrl (for now these are extracted)
640 # and playlist with f4m and m3u8 as streamingUrl
de939d89 641 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
de939d89 642 'info_dict': {
9afa1770 643 'id': '150615_telabyad_kentin_cogu',
de939d89 644 'ext': 'mp4',
ad152e2d 645 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
05087d1b 646 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
9afa1770 647 'timestamp': 1434397334,
da92eeae 648 'upload_date': '20150615',
de939d89 649 },
650 'params': {
651 'skip_download': True,
652 }
c936d8cc 653 }, {
6a747190 654 # single video embedded with data-playable containing XML playlists (regional section)
de939d89 655 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 656 'info_dict': {
9afa1770 657 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
de939d89 658 'ext': 'mp4',
9afa1770 659 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
05087d1b 660 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
9afa1770 661 'timestamp': 1434713142,
da92eeae 662 'upload_date': '20150619',
de939d89 663 },
664 'params': {
665 'skip_download': True,
666 }
a346b1ff
S
667 }, {
668 # single video from video playlist embedded with vxp-playlist-data JSON
669 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
670 'info_dict': {
671 'id': 'p02w6qjc',
55ebae26 672 'ext': 'mp4',
a346b1ff
S
673 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
674 'duration': 56,
0bc4ee60 675 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
a346b1ff
S
676 },
677 'params': {
678 'skip_download': True,
679 }
9afa1770
S
680 }, {
681 # single video story with digitalData
682 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
683 'info_dict': {
684 'id': 'p02q6gc4',
685 'ext': 'flv',
686 'title': 'Sri Lanka’s spicy secret',
687 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
688 'timestamp': 1437674293,
689 'upload_date': '20150723',
690 },
691 'params': {
692 # rtmp download
693 'skip_download': True,
694 }
695 }, {
696 # single video story without digitalData
697 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
698 'info_dict': {
699 'id': 'p018zqqg',
55ebae26 700 'ext': 'mp4',
9afa1770
S
701 'title': 'Hyundai Santa Fe Sport: Rock star',
702 'description': 'md5:b042a26142c4154a6e472933cf20793d',
ae8bdfd1
S
703 'timestamp': 1415867444,
704 'upload_date': '20141113',
9afa1770
S
705 },
706 'params': {
707 # rtmp download
708 'skip_download': True,
709 }
9fb64c04
S
710 }, {
711 # single video embedded with Morph
712 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
713 'info_dict': {
714 'id': 'p041vhd0',
715 'ext': 'mp4',
716 'title': "Nigeria v Japan - Men's First Round",
717 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
718 'duration': 7980,
719 'uploader': 'BBC Sport',
720 'uploader_id': 'bbc_sport',
721 },
722 'params': {
723 # m3u8 download
724 'skip_download': True,
9fb64c04
S
725 },
726 'skip': 'Georestricted to UK',
9afa1770 727 }, {
6a747190 728 # single video with playlist.sxml URL in playlist param
9afa1770
S
729 'url': 'http://www.bbc.com/sport/0/football/33653409',
730 'info_dict': {
731 'id': 'p02xycnp',
55ebae26 732 'ext': 'mp4',
9afa1770 733 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
7033bc1a 734 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
9afa1770
S
735 'duration': 140,
736 },
737 'params': {
738 # rtmp download
739 'skip_download': True,
740 }
b5d48cb1 741 }, {
6a747190 742 # article with multiple videos embedded with playlist.sxml in playlist param
b5d48cb1
S
743 'url': 'http://www.bbc.com/sport/0/football/34475836',
744 'info_dict': {
745 'id': '34475836',
450b233c 746 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
8c65e4a5 747 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
b5d48cb1
S
748 },
749 'playlist_count': 3,
450b233c
S
750 }, {
751 # school report article with single video
752 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
753 'info_dict': {
754 'id': '35744779',
755 'title': 'School which breaks down barriers in Jerusalem',
756 },
757 'playlist_count': 1,
9afa1770
S
758 }, {
759 # single video with playlist URL from weather section
760 'url': 'http://www.bbc.com/weather/features/33601775',
761 'only_matching': True,
762 }, {
763 # custom redirection to www.bbc.com
764 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
765 'only_matching': True,
a1cf3e38
S
766 }, {
767 # single video article embedded with data-media-vpid
768 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
769 'only_matching': True,
6d155707
S
770 }, {
771 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
772 'info_dict': {
773 'id': 'p06556y7',
774 'ext': 'mp4',
775 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
776 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd',
777 },
778 'params': {
779 'skip_download': True,
780 }
10273d6e 781 }]
782
9afa1770
S
@classmethod
def suitable(cls, url):
    """Claim *url* only when none of the more specific BBC extractors do."""
    more_specific = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
    for ie in more_specific:
        if ie.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
9afa1770
S
788
def _extract_from_media_meta(self, media_meta, video_id):
    """Return ``(formats, subtitles)`` for one media metadata dict.

    Sources are tried in this order: direct ``sourceFiles`` links, the
    media selector for ``externalId``, then the legacy playlist at
    ``href``. Returns two empty lists when none of them is present.
    """
    # Direct links to media in media metadata (e.g.
    # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
    # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
    source_files = media_meta.get('sourceFiles')
    if source_files:
        formats = []
        for format_id, source in source_files.items():
            if not source.get('url'):
                continue
            formats.append({
                'url': source['url'],
                'format_id': format_id,
                'ext': source.get('encoding'),
                'tbr': float_or_none(source.get('bitrate'), 1000),
                'filesize': int_or_none(source.get('filesize')),
            })
        return formats, []

    programme_id = media_meta.get('externalId')
    if programme_id:
        return self._download_media_selector(programme_id)

    # Process playlist.sxml as legacy playlist
    href = media_meta.get('href')
    if href:
        playlist = self._download_legacy_playlist_url(href)
        legacy_info = self._extract_from_legacy_playlist(playlist, video_id)
        _, _, _, _, formats, subtitles = legacy_info
        return formats, subtitles

    return [], []
815
baf39a1a
S
816 def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
817 programme_id, title, description, duration, formats, subtitles = \
818 self._process_legacy_playlist_url(url, playlist_id)
819 self._sort_formats(formats)
820 return {
821 'id': programme_id,
822 'title': title,
823 'description': description,
824 'duration': duration,
825 'timestamp': timestamp,
826 'formats': formats,
827 'subtitles': subtitles,
828 }
829
def _real_extract(self, url):
    """Extract a single video or a playlist of videos from a BBC page.

    The downloaded page is probed for each known embedding flavour in turn
    and the first one that yields media is returned:

    1. ``playlist.sxml`` URLs and ``data-playable`` attributes (playlist);
    2. a single vpid in the markup (single video);
    3. a ``Morph.setPayload`` call (single video);
    4. a ``bbcthreeConfig`` object (playlist);
    5. ``new SMP(...)`` / ``setPlaylist(...)`` embed URLs (playlist of
       URL results);
    6. ``data-media-meta`` attributes, ``mediaAssetPage.init`` data or the
       ``vxp-playlist-data`` script (playlist).

    Returns an info dict for a single video or a playlist result.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title')
    if not playlist_title:
        playlist_title = self._og_search_title(
            webpage, default=None) or self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
        if playlist_title:
            # Drop the trailing " - BBC ..." site suffix
            playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
    for _, data_playable_json in data_playables:
        data_playable = self._parse_json(
            unescapeHTML(data_playable_json), playlist_id, fatal=False)
        if not data_playable:
            continue
        settings = data_playable.get('settings', {})
        if not settings:
            continue
        # data-playable with video vpid in settings.playlistObject.items (e.g.
        # http://www.bbc.com/news/world-us-canada-34473351)
        playlist_object = settings.get('playlistObject', {})
        if playlist_object:
            items = playlist_object.get('items')
            if items and isinstance(items, list):
                title = playlist_object['title']
                description = playlist_object.get('summary')
                duration = int_or_none(items[0].get('duration'))
                programme_id = items[0].get('vpid')
                formats, subtitles = self._download_media_selector(programme_id)
                self._sort_formats(formats)
                entries.append({
                    'id': programme_id,
                    'title': title,
                    'description': description,
                    'timestamp': timestamp,
                    'duration': duration,
                    'formats': formats,
                    'subtitles': subtitles,
                })
        else:
            # data-playable without vpid but with a playlist.sxml URLs
            # in otherSettings.playlist (e.g.
            # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
            playlist = data_playable.get('otherSettings', {}).get('playlist', {})
            if playlist:
                entry = None
                for key in ('streaming', 'progressiveDownload'):
                    playlist_url = playlist.get('%sUrl' % key)
                    if not playlist_url:
                        continue
                    try:
                        info = self._extract_from_playlist_sxml(
                            playlist_url, playlist_id, timestamp)
                        if not entry:
                            entry = info
                        else:
                            # Merge the second variant into the first entry
                            entry['title'] = info['title']
                            entry['formats'].extend(info['formats'])
                    except ExtractorError as e:
                        # Only ExtractorError is caught: it is the exception
                        # type that carries a `cause`, so unrelated errors
                        # now propagate untouched instead of being masked by
                        # an AttributeError on `e.cause`.
                        # Some playlist URL may fail with 500, at the same time
                        # the other one may work fine (e.g.
                        # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                        if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
                            continue
                        raise
                if entry:
                    self._sort_formats(entry['formats'])
                    entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        self._sort_formats(formats)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # A non-fatal parse yields a falsy result on malformed JSON, hence
        # the `or {}` so the lookups below never hit None.
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = digital_data.get('page', {}).get('pageInfo', {})
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # There are several setPayload calls may be present but the video
    # seems to be always related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            # The first component with a playable id wins
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    bbc3_config = self._parse_json(
        self._search_regex(
            r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
            'bbcthree config', default='{}'),
        playlist_id, transform_source=js_to_json, fatal=False)
    if bbc3_config:
        bbc3_playlist = try_get(
            bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'],
            dict)
        if bbc3_playlist:
            playlist_title = bbc3_playlist.get('title') or playlist_title
            thumbnail = bbc3_playlist.get('holdingImageURL')
            entries = []
            for bbc3_item in bbc3_playlist['items']:
                programme_id = bbc3_item.get('versionID')
                if not programme_id:
                    continue
                formats, subtitles = self._download_media_selector(programme_id)
                self._sort_formats(formats)
                entries.append({
                    'id': programme_id,
                    'title': playlist_title,
                    'thumbnail': thumbnail,
                    'timestamp': timestamp,
                    'formats': formats,
                    'subtitles': subtitles,
                })
            return self.playlist_result(
                entries, playlist_id, playlist_title, playlist_description)

    def extract_all(pattern):
        # Parse every match of `pattern` as JSON, dropping unparsable/empty ones
        parsed = (
            self._parse_json(s, playlist_id, fatal=False)
            for s in re.findall(pattern, webpage))
        return [item for item in parsed if item]

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # Non-fatal parse may yield a falsy result; fall back to an empty
            # dict so the page is treated as carrying no videos.
            media_asset_page = self._parse_json(
                media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in media_asset_page.get('videos', {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats:
            continue
        self._sort_formats(formats)

        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
a65402ef
YCH
1129
1130
class BBCCoUkArticleIE(InfoExtractor):
    """Extractor for bbc.co.uk programme articles.

    An article page is turned into a playlist with one URL result per clip
    referenced in the page markup.
    """
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist of the clips embedded in the article at ``url``."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # NOTE(review): the og:description lookup appears to be non-fatal
        # (other extractors in this file treat it as optional), so guard
        # against a missing value before stripping.
        description = self._og_search_description(webpage)
        if description:
            description = description.strip()

        # Each clip is marked up as <div typeof="Clip" resource="URL">
        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
ded7511a
S
1159
1160
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared logic for paginated bbc.co.uk playlist pages.

    Subclasses are expected to provide:

    * ``_VIDEO_ID_TEMPLATE`` - regex template with one ``%s`` slot (filled
      with ``BBCCoUkIE._ID_REGEX``) capturing video ids in the page;
    * ``_URL_TEMPLATE`` - URL template with one ``%s`` slot for a video id;
    * ``_extract_title_and_description(webpage)`` - returning a
      ``(title, description)`` pair.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a URL result for every video id found, following pagination.

        When ``url`` carries an explicit ``page`` query parameter only that
        page is processed; otherwise "next" links are followed until none
        is found.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # Loop-invariant, so build the pattern once
        video_id_pattern = self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX
        # The first page is already downloaded, so numbering of the pages
        # fetched here starts at 2.
        for page_num in itertools.count(2):
            for video_id in re.findall(video_id_pattern, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # The fourth positional argument is the error note; pass a real
            # message rather than the bare page number.
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                'Downloading page %d' % page_num,
                'Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Return a playlist result for the playlist page at ``url``."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
ded7511a
S
1191
1192
class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for iPlayer ``episodes`` and ``group`` pages."""
    IE_NAME = 'bbc.co.uk:iplayer:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
    # Templates consumed by the base class when building the entries
    _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
    _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
    _TESTS = [
        {
            'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
            'info_dict': {
                'id': 'b05rcz9v',
                'title': 'The Disappearance',
                'description': 'French thriller serial about a missing teenager.',
            },
            'playlist_mincount': 6,
            'skip': 'This programme is not currently available on BBC iPlayer',
        },
        {
            # Available for over a year unlike 30 days for most other programmes
            'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
            'info_dict': {
                'id': 'p02tcc32',
                'title': 'Bohemian Icons',
                'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
            },
            'playlist_mincount': 10,
        },
    ]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` taken from the page's <h1> and
        its "subtitle" paragraph; both lookups are non-fatal."""
        return (
            self._search_regex(r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False),
            self._search_regex(
                r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
                webpage, 'description', fatal=False, group='value'),
        )
1224
1225
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for programme ``episodes``, ``broadcasts`` and
    ``clips`` listings."""
    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Templates consumed by the base class when building the entries
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [
        {
            'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
            'info_dict': {
                'id': 'b05rcz9v',
                'title': 'The Disappearance - Clips - BBC Four',
                'description': 'French thriller serial about a missing teenager.',
            },
            'playlist_mincount': 7,
        },
        {
            # multipage playlist, explicit page
            'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
            'info_dict': {
                'id': 'b00mfl7n',
                'title': 'Frozen Planet - Clips - BBC One',
                'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
            },
            'playlist_mincount': 24,
        },
        {
            # multipage playlist, all pages
            'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
            'info_dict': {
                'id': 'b00mfl7n',
                'title': 'Frozen Planet - Clips - BBC One',
                'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
            },
            'playlist_mincount': 142,
        },
        {
            'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
            'only_matching': True,
        },
        {
            'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
            'only_matching': True,
        },
        {
            'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
            'only_matching': True,
        },
    ]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` read from the page's Open Graph
        metadata; the title lookup is explicitly non-fatal."""
        og_title = self._og_search_title(webpage, fatal=False)
        og_description = self._og_search_description(webpage)
        return og_title, og_description