]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/bbc.py
[extractors] Use new framework for existing embeds (#4307)
[yt-dlp.git] / yt_dlp / extractor / bbc.py
1 import functools
2 import itertools
3 import json
4 import re
5 import urllib.error
6 import xml.etree.ElementTree
7
8 from .common import InfoExtractor
9 from ..compat import compat_HTTPError, compat_str, compat_urlparse
10 from ..utils import (
11 ExtractorError,
12 OnDemandPagedList,
13 clean_html,
14 dict_get,
15 float_or_none,
16 get_element_by_class,
17 int_or_none,
18 js_to_json,
19 parse_duration,
20 parse_iso8601,
21 parse_qs,
22 strip_or_none,
23 try_get,
24 unescapeHTML,
25 unified_timestamp,
26 url_or_none,
27 urlencode_postdata,
28 urljoin,
29 )
30
31
32 class BBCCoUkIE(InfoExtractor):
33 IE_NAME = 'bbc.co.uk'
34 IE_DESC = 'BBC iPlayer'
35 _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
36 _VALID_URL = r'''(?x)
37 https?://
38 (?:www\.)?bbc\.co\.uk/
39 (?:
40 programmes/(?!articles/)|
41 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
42 music/(?:clips|audiovideo/popular)[/#]|
43 radio/player/|
44 sounds/play/|
45 events/[^/]+/play/[^/]+/
46 )
47 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
48 ''' % _ID_REGEX
49 _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']
50
51 _LOGIN_URL = 'https://account.bbc.com/signin'
52 _NETRC_MACHINE = 'bbc'
53
54 _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
55 _MEDIA_SETS = [
56 # Provides HQ HLS streams with even better quality than the pc mediaset but fails
57 # with geolocation in some cases even when the content is not geo restricted at all
58 # (e.g. http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
59 'iptv-all',
60 'pc',
61 ]
62
63 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
64
65 _TESTS = [
66 {
67 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
68 'info_dict': {
69 'id': 'b039d07m',
70 'ext': 'flv',
71 'title': 'Kaleidoscope, Leonard Cohen',
72 'description': 'The Canadian poet and songwriter reflects on his musical career.',
73 },
74 'params': {
75 # rtmp download
76 'skip_download': True,
77 }
78 },
79 {
80 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
81 'info_dict': {
82 'id': 'b00yng1d',
83 'ext': 'flv',
84 'title': 'The Man in Black: Series 3: The Printed Name',
85 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
86 'duration': 1800,
87 },
88 'params': {
89 # rtmp download
90 'skip_download': True,
91 },
92 'skip': 'Episode is no longer available on BBC iPlayer Radio',
93 },
94 {
95 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
96 'info_dict': {
97 'id': 'b00yng1d',
98 'ext': 'flv',
99 'title': 'The Voice UK: Series 3: Blind Auditions 5',
100 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
101 'duration': 5100,
102 },
103 'params': {
104 # rtmp download
105 'skip_download': True,
106 },
107 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
108 },
109 {
110 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
111 'info_dict': {
112 'id': 'b03k3pb7',
113 'ext': 'flv',
114 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
115 'description': '2. Invasion',
116 'duration': 3600,
117 },
118 'params': {
119 # rtmp download
120 'skip_download': True,
121 },
122 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
123 }, {
124 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
125 'info_dict': {
126 'id': 'b04v209v',
127 'ext': 'flv',
128 'title': 'Pete Tong, The Essential New Tune Special',
129 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
130 'duration': 10800,
131 },
132 'params': {
133 # rtmp download
134 'skip_download': True,
135 },
136 'skip': 'Episode is no longer available on BBC iPlayer Radio',
137 }, {
138 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
139 'note': 'Audio',
140 'info_dict': {
141 'id': 'p022h44j',
142 'ext': 'flv',
143 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
144 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
145 'duration': 227,
146 },
147 'params': {
148 # rtmp download
149 'skip_download': True,
150 }
151 }, {
152 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
153 'note': 'Video',
154 'info_dict': {
155 'id': 'p025c103',
156 'ext': 'flv',
157 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
158 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
159 'duration': 226,
160 },
161 'params': {
162 # rtmp download
163 'skip_download': True,
164 }
165 }, {
166 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
167 'info_dict': {
168 'id': 'p02n76xf',
169 'ext': 'flv',
170 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
171 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
172 'duration': 3540,
173 },
174 'params': {
175 # rtmp download
176 'skip_download': True,
177 },
178 'skip': 'geolocation',
179 }, {
180 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
181 'info_dict': {
182 'id': 'b05zmgw1',
183 'ext': 'flv',
184 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
185 'title': 'Royal Academy Summer Exhibition',
186 'duration': 3540,
187 },
188 'params': {
189 # rtmp download
190 'skip_download': True,
191 },
192 'skip': 'geolocation',
193 }, {
194 # iptv-all mediaset fails with geolocation however there is no geo restriction
195 # for this programme at all
196 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
197 'info_dict': {
198 'id': 'b06rkms3',
199 'ext': 'flv',
200 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
201 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
202 },
203 'params': {
204 # rtmp download
205 'skip_download': True,
206 },
207 'skip': 'Now it\'s really geo-restricted',
208 }, {
209 # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
210 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
211 'info_dict': {
212 'id': 'p028bfkj',
213 'ext': 'flv',
214 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
215 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
216 },
217 'params': {
218 # rtmp download
219 'skip_download': True,
220 },
221 }, {
222 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
223 'note': 'Audio',
224 'info_dict': {
225 'id': 'm0007jz9',
226 'ext': 'mp4',
227 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
228 'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
229 'duration': 9840,
230 },
231 'params': {
232 # rtmp download
233 'skip_download': True,
234 }
235 }, {
236 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
237 'only_matching': True,
238 }, {
239 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
240 'only_matching': True,
241 }, {
242 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
243 'only_matching': True,
244 }, {
245 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
246 'only_matching': True,
247 }, {
248 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
249 'only_matching': True,
250 }, {
251 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
252 'only_matching': True,
253 }, {
254 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
255 'only_matching': True,
256 }, {
257 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
258 'only_matching': True,
259 }]
260
def _perform_login(self, username, password):
    """Sign in to the BBC account service with *username* / *password*.

    The sign-in page is fetched first so that its hidden form inputs can be
    posted back together with the credentials. The login counts as failed
    when the final URL of the POST still contains the sign-in URL.

    Raises ExtractorError on failure; the error is marked as expected when
    the response page carries a ``form-message`` element to report.
    """
    signin_url = self._LOGIN_URL
    signin_page = self._download_webpage(
        signin_url, None, 'Downloading signin page')

    # Start from the hidden inputs of the form and add the credentials
    form_fields = self._hidden_inputs(signin_page)
    form_fields['username'] = username
    form_fields['password'] = password

    # The form action may be relative; fall back to the sign-in URL itself
    form_action = self._search_regex(
        r'<form[^>]+action=(["\'])(?P<url>.+?)\1', signin_page,
        'post url', default=signin_url, group='url')
    target_url = urljoin(signin_url, form_action)

    response, urlh = self._download_webpage_handle(
        target_url, None, 'Logging in',
        data=urlencode_postdata(form_fields),
        headers={'Referer': signin_url})

    if signin_url not in urlh.geturl():
        return

    # Still on the sign-in page: report the message shown there, if any
    failure = clean_html(get_element_by_class('form-message', response))
    if failure:
        raise ExtractorError('Unable to login: %s' % failure, expected=True)
    raise ExtractorError('Unable to log in')
286
class MediaSelectionError(Exception):
    """Error reported by the media selector.

    ``id`` holds the value passed to the constructor; elsewhere in this
    extractor that is the ``result`` field of a media selector response.
    """

    def __init__(self, id):
        # The parameter keeps its original name (it shadows the builtin) so
        # that the constructor signature stays unchanged for callers.
        self.id = id
290
291 def _extract_asx_playlist(self, connection, programme_id):
292 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
293 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
294
295 def _extract_items(self, playlist):
296 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
297
298 def _extract_medias(self, media_selection):
299 error = media_selection.get('result')
300 if error:
301 raise BBCCoUkIE.MediaSelectionError(error)
302 return media_selection.get('media') or []
303
304 def _extract_connections(self, media):
305 return media.get('connection') or []
306
307 def _get_subtitles(self, media, programme_id):
308 subtitles = {}
309 for connection in self._extract_connections(media):
310 cc_url = url_or_none(connection.get('href'))
311 if not cc_url:
312 continue
313 captions = self._download_xml(
314 cc_url, programme_id, 'Downloading captions', fatal=False)
315 if not isinstance(captions, xml.etree.ElementTree.Element):
316 continue
317 subtitles['en'] = [
318 {
319 'url': connection.get('href'),
320 'ext': 'ttml',
321 },
322 ]
323 break
324 return subtitles
325
def _raise_extractor_error(self, media_selection_error):
    """Re-raise a media selection error as an expected ExtractorError naming this extractor."""
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
330
331 def _download_media_selector(self, programme_id):
332 last_exception = None
333 for media_set in self._MEDIA_SETS:
334 try:
335 return self._download_media_selector_url(
336 self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
337 except BBCCoUkIE.MediaSelectionError as e:
338 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
339 last_exception = e
340 continue
341 self._raise_extractor_error(e)
342 self._raise_extractor_error(last_exception)
343
344 def _download_media_selector_url(self, url, programme_id=None):
345 media_selection = self._download_json(
346 url, programme_id, 'Downloading media selection JSON',
347 expected_status=(403, 404))
348 return self._process_media_selector(media_selection, programme_id)
349
350 def _process_media_selector(self, media_selection, programme_id):
351 formats = []
352 subtitles = None
353 urls = []
354
355 for media in self._extract_medias(media_selection):
356 kind = media.get('kind')
357 if kind in ('video', 'audio'):
358 bitrate = int_or_none(media.get('bitrate'))
359 encoding = media.get('encoding')
360 width = int_or_none(media.get('width'))
361 height = int_or_none(media.get('height'))
362 file_size = int_or_none(media.get('media_file_size'))
363 for connection in self._extract_connections(media):
364 href = connection.get('href')
365 if href in urls:
366 continue
367 if href:
368 urls.append(href)
369 conn_kind = connection.get('kind')
370 protocol = connection.get('protocol')
371 supplier = connection.get('supplier')
372 transfer_format = connection.get('transferFormat')
373 format_id = supplier or conn_kind or protocol
374 # ASX playlist
375 if supplier == 'asx':
376 for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
377 formats.append({
378 'url': ref,
379 'format_id': 'ref%s_%s' % (i, format_id),
380 })
381 elif transfer_format == 'dash':
382 formats.extend(self._extract_mpd_formats(
383 href, programme_id, mpd_id=format_id, fatal=False))
384 elif transfer_format == 'hls':
385 # TODO: let expected_status be passed into _extract_xxx_formats() instead
386 try:
387 fmts = self._extract_m3u8_formats(
388 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
389 m3u8_id=format_id, fatal=False)
390 except ExtractorError as e:
391 if not (isinstance(e.exc_info[1], urllib.error.HTTPError)
392 and e.exc_info[1].code in (403, 404)):
393 raise
394 fmts = []
395 formats.extend(fmts)
396 elif transfer_format == 'hds':
397 formats.extend(self._extract_f4m_formats(
398 href, programme_id, f4m_id=format_id, fatal=False))
399 else:
400 if not supplier and bitrate:
401 format_id += '-%d' % bitrate
402 fmt = {
403 'format_id': format_id,
404 'filesize': file_size,
405 }
406 if kind == 'video':
407 fmt.update({
408 'width': width,
409 'height': height,
410 'tbr': bitrate,
411 'vcodec': encoding,
412 })
413 else:
414 fmt.update({
415 'abr': bitrate,
416 'acodec': encoding,
417 'vcodec': 'none',
418 })
419 if protocol in ('http', 'https'):
420 # Direct link
421 fmt.update({
422 'url': href,
423 })
424 elif protocol == 'rtmp':
425 application = connection.get('application', 'ondemand')
426 auth_string = connection.get('authString')
427 identifier = connection.get('identifier')
428 server = connection.get('server')
429 fmt.update({
430 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
431 'play_path': identifier,
432 'app': '%s?%s' % (application, auth_string),
433 'page_url': 'http://www.bbc.co.uk',
434 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
435 'rtmp_live': False,
436 'ext': 'flv',
437 })
438 else:
439 continue
440 formats.append(fmt)
441 elif kind == 'captions':
442 subtitles = self.extract_subtitles(media, programme_id)
443 return formats, subtitles
444
def _download_playlist(self, playlist_id):
    """Resolve *playlist_id* through ``/programmes/<id>/playlist.json``.

    Formats and subtitles of every playable item (kind ``programme`` or
    ``radioProgramme``) of every available version are merged. Each format
    is annotated with the version's types, and formats of versions whose
    types mention ``AudioDescribed`` get a ``language_preference`` of -10.
    The id, title, description and duration returned belong to the last
    playable item seen.

    If a download made here fails with an HTTP 404, the legacy XML playlist
    is used instead.

    Returns ``(programme_id, title, description, duration, formats, subtitles)``.
    Raises ExtractorError when the playlist holds no playable item (this
    case used to surface as an UnboundLocalError from the return statement).
    """
    try:
        playlist = self._download_json(
            'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
            playlist_id, 'Downloading playlist JSON')
        formats = []
        subtitles = {}
        programme_id = title = description = duration = None
        found_playable_item = False

        for version in playlist.get('allAvailableVersions', []):
            smp_config = version['smpConfig']
            title = smp_config['title']
            description = smp_config['summary']
            for item in smp_config['items']:
                kind = item['kind']
                if kind not in ('programme', 'radioProgramme'):
                    continue
                found_playable_item = True
                programme_id = item.get('vpid')
                duration = int_or_none(item.get('duration'))
                version_formats, version_subtitles = self._download_media_selector(programme_id)
                types = version['types']
                format_note = ', '.join(types)
                audio_described = any('AudioDescribed' in x for x in types)
                for f in version_formats:
                    f['format_note'] = format_note
                    if audio_described:
                        f['language_preference'] = -10
                formats += version_formats
                for tag, subformats in (version_subtitles or {}).items():
                    subtitles.setdefault(tag, []).extend(subformats)

        if not found_playable_item:
            # Without this guard the names returned below would be unbound
            # (or meaningless) and the caller would crash instead of
            # getting a readable error. It carries no HTTP cause, so the
            # handler below re-raises it unchanged.
            raise ExtractorError(
                'No playable programme found in playlist %s' % playlist_id)

        return programme_id, title, description, duration, formats, subtitles
    except ExtractorError as ee:
        if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
            raise

    # fallback to legacy playlist
    return self._process_legacy_playlist(playlist_id)
480
481 def _process_legacy_playlist_url(self, url, display_id):
482 playlist = self._download_legacy_playlist_url(url, display_id)
483 return self._extract_from_legacy_playlist(playlist, display_id)
484
485 def _process_legacy_playlist(self, playlist_id):
486 return self._process_legacy_playlist_url(
487 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
488
489 def _download_legacy_playlist_url(self, url, playlist_id=None):
490 return self._download_xml(
491 url, playlist_id, 'Downloading legacy playlist XML')
492
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract media information from a legacy (EMP XML) playlist.

    A ``noItems`` element means the episode is unavailable and is reported
    as an expected ExtractorError whose message depends on its ``reason``.

    Otherwise every ``programme`` / ``radioProgramme`` item is processed:
    when a programme id can be read from the item, the media selector is
    queried for it; if not, the item itself is passed to
    _process_media_selector() and *playlist_id* is used as the id. The
    values of the last playable item are the ones returned.

    Returns ``(programme_id, title, description, duration, formats, subtitles)``.
    Raises ExtractorError when the playlist has no playable item (this case
    used to surface as an UnboundLocalError from the return statement).
    """
    ns = self._EMP_PLAYLIST_NS

    no_items = playlist.find('./{%s}noItems' % ns)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First of the two attributes that looks like a programme id
        for p in ('identifier', 'group'):
            value = element.get(p)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value
        return None

    def get_programme_id(item):
        # The mediator child keeps precedence, so every id the previous
        # code resolved is resolved identically. The item's own attributes
        # used to be looked up and the result thrown away; they now serve
        # as the fallback.
        mediator = item.find('./{%s}mediator' % ns)
        if mediator is not None:
            from_mediator = get_from_attributes(mediator)
            if from_mediator:
                return from_mediator
        return get_from_attributes(item)

    found_playable_item = False
    for item in self._extract_items(playlist):
        kind = item.get('kind')
        if kind not in ('programme', 'radioProgramme'):
            continue
        found_playable_item = True
        title = playlist.find('./{%s}title' % ns).text
        description_el = playlist.find('./{%s}summary' % ns)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

    if not found_playable_item:
        # Nothing was assigned above; fail with a readable error instead of
        # crashing on unbound names in the return statement
        raise ExtractorError(
            'No playable items found in playlist %s' % playlist_id)

    return programme_id, title, description, duration, formats, subtitles
536
def _real_extract(self, url):
    """Extract a single programme from a bbc.co.uk page.

    An error message displayed by the page's player is raised as an
    expected ExtractorError. The programme's vpid is taken from the
    ``mediator.bind(...)`` player configuration or, failing that, from the
    first ``"vpid"`` found in the page; with a vpid the media selector is
    queried and title/description are scraped from the page. Without one,
    everything comes from the playlist of the id found in the URL.
    """
    group_id = self._match_id(url)
    webpage = self._download_webpage(url, group_id, 'Downloading video page')

    page_error = self._search_regex(
        r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
        webpage, 'error', default=None)
    if page_error:
        raise ExtractorError(page_error, expected=True)

    programme_id = None
    duration = None

    player_config = self._search_regex(
        r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
        webpage, 'player', default=None)
    if player_config:
        player = self._parse_json(player_config, group_id).get('player', {})
        duration = int_or_none(player.get('duration'))
        programme_id = player.get('vpid')

    if not programme_id:
        programme_id = self._search_regex(
            r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)

    if not programme_id:
        # No vpid in the page: let the playlist provide all the metadata
        (programme_id, title, description,
         duration, formats, subtitles) = self._download_playlist(group_id)
    else:
        formats, subtitles = self._download_media_selector(programme_id)
        title = self._og_search_title(webpage, default=None)
        if not title:
            title = self._html_search_regex(
                (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
                 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
        description = self._search_regex(
            (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
             r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
            webpage, 'description', default=None)
        if not description:
            description = self._html_search_meta('description', webpage)

    self._sort_formats(formats)

    info = {
        'id': programme_id,
        'title': title,
        'description': description,
        'thumbnail': self._og_search_thumbnail(webpage, default=None),
        'duration': duration,
        'formats': formats,
        'subtitles': subtitles,
    }
    return info
589
590
591 class BBCIE(BBCCoUkIE):
592 IE_NAME = 'bbc'
593 IE_DESC = 'BBC'
594 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
595
596 _MEDIA_SETS = [
597 'pc',
598 'mobile-tablet-main',
599 ]
600
601 _TESTS = [{
602 # article with multiple videos embedded with data-playable containing vpids
603 'url': 'http://www.bbc.com/news/world-europe-32668511',
604 'info_dict': {
605 'id': 'world-europe-32668511',
606 'title': 'Russia stages massive WW2 parade',
607 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
608 },
609 'playlist_count': 2,
610 }, {
611 # article with multiple videos embedded with data-playable (more videos)
612 'url': 'http://www.bbc.com/news/business-28299555',
613 'info_dict': {
614 'id': 'business-28299555',
615 'title': 'Farnborough Airshow: Video highlights',
616 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
617 },
618 'playlist_count': 9,
619 'skip': 'Save time',
620 }, {
621 # article with multiple videos embedded with `new SMP()`
622 # broken
623 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
624 'info_dict': {
625 'id': '3662a707-0af9-3149-963f-47bea720b460',
626 'title': 'BUGGER',
627 },
628 'playlist_count': 18,
629 }, {
630 # single video embedded with data-playable containing vpid
631 'url': 'http://www.bbc.com/news/world-europe-32041533',
632 'info_dict': {
633 'id': 'p02mprgb',
634 'ext': 'mp4',
635 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
636 'description': 'md5:2868290467291b37feda7863f7a83f54',
637 'duration': 47,
638 'timestamp': 1427219242,
639 'upload_date': '20150324',
640 },
641 'params': {
642 # rtmp download
643 'skip_download': True,
644 }
645 }, {
646 # article with single video embedded with data-playable containing XML playlist
647 # with direct video links as progressiveDownloadUrl (for now these are extracted)
648 # and playlist with f4m and m3u8 as streamingUrl
649 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
650 'info_dict': {
651 'id': '150615_telabyad_kentin_cogu',
652 'ext': 'mp4',
653 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
654 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
655 'timestamp': 1434397334,
656 'upload_date': '20150615',
657 },
658 'params': {
659 'skip_download': True,
660 }
661 }, {
662 # single video embedded with data-playable containing XML playlists (regional section)
663 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
664 'info_dict': {
665 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
666 'ext': 'mp4',
667 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
668 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
669 'timestamp': 1434713142,
670 'upload_date': '20150619',
671 },
672 'params': {
673 'skip_download': True,
674 }
675 }, {
676 # single video from video playlist embedded with vxp-playlist-data JSON
677 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
678 'info_dict': {
679 'id': 'p02w6qjc',
680 'ext': 'mp4',
681 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
682 'duration': 56,
683 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
684 },
685 'params': {
686 'skip_download': True,
687 }
688 }, {
689 # single video story with digitalData
690 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
691 'info_dict': {
692 'id': 'p02q6gc4',
693 'ext': 'flv',
694 'title': 'Sri Lanka’s spicy secret',
695 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
696 'timestamp': 1437674293,
697 'upload_date': '20150723',
698 },
699 'params': {
700 # rtmp download
701 'skip_download': True,
702 }
703 }, {
704 # single video story without digitalData
705 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
706 'info_dict': {
707 'id': 'p018zqqg',
708 'ext': 'mp4',
709 'title': 'Hyundai Santa Fe Sport: Rock star',
710 'description': 'md5:b042a26142c4154a6e472933cf20793d',
711 'timestamp': 1415867444,
712 'upload_date': '20141113',
713 },
714 'params': {
715 # rtmp download
716 'skip_download': True,
717 }
718 }, {
719 # single video embedded with Morph
720 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
721 'info_dict': {
722 'id': 'p041vhd0',
723 'ext': 'mp4',
724 'title': "Nigeria v Japan - Men's First Round",
725 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
726 'duration': 7980,
727 'uploader': 'BBC Sport',
728 'uploader_id': 'bbc_sport',
729 },
730 'params': {
731 # m3u8 download
732 'skip_download': True,
733 },
734 'skip': 'Georestricted to UK',
735 }, {
736 # single video with playlist.sxml URL in playlist param
737 'url': 'http://www.bbc.com/sport/0/football/33653409',
738 'info_dict': {
739 'id': 'p02xycnp',
740 'ext': 'mp4',
741 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
742 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
743 'duration': 140,
744 },
745 'params': {
746 # rtmp download
747 'skip_download': True,
748 }
749 }, {
750 # article with multiple videos embedded with playlist.sxml in playlist param
751 'url': 'http://www.bbc.com/sport/0/football/34475836',
752 'info_dict': {
753 'id': '34475836',
754 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
755 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
756 },
757 'playlist_count': 3,
758 }, {
759 # school report article with single video
760 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
761 'info_dict': {
762 'id': '35744779',
763 'title': 'School which breaks down barriers in Jerusalem',
764 },
765 'playlist_count': 1,
766 }, {
767 # single video with playlist URL from weather section
768 'url': 'http://www.bbc.com/weather/features/33601775',
769 'only_matching': True,
770 }, {
771 # custom redirection to www.bbc.com
772 # also, video with window.__INITIAL_DATA__
773 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
774 'info_dict': {
775 'id': 'p02xzws1',
776 'ext': 'mp4',
777 'title': "Pluto may have 'nitrogen glaciers'",
778 'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
779 'thumbnail': r're:https?://.+/.+\.jpg',
780 'timestamp': 1437785037,
781 'upload_date': '20150725',
782 },
783 }, {
784 # video with window.__INITIAL_DATA__ and value as JSON string
785 'url': 'https://www.bbc.com/news/av/world-europe-59468682',
786 'info_dict': {
787 'id': 'p0b71qth',
788 'ext': 'mp4',
789 'title': 'Why France is making this woman a national hero',
790 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
791 'thumbnail': r're:https?://.+/.+\.jpg',
792 'timestamp': 1638230731,
793 'upload_date': '20211130',
794 },
795 }, {
796 # single video article embedded with data-media-vpid
797 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
798 'only_matching': True,
799 }, {
800 # bbcthreeConfig
801 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
802 'info_dict': {
803 'id': 'p06556y7',
804 'ext': 'mp4',
805 'title': 'Things Not To Say to people that live on council estates',
806 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
807 'duration': 360,
808 'thumbnail': r're:https?://.+/.+\.jpg',
809 },
810 }, {
811 # window.__PRELOADED_STATE__
812 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
813 'info_dict': {
814 'id': 'b0b9z4vz',
815 'ext': 'mp4',
816 'title': 'Prom 6: An American in Paris and Turangalila',
817 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
818 'uploader': 'Radio 3',
819 'uploader_id': 'bbc_radio_three',
820 },
821 }, {
822 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
823 'info_dict': {
824 'id': 'p06w9tws',
825 'ext': 'mp4',
826 'title': 'md5:2fabf12a726603193a2879a055f72514',
827 'description': 'Learn English words and phrases from this story',
828 },
829 'add_ie': [BBCCoUkIE.ie_key()],
830 }, {
831 # BBC Reel
832 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
833 'info_dict': {
834 'id': 'p07c6sb9',
835 'ext': 'mp4',
836 'title': 'How positive thinking is harming your happiness',
837 'alt_title': 'The downsides of positive thinking',
838 'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
839 'duration': 235,
840 'thumbnail': r're:https?://.+/p07c9dsr.jpg',
841 'upload_date': '20190604',
842 'categories': ['Psychology'],
843 },
844 }]
845
@classmethod
def suitable(cls, url):
    """Accept *url* only when none of the more specific BBC extractors does.

    The URL pattern of this extractor is very broad, so the listed
    extractors are consulted first and win whenever one of them matches.
    """
    EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
    for ie in EXCLUDE_IE:
        if ie.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
851
852 def _extract_from_media_meta(self, media_meta, video_id):
853 # Direct links to media in media metadata (e.g.
854 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
855 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
856 source_files = media_meta.get('sourceFiles')
857 if source_files:
858 return [{
859 'url': f['url'],
860 'format_id': format_id,
861 'ext': f.get('encoding'),
862 'tbr': float_or_none(f.get('bitrate'), 1000),
863 'filesize': int_or_none(f.get('filesize')),
864 } for format_id, f in source_files.items() if f.get('url')], []
865
866 programme_id = media_meta.get('externalId')
867 if programme_id:
868 return self._download_media_selector(programme_id)
869
870 # Process playlist.sxml as legacy playlist
871 href = media_meta.get('href')
872 if href:
873 playlist = self._download_legacy_playlist_url(href)
874 _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
875 return formats, subtitles
876
877 return [], []
878
879 def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
880 programme_id, title, description, duration, formats, subtitles = \
881 self._process_legacy_playlist_url(url, playlist_id)
882 self._sort_formats(formats)
883 return {
884 'id': programme_id,
885 'title': title,
886 'description': description,
887 'duration': duration,
888 'timestamp': timestamp,
889 'formats': formats,
890 'subtitles': subtitles,
891 }
892
def _real_extract(self, url):
    """Extract a single video or a playlist of videos from a BBC page.

    The page is downloaded once and then probed with a sequence of
    strategies; the first one that yields media wins:

    1. ``playlist.sxml`` embeds and ``data-playable`` attributes
    2. a ``data-pid`` group id (delegated to ``BBCCoUkIE``)
    3. a single vpid (``data-*-vpid``, ``externalIdentifier``, ``videoId``)
    4. BBC Reel ``initial-data`` script
    5. ``Morph.setPayload`` lead media
    6. ``window.__PRELOADED_STATE__``
    7. ``bbcthreeConfig``
    8. ``window.__INITIAL_DATA__``
    9. ``new SMP(...)`` / ``setPlaylist(...)`` embed URLs
    10. ``data-media-meta`` / ``mediaAssetPage.init`` / ``vxp-playlist-data``

    Returns either one info dict or a ``playlist_result``.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title')
    if not playlist_title:
        playlist_title = (self._og_search_title(webpage, default=None)
                          or self._html_extract_title(webpage, 'playlist title', default=None))
        if playlist_title:
            # Drop the trailing " - BBC ..." site suffix
            playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
    if data_playables:
        for _, data_playable_json in data_playables:
            data_playable = self._parse_json(
                unescapeHTML(data_playable_json), playlist_id, fatal=False)
            if not data_playable:
                continue
            settings = data_playable.get('settings', {})
            if settings:
                # data-playable with video vpid in settings.playlistObject.items (e.g.
                # http://www.bbc.com/news/world-us-canada-34473351)
                playlist_object = settings.get('playlistObject', {})
                if playlist_object:
                    items = playlist_object.get('items')
                    if items and isinstance(items, list):
                        title = playlist_object['title']
                        description = playlist_object.get('summary')
                        duration = int_or_none(items[0].get('duration'))
                        programme_id = items[0].get('vpid')
                        formats, subtitles = self._download_media_selector(programme_id)
                        self._sort_formats(formats)
                        entries.append({
                            'id': programme_id,
                            'title': title,
                            'description': description,
                            'timestamp': timestamp,
                            'duration': duration,
                            'formats': formats,
                            'subtitles': subtitles,
                        })
            else:
                # data-playable without vpid but with a playlist.sxml URLs
                # in otherSettings.playlist (e.g.
                # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
                playlist = data_playable.get('otherSettings', {}).get('playlist', {})
                if playlist:
                    entry = None
                    for key in ('streaming', 'progressiveDownload'):
                        playlist_url = playlist.get('%sUrl' % key)
                        if not playlist_url:
                            continue
                        try:
                            info = self._extract_from_playlist_sxml(
                                playlist_url, playlist_id, timestamp)
                            if not entry:
                                entry = info
                            else:
                                # Merge the second playlist into the first entry
                                entry['title'] = info['title']
                                entry['formats'].extend(info['formats'])
                        except ExtractorError as e:
                            # Some playlist URL may fail with 500, at the same time
                            # the other one may work fine (e.g.
                            # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
                                continue
                            raise
                    if entry:
                        self._sort_formats(entry['formats'])
                        entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
    group_id = self._search_regex(
        r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
        webpage, 'group id', default=None)
    if group_id:
        return self.url_result(
            'https://www.bbc.co.uk/programmes/%s' % group_id,
            ie=BBCCoUkIE.ie_key())

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        self._sort_formats(formats)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # The parse is non-fatal, so fall back to an empty dict if it
        # yields nothing instead of calling .get() on None.
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = digital_data.get('page', {}).get('pageInfo', {})
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
    initial_data = self._parse_json(self._html_search_regex(
        r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
        webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
    if initial_data:
        init_data = try_get(
            initial_data, lambda x: x['initData']['items'][0], dict) or {}
        smp_data = init_data.get('smpData') or {}
        clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
        version_id = clip_data.get('versionID')
        if version_id:
            title = smp_data['title']
            formats, subtitles = self._download_media_selector(version_id)
            self._sort_formats(formats)
            image_url = smp_data.get('holdingImageURL')
            display_date = init_data.get('displayDate')
            topic_title = init_data.get('topicTitle')

            return {
                'id': version_id,
                'title': title,
                'formats': formats,
                'alt_title': init_data.get('shortTitle'),
                'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
                'description': smp_data.get('summary') or init_data.get('shortSummary'),
                'upload_date': display_date.replace('-', '') if display_date else None,
                'subtitles': subtitles,
                'duration': int_or_none(clip_data.get('duration')),
                'categories': [topic_title] if topic_title else None,
            }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # Several setPayload calls may be present, but the video
    # seems to always be related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    # Pages carrying a window.__PRELOADED_STATE__ object
    preload_state = self._parse_json(self._search_regex(
        r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
        'preload state', default='{}'), playlist_id, fatal=False)
    if preload_state:
        current_programme = preload_state.get('programmes', {}).get('current') or {}
        programme_id = current_programme.get('id')
        if current_programme and programme_id and current_programme.get('type') == 'playable_item':
            title = current_programme.get('titles', {}).get('tertiary') or playlist_title
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            synopses = current_programme.get('synopses') or {}
            network = current_programme.get('network') or {}
            duration = int_or_none(
                current_programme.get('duration', {}).get('value'))
            thumbnail = None
            image_url = current_programme.get('image_url')
            if image_url:
                thumbnail = image_url.replace('{recipe}', 'raw')
            return {
                'id': programme_id,
                'title': title,
                'description': dict_get(synopses, ('long', 'medium', 'short')),
                'thumbnail': thumbnail,
                'duration': duration,
                'uploader': network.get('short_title'),
                'uploader_id': network.get('id'),
                'formats': formats,
                'subtitles': subtitles,
            }

    # Pages carrying a bbcthreeConfig JavaScript object
    bbc3_config = self._parse_json(
        self._search_regex(
            r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
            'bbcthree config', default='{}'),
        playlist_id, transform_source=js_to_json, fatal=False) or {}
    payload = bbc3_config.get('payload') or {}
    if payload:
        clip = payload.get('currentClip') or {}
        clip_vpid = clip.get('vpid')
        clip_title = clip.get('title')
        if clip_vpid and clip_title:
            formats, subtitles = self._download_media_selector(clip_vpid)
            self._sort_formats(formats)
            return {
                'id': clip_vpid,
                'title': clip_title,
                'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
                'description': clip.get('description'),
                'duration': parse_duration(clip.get('duration')),
                'formats': formats,
                'subtitles': subtitles,
            }
        bbc3_playlist = try_get(
            payload, lambda x: x['content']['bbcMedia']['playlist'],
            dict)
        if bbc3_playlist:
            playlist_title = bbc3_playlist.get('title') or playlist_title
            thumbnail = bbc3_playlist.get('holdingImageURL')
            entries = []
            for bbc3_item in bbc3_playlist['items']:
                programme_id = bbc3_item.get('versionID')
                if not programme_id:
                    continue
                formats, subtitles = self._download_media_selector(programme_id)
                self._sort_formats(formats)
                entries.append({
                    'id': programme_id,
                    'title': playlist_title,
                    'thumbnail': thumbnail,
                    'timestamp': timestamp,
                    'formats': formats,
                    'subtitles': subtitles,
                })
            return self.playlist_result(
                entries, playlist_id, playlist_title, playlist_description)

    # window.__INITIAL_DATA__ is either a JSON object literal or a JSON
    # string that itself contains JSON (needing two decode passes).
    initial_data = self._search_regex(
        r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
        'quoted preload state', default=None)
    if initial_data is None:
        # The default must be a *string*: it is fed to _parse_json below,
        # and json.loads() raises TypeError for a dict argument.
        initial_data = self._search_regex(
            r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
            'preload state', default='{}')
    else:
        initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
    # A failed first (non-fatal) decode leaves nothing to parse; substitute
    # an empty object so the second pass is never handed None.
    initial_data = self._parse_json(initial_data or '{}', playlist_id, fatal=False)
    if initial_data:
        def parse_media(media):
            # Appends one entry per usable item of ``media`` to ``entries``
            if not media:
                return
            for item in (try_get(media, lambda x: x['media']['items'], list) or []):
                item_id = item.get('id')
                item_title = item.get('title')
                if not (item_id and item_title):
                    continue
                formats, subtitles = self._download_media_selector(item_id)
                self._sort_formats(formats)
                item_desc = None
                blocks = try_get(media, lambda x: x['summary']['blocks'], list)
                if blocks:
                    summary = []
                    for block in blocks:
                        text = try_get(block, lambda x: x['model']['text'], compat_str)
                        if text:
                            summary.append(text)
                    if summary:
                        item_desc = '\n\n'.join(summary)
                item_time = None
                for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
                    if try_get(meta, lambda x: x['label']) == 'Published':
                        item_time = unified_timestamp(meta.get('timestamp'))
                        break
                entries.append({
                    'id': item_id,
                    'title': item_title,
                    'thumbnail': item.get('holdingImageUrl'),
                    'formats': formats,
                    'subtitles': subtitles,
                    'timestamp': item_time,
                    'description': strip_or_none(item_desc),
                })
        for resp in (initial_data.get('data') or {}).values():
            name = resp.get('name')
            if name == 'media-experience':
                parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
            elif name == 'article':
                for block in (try_get(resp,
                                      (lambda x: x['data']['blocks'],
                                       lambda x: x['data']['content']['model']['blocks'],),
                                      list) or []):
                    if block.get('type') not in ['media', 'video']:
                        continue
                    parse_media(block.get('model'))
        return self.playlist_result(
            entries, playlist_id, playlist_title, playlist_description)

    def extract_all(pattern):
        # Parse every regex match as JSON, dropping those that fail to parse
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # Non-fatal parse: treat an unparsable blob as "no videos"
            media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in media_asset_page.get('videos', {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats and not self.get_param('ignore_no_formats'):
            continue
        self._sort_formats(formats)

        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
1337
1338
class BBCCoUkArticleIE(InfoExtractor):
    """Playlist extractor for ``bbc.co.uk/programmes/articles/<id>`` pages.

    Every ``<div typeof="Clip" resource="...">`` found on the article page
    becomes one URL entry of the resulting playlist.
    """

    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist of the clips referenced by the article at ``url``."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # strip_or_none tolerates a missing description (None) where a bare
        # ``.strip()`` call would raise AttributeError.
        description = strip_or_none(self._og_search_description(webpage))

        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
1367
1368
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared logic for paginated bbc.co.uk programme listings.

    Subclasses are expected to provide ``_URL_TEMPLATE``,
    ``_VIDEO_ID_TEMPLATE`` and ``_extract_title_and_description``, all of
    which are referenced here.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield one URL result per video id found, following pagination.

        If ``url`` already carries a ``page`` query parameter, only that
        page is processed; otherwise the "next" pagination link is followed
        until none is found.
        """
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # Invariant across pages, so build the pattern once
        video_id_re = self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX
        # The first page is already downloaded, so numbering of the
        # follow-up downloads starts at 2
        for page_num in itertools.count(2):
            for video_id in re.findall(video_id_re, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # Pass the notes by keyword: the bare page number used to land
            # in the positional slot that follows the progress note.
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                note='Downloading page %d' % page_num,
                errnote='Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Download the listing page and return it as a lazy playlist."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
1399
1400
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
    """Common paging logic for iPlayer listings.

    Concrete subclasses plug in the pieces referenced below:
    ``_PAGE_SIZE``, ``_DESCRIPTION_KEY``, ``_call_api``, ``_get_elements``,
    ``_get_episode``, ``_get_episode_image``, ``_get_episode_field``,
    ``_get_playlist_data`` and ``_get_playlist_title``.
    """

    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    @staticmethod
    def _get_default(episode, key, default_key='default'):
        """Return ``episode[key][default_key]`` via ``try_get``."""
        def nested_lookup(container):
            return container[key][default_key]

        return try_get(episode, nested_lookup)

    def _get_description(self, data):
        """Pick the first of the large/medium/small synopsis variants."""
        synopses = data.get(self._DESCRIPTION_KEY)
        if not synopses:
            synopses = {}
        return dict_get(synopses, ('large', 'medium', 'small'))

    def _fetch_page(self, programme_id, per_page, series_id, page):
        """Yield URL-type entries for the zero-based ``page`` of a listing."""
        # The API is called with a one-based page number
        api_response = self._call_api(programme_id, per_page, page + 1, series_id)
        for element in self._get_elements(api_response):
            episode = self._get_episode(element)
            episode_id = episode.get('id')
            if episode_id:
                image = self._get_episode_image(episode)
                thumbnail = image.replace('{recipe}', 'raw') if image else None
                category = self._get_default(episode, 'labels', 'category')
                yield {
                    '_type': 'url',
                    'id': episode_id,
                    'title': self._get_episode_field(episode, 'subtitle'),
                    'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                    'thumbnail': thumbnail,
                    'description': self._get_description(episode),
                    'categories': [category] if category else None,
                    'series': self._get_episode_field(episode, 'title'),
                    'ie_key': BBCCoUkIE.ie_key(),
                }

    def _real_extract(self, url):
        """Return the listing as a playlist.

        With an explicit ``page`` query parameter only that page is
        fetched (36 items per page); otherwise all pages are loaded on
        demand in chunks of ``_PAGE_SIZE``.
        """
        pid = self._match_id(url)
        query = parse_qs(url)
        series_id = query.get('seriesId', [None])[0]
        page = query.get('page', [None])[0]

        if page:
            fetch_page = functools.partial(self._fetch_page, pid, 36, series_id)
            entries = fetch_page(int(page) - 1)
        else:
            fetch_page = functools.partial(
                self._fetch_page, pid, self._PAGE_SIZE, series_id)
            entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)

        # A one-item request is used only for the playlist metadata
        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
        playlist_title = self._get_playlist_title(playlist_data)
        playlist_description = self._get_description(playlist_data)
        return self.playlist_result(entries, pid, playlist_title, playlist_description)
1449
1450
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/iplayer/episodes/<pid>`` listings, backed by the graph API."""

    IE_NAME = 'bbc.co.uk:iplayer:episodes'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
        },
        'playlist_mincount': 8,
    }, {
        # all seasons
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 10,
    }, {
        # explicit season
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 5,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 37,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 1,
    }]
    _PAGE_SIZE = 100
    _DESCRIPTION_KEY = 'synopsis'

    def _get_episode_image(self, episode):
        """Return the ``default`` variant of the episode's ``image`` field."""
        return self._get_default(episode, 'image')

    def _get_episode_field(self, episode, field):
        """Return the ``default`` variant of an arbitrary episode field."""
        return self._get_default(episode, field)

    @staticmethod
    def _get_elements(data):
        """Return the result list nested under ``entities``."""
        entities = data['entities']
        return entities['results']

    @staticmethod
    def _get_episode(element):
        """Return the element's ``episode`` mapping, or ``{}`` if absent."""
        episode = element.get('episode')
        return episode if episode else {}

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Send the programme query as a JSON body and return ``data.programme``."""
        variables = {'id': pid, 'page': page, 'perPage': per_page}
        if series_id:
            # Restrict the listing to a single series
            variables['sliceId'] = series_id
        request_body = json.dumps({
            'id': '5692d93d5aac8d796a0305e895e61551',
            'variables': variables,
        }).encode('utf-8')
        response = self._download_json(
            'https://graph.ibl.api.bbc.co.uk/', pid,
            headers={'Content-Type': 'application/json'},
            data=request_body)
        return response['data']['programme']

    @staticmethod
    def _get_playlist_data(data):
        """The programme object itself already holds the playlist metadata."""
        return data

    def _get_playlist_title(self, data):
        """Return the ``default`` variant of the programme title."""
        return self._get_default(data, 'title')
1538
1539
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/iplayer/group/<pid>`` listings, backed by the ibl v1 API."""

    IE_NAME = 'bbc.co.uk:iplayer:group'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
    _TESTS = [{
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 47,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 11,
    }]
    _PAGE_SIZE = 200
    _DESCRIPTION_KEY = 'synopses'

    def _get_episode_image(self, episode):
        """Return the ``standard`` variant of the episode's ``images`` field."""
        return self._get_default(episode, 'images', default_key='standard')

    def _get_episode_field(self, episode, field):
        """Group episodes expose their fields directly, with no nesting."""
        return episode.get(field)

    @staticmethod
    def _get_elements(data):
        """Return the list stored under ``elements``."""
        return data['elements']

    @staticmethod
    def _get_episode(element):
        """Each listing element already is the episode mapping."""
        return element

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Fetch one page of the group's episodes; ``series_id`` is not used."""
        api_url = 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid
        paging = {
            'page': page,
            'per_page': per_page,
        }
        response = self._download_json(api_url, pid, query=paging)
        return response['group_episodes']

    @staticmethod
    def _get_playlist_data(data):
        """Return the ``group`` object holding the playlist metadata."""
        return data['group']

    def _get_playlist_title(self, data):
        """Return the group's plain ``title`` field."""
        return data.get('title')
1602
1603
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Programme ``episodes`` / ``broadcasts`` / ``clips`` listings on bbc.co.uk."""

    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` taken from the page's Open Graph tags."""
        og_title = self._og_search_title(webpage, fatal=False)
        og_description = self._og_search_description(webpage)
        return og_title, og_description