]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/bbc.py
[extractor/youtube] Add client name to `format_note` when `-v` (#6254)
[yt-dlp.git] / yt_dlp / extractor / bbc.py
1 import functools
2 import itertools
3 import json
4 import re
5 import urllib.error
6 import xml.etree.ElementTree
7
8 from .common import InfoExtractor
9 from ..compat import compat_HTTPError, compat_str, compat_urlparse
10 from ..utils import (
11 ExtractorError,
12 OnDemandPagedList,
13 clean_html,
14 dict_get,
15 float_or_none,
16 get_element_by_class,
17 int_or_none,
18 js_to_json,
19 parse_duration,
20 parse_iso8601,
21 parse_qs,
22 strip_or_none,
23 try_get,
24 unescapeHTML,
25 unified_timestamp,
26 url_or_none,
27 urlencode_postdata,
28 urljoin,
29 )
30
31
class BBCCoUkIE(InfoExtractor):
    # Extractor for bbc.co.uk programme, iPlayer, music clip, radio player,
    # Sounds and event pages (see the alternatives listed in _VALID_URL).
    IE_NAME = 'bbc.co.uk'
    IE_DESC = 'BBC iPlayer'
    # Identifier pattern: one of the letters p, b, m or l followed by 7
    # lowercase alphanumerics, or "w" followed by 7 to 14 of them.
    # Interpolated into _VALID_URL below and reused by _real_extract when
    # searching the page for a "vpid" value.
    _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
    # Verbose-mode pattern; the identifier must not be followed by
    # /episodes, /broadcasts or /clips.
    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?bbc\.co\.uk/
                        (?:
                            programmes/(?!articles/)|
                            iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
                            music/(?:clips|audiovideo/popular)[/#]|
                            radio/player/|
                            sounds/play/|
                            events/[^/]+/play/[^/]+/
                        )
                        (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
                    ''' % _ID_REGEX
    # Matches a setPlaylist("...") call pointing at an iPlayer URL; the URL is
    # captured in the named group "url".
    # NOTE(review): not referenced in this file chunk - presumably consumed by
    # the base class's embed detection; confirm.
    _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']

    # Sign-in page fetched (and used as Referer) by _perform_login.
    _LOGIN_URL = 'https://account.bbc.com/signin'
    # NOTE(review): presumably the .netrc machine name used for credentials -
    # the lookup itself is not visible here.
    _NETRC_MACHINE = 'bbc'

    # Filled with (media set, programme id) by _download_media_selector.
    _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
    # Media sets are tried in this order by _download_media_selector.
    _MEDIA_SETS = [
        # Provides HQ HLS streams with even better quality than pc mediaset but fails
        # with geolocation in some cases when it's even not geo restricted at all (e.g.
        # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
        'iptv-all',
        'pc',
    ]

    # XML namespace used for element lookups in legacy (EMP) playlists.
    _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
64
65 _TESTS = [
66 {
67 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
68 'info_dict': {
69 'id': 'b039d07m',
70 'ext': 'flv',
71 'title': 'Kaleidoscope, Leonard Cohen',
72 'description': 'The Canadian poet and songwriter reflects on his musical career.',
73 },
74 'params': {
75 # rtmp download
76 'skip_download': True,
77 }
78 },
79 {
80 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
81 'info_dict': {
82 'id': 'b00yng1d',
83 'ext': 'flv',
84 'title': 'The Man in Black: Series 3: The Printed Name',
85 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
86 'duration': 1800,
87 },
88 'params': {
89 # rtmp download
90 'skip_download': True,
91 },
92 'skip': 'Episode is no longer available on BBC iPlayer Radio',
93 },
94 {
95 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
96 'info_dict': {
97 'id': 'b00yng1d',
98 'ext': 'flv',
99 'title': 'The Voice UK: Series 3: Blind Auditions 5',
100 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
101 'duration': 5100,
102 },
103 'params': {
104 # rtmp download
105 'skip_download': True,
106 },
107 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
108 },
109 {
110 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
111 'info_dict': {
112 'id': 'b03k3pb7',
113 'ext': 'flv',
114 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
115 'description': '2. Invasion',
116 'duration': 3600,
117 },
118 'params': {
119 # rtmp download
120 'skip_download': True,
121 },
122 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
123 }, {
124 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
125 'info_dict': {
126 'id': 'b04v209v',
127 'ext': 'flv',
128 'title': 'Pete Tong, The Essential New Tune Special',
129 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
130 'duration': 10800,
131 },
132 'params': {
133 # rtmp download
134 'skip_download': True,
135 },
136 'skip': 'Episode is no longer available on BBC iPlayer Radio',
137 }, {
138 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
139 'note': 'Audio',
140 'info_dict': {
141 'id': 'p022h44j',
142 'ext': 'flv',
143 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
144 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
145 'duration': 227,
146 },
147 'params': {
148 # rtmp download
149 'skip_download': True,
150 }
151 }, {
152 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
153 'note': 'Video',
154 'info_dict': {
155 'id': 'p025c103',
156 'ext': 'flv',
157 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
158 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
159 'duration': 226,
160 },
161 'params': {
162 # rtmp download
163 'skip_download': True,
164 }
165 }, {
166 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
167 'info_dict': {
168 'id': 'p02n76xf',
169 'ext': 'flv',
170 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
171 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
172 'duration': 3540,
173 },
174 'params': {
175 # rtmp download
176 'skip_download': True,
177 },
178 'skip': 'geolocation',
179 }, {
180 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
181 'info_dict': {
182 'id': 'b05zmgw1',
183 'ext': 'flv',
184 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
185 'title': 'Royal Academy Summer Exhibition',
186 'duration': 3540,
187 },
188 'params': {
189 # rtmp download
190 'skip_download': True,
191 },
192 'skip': 'geolocation',
193 }, {
194 # iptv-all mediaset fails with geolocation however there is no geo restriction
195 # for this programme at all
196 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
197 'info_dict': {
198 'id': 'b06rkms3',
199 'ext': 'flv',
200 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
201 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
202 },
203 'params': {
204 # rtmp download
205 'skip_download': True,
206 },
207 'skip': 'Now it\'s really geo-restricted',
208 }, {
209 # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
210 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
211 'info_dict': {
212 'id': 'p028bfkj',
213 'ext': 'flv',
214 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
215 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
216 },
217 'params': {
218 # rtmp download
219 'skip_download': True,
220 },
221 }, {
222 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
223 'note': 'Audio',
224 'info_dict': {
225 'id': 'm0007jz9',
226 'ext': 'mp4',
227 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
228 'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
229 'duration': 9840,
230 },
231 'params': {
232 # rtmp download
233 'skip_download': True,
234 }
235 }, {
236 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
237 'only_matching': True,
238 }, {
239 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
240 'only_matching': True,
241 }, {
242 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
243 'only_matching': True,
244 }, {
245 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
246 'only_matching': True,
247 }, {
248 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
249 'only_matching': True,
250 }, {
251 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
252 'only_matching': True,
253 }, {
254 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
255 'only_matching': True,
256 }, {
257 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
258 'only_matching': True,
259 }]
260
def _perform_login(self, username, password):
    """Sign in with the given credentials.

    Fetches the sign-in page, posts its hidden inputs together with the
    credentials to the form's action URL, and raises ExtractorError when
    the final URL still contains the sign-in URL (i.e. the login did not
    leave the sign-in page).
    """
    signin_page = self._download_webpage(
        self._LOGIN_URL, None, 'Downloading signin page')

    # Start from the hidden form inputs, then add the credentials.
    form_fields = self._hidden_inputs(signin_page)
    form_fields.update({
        'username': username,
        'password': password,
    })

    # The form action may be relative; fall back to the sign-in URL itself.
    form_action = self._search_regex(
        r'<form[^>]+action=(["\'])(?P<url>.+?)\1', signin_page,
        'post url', default=self._LOGIN_URL, group='url')
    post_url = urljoin(self._LOGIN_URL, form_action)

    response, urlh = self._download_webpage_handle(
        post_url, None, 'Logging in', data=urlencode_postdata(form_fields),
        headers={'Referer': self._LOGIN_URL})

    if self._LOGIN_URL not in urlh.geturl():
        return

    # Still on the sign-in page: report the page's own message if it has one.
    message = clean_html(get_element_by_class('form-message', response))
    if message:
        raise ExtractorError(
            'Unable to login: %s' % message, expected=True)
    raise ExtractorError('Unable to log in')
286
class MediaSelectionError(Exception):
    """Raised when a media selection response carries an error result.

    ``id`` is the error identifier taken from the response; callers compare
    it against values such as ``'geolocation'``.
    """

    def __init__(self, id):
        # Hand the identifier to Exception as well, so str(), repr() and
        # tracebacks show it instead of an empty message.
        super().__init__(id)
        self.id = id
290
291 def _extract_asx_playlist(self, connection, programme_id):
292 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
293 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
294
295 def _extract_items(self, playlist):
296 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
297
298 def _extract_medias(self, media_selection):
299 error = media_selection.get('result')
300 if error:
301 raise BBCCoUkIE.MediaSelectionError(error)
302 return media_selection.get('media') or []
303
304 def _extract_connections(self, media):
305 return media.get('connection') or []
306
def _get_subtitles(self, media, programme_id):
    """Return English TTML subtitles for a captions media.

    Connections are tried in order; the first one whose ``href`` is a valid
    URL and downloads as parseable XML is used.  Returns ``{}`` when none
    qualifies.
    """
    for conn in self._extract_connections(media):
        href = conn.get('href')
        captions_url = url_or_none(href)
        if not captions_url:
            continue
        # The download only checks that the captions are retrievable XML;
        # the subtitle entry itself just points at the URL.
        parsed = self._download_xml(
            captions_url, programme_id, 'Downloading captions', fatal=False)
        if isinstance(parsed, xml.etree.ElementTree.Element):
            return {
                'en': [
                    {
                        'url': href,
                        'ext': 'ttml',
                    },
                ],
            }
    return {}
325
def _raise_extractor_error(self, media_selection_error):
    """Always raise an expected ExtractorError naming this extractor and
    the media selection error's ``id``."""
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
330
def _download_media_selector(self, programme_id):
    """Try each media set in ``_MEDIA_SETS`` and return the first result.

    Errors with id 'notukerror', 'geolocation' or 'selectionunavailable'
    make the next media set be tried; any other media selection error is
    raised immediately as an ExtractorError.  When every media set fails,
    the last such error is raised.
    """
    retryable_ids = ('notukerror', 'geolocation', 'selectionunavailable')
    latest_failure = None
    for media_set in self._MEDIA_SETS:
        selector_url = self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id)
        try:
            return self._download_media_selector_url(selector_url, programme_id)
        except BBCCoUkIE.MediaSelectionError as err:
            if err.id in retryable_ids:
                latest_failure = err
            else:
                self._raise_extractor_error(err)
    self._raise_extractor_error(latest_failure)
343
def _download_media_selector_url(self, url, programme_id=None):
    """Download the media selection JSON at ``url`` and process it.

    HTTP 403 and 404 responses are accepted, so their bodies are still
    parsed and handed on.
    """
    selection_json = self._download_json(
        url, programme_id, 'Downloading media selection JSON',
        expected_status=(403, 404))
    return self._process_media_selector(selection_json, programme_id)
349
def _process_media_selector(self, media_selection, programme_id):
    """Turn a media selection response into ``(formats, subtitles)``.

    ``video`` and ``audio`` medias contribute formats; what is produced for
    each connection depends on its supplier, transfer format and protocol.
    A ``captions`` media supplies the subtitles.  ``subtitles`` stays
    ``None`` when the response contains no ``captions`` media.
    """
    formats = []
    subtitles = None
    seen_hrefs = []  # connection URLs already handled, to skip duplicates

    for media in self._extract_medias(media_selection):
        kind = media.get('kind')
        if kind == 'captions':
            subtitles = self.extract_subtitles(media, programme_id)
            continue
        if kind not in ('video', 'audio'):
            continue

        bitrate = int_or_none(media.get('bitrate'))
        codec = media.get('encoding')
        width = int_or_none(media.get('width'))
        height = int_or_none(media.get('height'))
        file_size = int_or_none(media.get('media_file_size'))

        for connection in self._extract_connections(media):
            href = connection.get('href')
            if href in seen_hrefs:
                continue
            if href:
                seen_hrefs.append(href)

            conn_kind = connection.get('kind')
            protocol = connection.get('protocol')
            supplier = connection.get('supplier')
            transfer_format = connection.get('transferFormat')
            format_id = supplier or conn_kind or protocol

            # ASX playlist: one format per referenced entry
            if supplier == 'asx':
                asx_refs = self._extract_asx_playlist(connection, programme_id)
                formats.extend(
                    {
                        'url': ref,
                        'format_id': 'ref%s_%s' % (index, format_id),
                    }
                    for index, ref in enumerate(asx_refs))
                continue

            if transfer_format == 'dash':
                formats.extend(self._extract_mpd_formats(
                    href, programme_id, mpd_id=format_id, fatal=False))
                continue

            if transfer_format == 'hls':
                # TODO: let expected_status be passed into _extract_xxx_formats() instead
                try:
                    hls_formats = self._extract_m3u8_formats(
                        href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                        m3u8_id=format_id, fatal=False)
                except ExtractorError as err:
                    # Only an underlying HTTP 403/404 is tolerated here.
                    underlying = err.exc_info[1]
                    tolerated = (
                        isinstance(underlying, urllib.error.HTTPError)
                        and underlying.code in (403, 404))
                    if not tolerated:
                        raise
                    hls_formats = []
                formats.extend(hls_formats)
                continue

            if transfer_format == 'hds':
                formats.extend(self._extract_f4m_formats(
                    href, programme_id, f4m_id=format_id, fatal=False))
                continue

            # Anything else is a single format described by the media itself.
            if not supplier and bitrate:
                format_id += '-%d' % bitrate
            fmt = {
                'format_id': format_id,
                'filesize': file_size,
            }
            if kind == 'video':
                fmt['width'] = width
                fmt['height'] = height
                fmt['tbr'] = bitrate
                fmt['vcodec'] = codec
            else:
                fmt['abr'] = bitrate
                fmt['acodec'] = codec
                fmt['vcodec'] = 'none'

            if protocol in ('http', 'https'):
                # Direct link
                fmt['url'] = href
            elif protocol == 'rtmp':
                application = connection.get('application', 'ondemand')
                auth_string = connection.get('authString')
                identifier = connection.get('identifier')
                server = connection.get('server')
                fmt.update({
                    'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                    'play_path': identifier,
                    'app': '%s?%s' % (application, auth_string),
                    'page_url': 'http://www.bbc.co.uk',
                    'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                    'rtmp_live': False,
                    'ext': 'flv',
                })
            else:
                # Unsupported protocol: drop this connection.
                continue
            formats.append(fmt)

    return formats, subtitles
444
def _download_playlist(self, playlist_id):
    """Resolve a playlist id via the programmes ``playlist.json`` endpoint.

    Returns ``(programme_id, title, description, duration, formats,
    subtitles)``.  Formats and subtitles are accumulated over every
    programme item of every available version; the scalar fields come from
    the last programme item seen.

    Falls back to the legacy XML playlist when the JSON request fails with
    HTTP 404.  Raises ExtractorError for any other extraction failure, and
    also when the JSON playlist contains no programme item at all
    (previously this case crashed with UnboundLocalError at the return).
    """
    try:
        playlist = self._download_json(
            'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
            playlist_id, 'Downloading playlist JSON')
        formats = []
        subtitles = {}
        # Bound up front so the return below cannot hit unbound locals.
        programme_id = title = description = duration = None
        found_programme = False

        for version in playlist.get('allAvailableVersions', []):
            smp_config = version['smpConfig']
            title = smp_config['title']
            description = smp_config['summary']
            for item in smp_config['items']:
                kind = item['kind']
                if kind not in ('programme', 'radioProgramme'):
                    continue
                found_programme = True
                programme_id = item.get('vpid')
                duration = int_or_none(item.get('duration'))
                version_formats, version_subtitles = self._download_media_selector(programme_id)
                types = version['types']
                for f in version_formats:
                    f['format_note'] = ', '.join(types)
                    # Rank audio-described versions below the regular ones.
                    if any('AudioDescribed' in x for x in types):
                        f['language_preference'] = -10
                formats += version_formats
                # version_subtitles may be None when there were no captions.
                for tag, subformats in (version_subtitles or {}).items():
                    subtitles.setdefault(tag, []).extend(subformats)

        if not found_programme:
            # Has no HTTP cause, so the handler below re-raises it as is.
            raise ExtractorError(
                'No programme found in playlist %s' % playlist_id, expected=True)

        return programme_id, title, description, duration, formats, subtitles
    except ExtractorError as ee:
        if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
            raise

    # fallback to legacy playlist
    return self._process_legacy_playlist(playlist_id)
480
481 def _process_legacy_playlist_url(self, url, display_id):
482 playlist = self._download_legacy_playlist_url(url, display_id)
483 return self._extract_from_legacy_playlist(playlist, display_id)
484
485 def _process_legacy_playlist(self, playlist_id):
486 return self._process_legacy_playlist_url(
487 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
488
489 def _download_legacy_playlist_url(self, url, playlist_id=None):
490 return self._download_xml(
491 url, playlist_id, 'Downloading legacy playlist XML')
492
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme data from a legacy (EMP) XML playlist.

    Returns ``(programme_id, title, description, duration, formats,
    subtitles)`` for the last ``programme``/``radioProgramme`` item in the
    playlist (every such item is processed; the last one wins).

    Raises an expected ExtractorError when the playlist has a ``noItems``
    element, and also when it contains no programme item at all (previously
    that case crashed with UnboundLocalError at the return).
    """
    ns = self._EMP_PLAYLIST_NS

    no_items = playlist.find('./{%s}noItems' % ns)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First of the 'identifier'/'group' attributes that looks like a pid.
        for attr in ('identifier', 'group'):
            value = element.get(attr)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value
        return None

    def get_programme_id(item):
        # Try the item's own attributes first, then its <mediator> child.
        # The original computed the item lookup but discarded its result,
        # so only the mediator was ever consulted.
        own_id = get_from_attributes(item)
        if own_id:
            return own_id
        mediator = item.find('./{%s}mediator' % ns)
        if mediator is not None:
            return get_from_attributes(mediator)
        return None

    result = None
    for item in self._extract_items(playlist):
        kind = item.get('kind')
        if kind not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % ns).text
        description_el = playlist.find('./{%s}summary' % ns)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No usable id: treat the item itself as the media selection.
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

        result = (programme_id, title, description, duration, formats, subtitles)

    if result is None:
        raise ExtractorError(
            'No programme found in playlist %s' % playlist_id, expected=True)
    return result
536
def _real_extract(self, url):
    """Extract a single programme from a bbc.co.uk page.

    The version id (vpid) is looked up in the page, first in the
    ``mediator.bind(...)`` player JSON and then via a plain ``"vpid"``
    search.  When one is found, formats come from the media selector and
    the metadata from the page; otherwise everything comes from the
    playlist for the id in the URL.  Raises an expected ExtractorError when
    the page shows a player message.
    """
    group_id = self._match_id(url)

    webpage = self._download_webpage(url, group_id, 'Downloading video page')

    page_error = self._search_regex(
        r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
        webpage, 'error', default=None)
    if page_error:
        raise ExtractorError(page_error, expected=True)

    programme_id = None
    duration = None

    player_json = self._search_regex(
        r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
        webpage, 'player', default=None)
    if player_json:
        player = self._parse_json(player_json, group_id).get('player', {})
        duration = int_or_none(player.get('duration'))
        programme_id = player.get('vpid')

    if not programme_id:
        vpid_pattern = r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX
        programme_id = self._search_regex(
            vpid_pattern, webpage, 'vpid', fatal=False, default=None)

    if not programme_id:
        # No vpid on the page: resolve everything through the playlist.
        (programme_id, title, description,
         duration, formats, subtitles) = self._download_playlist(group_id)
    else:
        formats, subtitles = self._download_media_selector(programme_id)
        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
            (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
             r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
        page_description = self._search_regex(
            (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
             r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
            webpage, 'description', default=None)
        description = page_description or self._html_search_meta('description', webpage)

    thumbnail = self._og_search_thumbnail(webpage, default=None)
    return {
        'id': programme_id,
        'title': title,
        'description': description,
        'thumbnail': thumbnail,
        'duration': duration,
        'formats': formats,
        'subtitles': subtitles,
    }
587
588
class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
    # Extractor for pages on bbc.com / bbc.co.uk and the two .onion hosts
    # listed in _VALID_URL.
    IE_NAME = 'bbc'
    IE_DESC = 'BBC'
    # Verbose-mode pattern: requires at least one path segment before the
    # final one, which is captured as "id" (up to the first /, # or ?).
    _VALID_URL = r'''(?x)
        https?://(?:www\.)?(?:
            bbc\.(?:com|co\.uk)|
            bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd\.onion|
            bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad\.onion
        )/(?:[^/]+/)+(?P<id>[^/#?]+)'''

    # Overrides the parent's list; the inherited _download_media_selector
    # tries these media sets in order.
    _MEDIA_SETS = [
        'pc',
        'mobile-tablet-main',
    ]
603
604 _TESTS = [{
605 # article with multiple videos embedded with data-playable containing vpids
606 'url': 'http://www.bbc.com/news/world-europe-32668511',
607 'info_dict': {
608 'id': 'world-europe-32668511',
609 'title': 'Russia stages massive WW2 parade',
610 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
611 },
612 'playlist_count': 2,
613 }, {
614 # article with multiple videos embedded with data-playable (more videos)
615 'url': 'http://www.bbc.com/news/business-28299555',
616 'info_dict': {
617 'id': 'business-28299555',
618 'title': 'Farnborough Airshow: Video highlights',
619 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
620 },
621 'playlist_count': 9,
622 'skip': 'Save time',
623 }, {
624 # article with multiple videos embedded with `new SMP()`
625 # broken
626 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
627 'info_dict': {
628 'id': '3662a707-0af9-3149-963f-47bea720b460',
629 'title': 'BUGGER',
630 },
631 'playlist_count': 18,
632 }, {
633 # single video embedded with data-playable containing vpid
634 'url': 'http://www.bbc.com/news/world-europe-32041533',
635 'info_dict': {
636 'id': 'p02mprgb',
637 'ext': 'mp4',
638 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
639 'description': 'md5:2868290467291b37feda7863f7a83f54',
640 'duration': 47,
641 'timestamp': 1427219242,
642 'upload_date': '20150324',
643 },
644 'params': {
645 # rtmp download
646 'skip_download': True,
647 }
648 }, {
649 # article with single video embedded with data-playable containing XML playlist
650 # with direct video links as progressiveDownloadUrl (for now these are extracted)
651 # and playlist with f4m and m3u8 as streamingUrl
652 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
653 'info_dict': {
654 'id': '150615_telabyad_kentin_cogu',
655 'ext': 'mp4',
656 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
657 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
658 'timestamp': 1434397334,
659 'upload_date': '20150615',
660 },
661 'params': {
662 'skip_download': True,
663 }
664 }, {
665 # single video embedded with data-playable containing XML playlists (regional section)
666 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
667 'info_dict': {
668 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
669 'ext': 'mp4',
670 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
671 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
672 'timestamp': 1434713142,
673 'upload_date': '20150619',
674 },
675 'params': {
676 'skip_download': True,
677 }
678 }, {
679 # single video from video playlist embedded with vxp-playlist-data JSON
680 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
681 'info_dict': {
682 'id': 'p02w6qjc',
683 'ext': 'mp4',
684 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
685 'duration': 56,
686 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
687 },
688 'params': {
689 'skip_download': True,
690 }
691 }, {
692 # single video story with digitalData
693 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
694 'info_dict': {
695 'id': 'p02q6gc4',
696 'ext': 'flv',
697 'title': 'Sri Lanka’s spicy secret',
698 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
699 'timestamp': 1437674293,
700 'upload_date': '20150723',
701 },
702 'params': {
703 # rtmp download
704 'skip_download': True,
705 }
706 }, {
707 # single video story without digitalData
708 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
709 'info_dict': {
710 'id': 'p018zqqg',
711 'ext': 'mp4',
712 'title': 'Hyundai Santa Fe Sport: Rock star',
713 'description': 'md5:b042a26142c4154a6e472933cf20793d',
714 'timestamp': 1415867444,
715 'upload_date': '20141113',
716 },
717 'params': {
718 # rtmp download
719 'skip_download': True,
720 }
721 }, {
722 # single video embedded with Morph
723 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
724 'info_dict': {
725 'id': 'p041vhd0',
726 'ext': 'mp4',
727 'title': "Nigeria v Japan - Men's First Round",
728 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
729 'duration': 7980,
730 'uploader': 'BBC Sport',
731 'uploader_id': 'bbc_sport',
732 },
733 'params': {
734 # m3u8 download
735 'skip_download': True,
736 },
737 'skip': 'Georestricted to UK',
738 }, {
739 # single video with playlist.sxml URL in playlist param
740 'url': 'http://www.bbc.com/sport/0/football/33653409',
741 'info_dict': {
742 'id': 'p02xycnp',
743 'ext': 'mp4',
744 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
745 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
746 'duration': 140,
747 },
748 'params': {
749 # rtmp download
750 'skip_download': True,
751 }
752 }, {
753 # article with multiple videos embedded with playlist.sxml in playlist param
754 'url': 'http://www.bbc.com/sport/0/football/34475836',
755 'info_dict': {
756 'id': '34475836',
757 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
758 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
759 },
760 'playlist_count': 3,
761 }, {
762 # school report article with single video
763 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
764 'info_dict': {
765 'id': '35744779',
766 'title': 'School which breaks down barriers in Jerusalem',
767 },
768 'playlist_count': 1,
769 }, {
770 # single video with playlist URL from weather section
771 'url': 'http://www.bbc.com/weather/features/33601775',
772 'only_matching': True,
773 }, {
774 # custom redirection to www.bbc.com
775 # also, video with window.__INITIAL_DATA__
776 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
777 'info_dict': {
778 'id': 'p02xzws1',
779 'ext': 'mp4',
780 'title': "Pluto may have 'nitrogen glaciers'",
781 'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
782 'thumbnail': r're:https?://.+/.+\.jpg',
783 'timestamp': 1437785037,
784 'upload_date': '20150725',
785 },
786 }, {
787 # video with window.__INITIAL_DATA__ and value as JSON string
788 'url': 'https://www.bbc.com/news/av/world-europe-59468682',
789 'info_dict': {
790 'id': 'p0b71qth',
791 'ext': 'mp4',
792 'title': 'Why France is making this woman a national hero',
793 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
794 'thumbnail': r're:https?://.+/.+\.jpg',
795 'timestamp': 1638230731,
796 'upload_date': '20211130',
797 },
798 }, {
799 # single video article embedded with data-media-vpid
800 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
801 'only_matching': True,
802 }, {
803 # bbcthreeConfig
804 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
805 'info_dict': {
806 'id': 'p06556y7',
807 'ext': 'mp4',
808 'title': 'Things Not To Say to people that live on council estates',
809 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
810 'duration': 360,
811 'thumbnail': r're:https?://.+/.+\.jpg',
812 },
813 }, {
814 # window.__PRELOADED_STATE__
815 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
816 'info_dict': {
817 'id': 'b0b9z4vz',
818 'ext': 'mp4',
819 'title': 'Prom 6: An American in Paris and Turangalila',
820 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
821 'uploader': 'Radio 3',
822 'uploader_id': 'bbc_radio_three',
823 },
824 }, {
825 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
826 'info_dict': {
827 'id': 'p06w9tws',
828 'ext': 'mp4',
829 'title': 'md5:2fabf12a726603193a2879a055f72514',
830 'description': 'Learn English words and phrases from this story',
831 },
832 'add_ie': [BBCCoUkIE.ie_key()],
833 }, {
834 # BBC Reel
835 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
836 'info_dict': {
837 'id': 'p07c6sb9',
838 'ext': 'mp4',
839 'title': 'How positive thinking is harming your happiness',
840 'alt_title': 'The downsides of positive thinking',
841 'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
842 'duration': 235,
843 'thumbnail': r're:https?://.+/p07c9dsr.jpg',
844 'upload_date': '20190604',
845 'categories': ['Psychology'],
846 },
847 }, { # onion routes
848 'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
849 'only_matching': True,
850 }, {
851 'url': 'https://www.bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad.onion/sport/av/football/63195681',
852 'only_matching': True,
853 }]
854
@classmethod
def suitable(cls, url):
    """Return False for URLs that one of the more specific BBC extractors
    accepts; otherwise defer to the inherited check."""
    more_specific = (
        BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE,
        BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
    for extractor in more_specific:
        if extractor.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
860
861 def _extract_from_media_meta(self, media_meta, video_id):
862 # Direct links to media in media metadata (e.g.
863 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
864 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
865 source_files = media_meta.get('sourceFiles')
866 if source_files:
867 return [{
868 'url': f['url'],
869 'format_id': format_id,
870 'ext': f.get('encoding'),
871 'tbr': float_or_none(f.get('bitrate'), 1000),
872 'filesize': int_or_none(f.get('filesize')),
873 } for format_id, f in source_files.items() if f.get('url')], []
874
875 programme_id = media_meta.get('externalId')
876 if programme_id:
877 return self._download_media_selector(programme_id)
878
879 # Process playlist.sxml as legacy playlist
880 href = media_meta.get('href')
881 if href:
882 playlist = self._download_legacy_playlist_url(href)
883 _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
884 return formats, subtitles
885
886 return [], []
887
888 def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
889 programme_id, title, description, duration, formats, subtitles = \
890 self._process_legacy_playlist_url(url, playlist_id)
891 return {
892 'id': programme_id,
893 'title': title,
894 'description': description,
895 'duration': duration,
896 'timestamp': timestamp,
897 'formats': formats,
898 'subtitles': subtitles,
899 }
900
def _real_extract(self, url):
    """Extract the video(s) embedded in a BBC page.

    The page is probed for a sequence of known embed layouts, in this order:
    playlist.sxml references, data-playable attributes, a data-pid video
    group, a single vpid, BBC Reel initial-data, a Morph payload,
    __PRELOADED_STATE__, bbcthreeConfig, __INITIAL_DATA__, SMP/setPlaylist
    embeds and finally data-media-meta / mediaAssetPage / vxp-playlist-data.
    The first layout that matches determines the result.

    Returns a playlist result, a url result delegating to BBCCoUkIE, or a
    single-video info dict, depending on which layout matched.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title') or re.sub(
        r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
    for _, data_playable_json in data_playables:
        data_playable = self._parse_json(
            unescapeHTML(data_playable_json), playlist_id, fatal=False)
        if not data_playable:
            continue
        # `or {}` (rather than a .get() default) also covers explicit JSON nulls
        settings = data_playable.get('settings') or {}
        if settings:
            # data-playable with video vpid in settings.playlistObject.items (e.g.
            # http://www.bbc.com/news/world-us-canada-34473351)
            playlist_object = settings.get('playlistObject') or {}
            if not playlist_object:
                continue
            items = playlist_object.get('items')
            if not (items and isinstance(items, list)):
                continue
            title = playlist_object['title']
            description = playlist_object.get('summary')
            duration = int_or_none(items[0].get('duration'))
            programme_id = items[0].get('vpid')
            if not programme_id:
                # nothing to hand to the media selector; skip like the
                # bbcthree playlist loop below does for items without an id
                continue
            formats, subtitles = self._download_media_selector(programme_id)
            entries.append({
                'id': programme_id,
                'title': title,
                'description': description,
                'timestamp': timestamp,
                'duration': duration,
                'formats': formats,
                'subtitles': subtitles,
            })
        else:
            # data-playable without vpid but with a playlist.sxml URLs
            # in otherSettings.playlist (e.g.
            # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
            playlist = try_get(
                data_playable, lambda x: x['otherSettings']['playlist'], dict)
            if not playlist:
                continue
            entry = None
            for key in ('streaming', 'progressiveDownload'):
                playlist_url = playlist.get('%sUrl' % key)
                if not playlist_url:
                    continue
                try:
                    info = self._extract_from_playlist_sxml(
                        playlist_url, playlist_id, timestamp)
                    if not entry:
                        entry = info
                    else:
                        entry['title'] = info['title']
                        entry['formats'].extend(info['formats'])
                except ExtractorError as e:
                    # Some playlist URL may fail with 500, at the same time
                    # the other one may work fine (e.g.
                    # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
                        continue
                    raise
            if entry:
                entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
    group_id = self._search_regex(
        r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
        webpage, 'group id', default=None)
    if group_id:
        return self.url_result(
            'https://www.bbc.co.uk/programmes/%s' % group_id,
            ie=BBCCoUkIE.ie_key())

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # or unparsable; with fatal=False the parse result can be None, so
        # fall back to an empty dict before dereferencing it
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = try_get(digital_data, lambda x: x['page']['pageInfo'], dict) or {}
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
    initial_data = self._parse_json(self._html_search_regex(
        r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
        webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
    if initial_data:
        init_data = try_get(
            initial_data, lambda x: x['initData']['items'][0], dict) or {}
        smp_data = init_data.get('smpData') or {}
        clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
        version_id = clip_data.get('versionID')
        if version_id:
            title = smp_data['title']
            formats, subtitles = self._download_media_selector(version_id)
            image_url = smp_data.get('holdingImageURL')
            display_date = init_data.get('displayDate')
            topic_title = init_data.get('topicTitle')

            return {
                'id': version_id,
                'title': title,
                'formats': formats,
                'alt_title': init_data.get('shortTitle'),
                'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
                'description': smp_data.get('summary') or init_data.get('shortSummary'),
                'upload_date': display_date.replace('-', '') if display_date else None,
                'subtitles': subtitles,
                'duration': int_or_none(clip_data.get('duration')),
                'categories': [topic_title] if topic_title else None,
            }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # There are several setPayload calls may be present but the video
    # seems to be always related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    preload_state = self._parse_json(self._search_regex(
        r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
        'preload state', default='{}'), playlist_id, fatal=False)
    if preload_state:
        # try_get tolerates missing keys as well as explicit nulls on the way down
        current_programme = try_get(
            preload_state, lambda x: x['programmes']['current'], dict) or {}
        programme_id = current_programme.get('id')
        if programme_id and current_programme.get('type') == 'playable_item':
            title = try_get(
                current_programme, lambda x: x['titles']['tertiary']) or playlist_title
            formats, subtitles = self._download_media_selector(programme_id)
            synopses = current_programme.get('synopses') or {}
            network = current_programme.get('network') or {}
            duration = int_or_none(try_get(
                current_programme, lambda x: x['duration']['value']))
            thumbnail = None
            image_url = current_programme.get('image_url')
            if image_url:
                thumbnail = image_url.replace('{recipe}', 'raw')
            return {
                'id': programme_id,
                'title': title,
                'description': dict_get(synopses, ('long', 'medium', 'short')),
                'thumbnail': thumbnail,
                'duration': duration,
                'uploader': network.get('short_title'),
                'uploader_id': network.get('id'),
                'formats': formats,
                'subtitles': subtitles,
            }

    bbc3_config = self._parse_json(
        self._search_regex(
            r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
            'bbcthree config', default='{}'),
        playlist_id, transform_source=js_to_json, fatal=False) or {}
    payload = bbc3_config.get('payload') or {}
    if payload:
        clip = payload.get('currentClip') or {}
        clip_vpid = clip.get('vpid')
        clip_title = clip.get('title')
        if clip_vpid and clip_title:
            formats, subtitles = self._download_media_selector(clip_vpid)
            return {
                'id': clip_vpid,
                'title': clip_title,
                'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
                'description': clip.get('description'),
                'duration': parse_duration(clip.get('duration')),
                'formats': formats,
                'subtitles': subtitles,
            }
        bbc3_playlist = try_get(
            payload, lambda x: x['content']['bbcMedia']['playlist'],
            dict)
        if bbc3_playlist:
            playlist_title = bbc3_playlist.get('title') or playlist_title
            thumbnail = bbc3_playlist.get('holdingImageURL')
            entries = []
            # 'items' is read defensively so a playlist without it yields an
            # empty result instead of a KeyError
            for bbc3_item in bbc3_playlist.get('items') or []:
                programme_id = bbc3_item.get('versionID')
                if not programme_id:
                    continue
                formats, subtitles = self._download_media_selector(programme_id)
                entries.append({
                    'id': programme_id,
                    'title': playlist_title,
                    'thumbnail': thumbnail,
                    'timestamp': timestamp,
                    'formats': formats,
                    'subtitles': subtitles,
                })
            return self.playlist_result(
                entries, playlist_id, playlist_title, playlist_description)

    quoted_initial_data = self._search_regex(
        r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
        'quoted preload state', default=None)
    if quoted_initial_data is None:
        # The default is a JSON *string* so that the parse below always
        # receives text, exactly as it does when the pattern matches
        initial_data = self._search_regex(
            r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
            'preload state', default='{}')
    else:
        # The state is a JSON document wrapped in a JSON string literal;
        # unwrap the outer string first
        initial_data = self._parse_json(quoted_initial_data, playlist_id, fatal=False)
    # A failed unwrap leaves None here, which must not be parsed again
    initial_data = (
        self._parse_json(initial_data, playlist_id, fatal=False)
        if isinstance(initial_data, compat_str) else None)
    if initial_data:
        def parse_media(media):
            if not media:
                return
            for item in (try_get(media, lambda x: x['media']['items'], list) or []):
                item_id = item.get('id')
                item_title = item.get('title')
                if not (item_id and item_title):
                    continue
                formats, subtitles = self._download_media_selector(item_id)
                item_desc = None
                blocks = try_get(media, lambda x: x['summary']['blocks'], list)
                if blocks:
                    summary = []
                    for block in blocks:
                        text = try_get(block, lambda x: x['model']['text'], compat_str)
                        if text:
                            summary.append(text)
                    if summary:
                        item_desc = '\n\n'.join(summary)
                item_time = None
                for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
                    if try_get(meta, lambda x: x['label']) == 'Published':
                        item_time = unified_timestamp(meta.get('timestamp'))
                        break
                entries.append({
                    'id': item_id,
                    'title': item_title,
                    'thumbnail': item.get('holdingImageUrl'),
                    'formats': formats,
                    'subtitles': subtitles,
                    'timestamp': item_time,
                    'description': strip_or_none(item_desc),
                })
        for resp in (initial_data.get('data') or {}).values():
            name = resp.get('name')
            if name == 'media-experience':
                parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
            elif name == 'article':
                for block in (try_get(resp,
                                      (lambda x: x['data']['blocks'],
                                       lambda x: x['data']['content']['model']['blocks'],),
                                      list) or []):
                    if block.get('type') not in ['media', 'video']:
                        continue
                    parse_media(block.get('model'))
        # NOTE: returns here whenever __INITIAL_DATA__ parsed to something
        # truthy, even if no entries were collected from it
        return self.playlist_result(
            entries, playlist_id, playlist_title, playlist_description)

    def extract_all(pattern):
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # non-fatal parse may yield None; treat that as "no videos" so
            # the vxp-playlist fallback below still gets a chance
            media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in (media_asset_page.get('videos') or {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats and not self.get_param('ignore_no_formats'):
            continue

        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
1331
1332
class BBCCoUkArticleIE(InfoExtractor):
    """Extractor for bbc.co.uk/programmes/articles/ pages.

    Every clip referenced on the article page is returned as a url entry of
    one playlist, titled and described from the page's Open Graph metadata.
    """

    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist of the clips linked from the article at `url`."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # The description is optional metadata: strip it when present, but do
        # not call .strip() on a missing value.
        # NOTE(review): assumes _og_search_description returns None rather
        # than raising when the og:description tag is absent - confirm.
        description = strip_or_none(self._og_search_description(webpage))

        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
1361
1362
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Base class for bbc.co.uk HTML playlists paginated via "next" links.

    This class references `_VIDEO_ID_TEMPLATE`, `_URL_TEMPLATE` and
    `_extract_title_and_description()` without defining them, so subclasses
    are expected to supply all three.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a url result for every video id found on the playlist pages.

        Starts from the already downloaded `webpage` and follows the
        pagination "next" link until there is none. When `url` carries an
        explicit `page` query parameter only that single page is processed.
        """
        single_page = 'page' in parse_qs(url)
        # The first page is already in hand, so downloads start at page 2
        for page_num in itertools.count(2):
            for video_id in re.findall(
                    self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # The fourth positional argument of _download_webpage is the
            # error note, so pass a real message instead of the bare page number
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                'Downloading page %d' % page_num,
                'Unable to download page %d' % page_num)

    def _real_extract(self, url):
        """Return the playlist of all videos listed at `url`."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
1393
1394
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
    """Shared logic for the API-backed iPlayer playlist extractors.

    The methods below rely on `_PAGE_SIZE`, `_DESCRIPTION_KEY`, `_call_api`,
    `_get_elements`, `_get_episode`, `_get_episode_image`,
    `_get_episode_field`, `_get_playlist_data` and `_get_playlist_title`,
    none of which are defined in this class.
    """

    # '%%s' survives the formatting below as a '%s' placeholder that
    # subclasses fill in with their URL path segment
    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    @staticmethod
    def _get_default(episode, key, default_key='default'):
        """Return episode[key][default_key], looked up through try_get."""
        return try_get(episode, lambda obj: obj[key][default_key])

    def _get_description(self, data):
        """Pick the longest available synopsis stored under `_DESCRIPTION_KEY`."""
        return dict_get(
            data.get(self._DESCRIPTION_KEY) or {},
            ('large', 'medium', 'small'))

    def _fetch_page(self, programme_id, per_page, series_id, page):
        """Yield url entries for one zero-based `page` of API results."""
        # The API is called with a one-based page number
        api_response = self._call_api(programme_id, per_page, page + 1, series_id)
        for element in self._get_elements(api_response):
            episode = self._get_episode(element)
            episode_id = episode.get('id')
            if not episode_id:
                continue
            image = self._get_episode_image(episode)
            category = self._get_default(episode, 'labels', 'category')
            yield {
                '_type': 'url',
                'id': episode_id,
                'title': self._get_episode_field(episode, 'subtitle'),
                'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                'thumbnail': image.replace('{recipe}', 'raw') if image else None,
                'description': self._get_description(episode),
                'categories': [category] if category else None,
                'series': self._get_episode_field(episode, 'title'),
                'ie_key': BBCCoUkIE.ie_key(),
            }

    def _real_extract(self, url):
        """Return the playlist at `url`: one page if `page` is given, else all."""
        pid = self._match_id(url)
        query = parse_qs(url)
        series_id = query.get('seriesId', [None])[0]
        page = query.get('page', [None])[0]
        if page:
            # An explicit page is fetched eagerly with a fixed size of 36
            fetch_page = functools.partial(self._fetch_page, pid, 36, series_id)
            entries = fetch_page(int(page) - 1)
        else:
            fetch_page = functools.partial(
                self._fetch_page, pid, self._PAGE_SIZE, series_id)
            entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)
        # A separate one-item request supplies the playlist-level metadata
        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
        playlist_title = self._get_playlist_title(playlist_data)
        playlist_description = self._get_description(playlist_data)
        return self.playlist_result(
            entries, pid, playlist_title, playlist_description)
1443
1444
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer "episodes" pages, backed by the graph.ibl.api.bbc.co.uk endpoint."""

    IE_NAME = 'bbc.co.uk:iplayer:episodes'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
    _PAGE_SIZE = 100
    _DESCRIPTION_KEY = 'synopsis'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
        },
        'playlist_mincount': 8,
    }, {
        # all seasons
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 10,
    }, {
        # explicit season
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 5,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 37,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 1,
    }]

    def _get_episode_image(self, episode):
        """Return the episode's default image entry."""
        return self._get_default(episode, 'image')

    def _get_episode_field(self, episode, field):
        """Return the default variant of `field` for the episode."""
        return self._get_default(episode, field)

    @staticmethod
    def _get_elements(data):
        """Return the result list of a programme API response."""
        entities = data['entities']
        return entities['results']

    @staticmethod
    def _get_episode(element):
        """Return the episode object of a result element, or an empty dict."""
        return element.get('episode') or {}

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """POST a query for programme `pid` and return its 'programme' object."""
        variables = {'id': pid, 'page': page, 'perPage': per_page}
        if series_id:
            # restricts the listing to a single series (slice)
            variables['sliceId'] = series_id
        request_body = json.dumps({
            'id': '5692d93d5aac8d796a0305e895e61551',
            'variables': variables,
        }).encode('utf-8')
        response = self._download_json(
            'https://graph.ibl.api.bbc.co.uk/', pid,
            headers={'Content-Type': 'application/json'},
            data=request_body)
        return response['data']['programme']

    @staticmethod
    def _get_playlist_data(data):
        """The programme object itself carries the playlist metadata."""
        return data

    def _get_playlist_title(self, data):
        """Return the default title of the programme."""
        return self._get_default(data, 'title')
1532
1533
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer "group" pages, backed by the ibl.api.bbc.co.uk groups endpoint."""

    IE_NAME = 'bbc.co.uk:iplayer:group'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
    _PAGE_SIZE = 200
    _DESCRIPTION_KEY = 'synopses'
    _TESTS = [{
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 47,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 11,
    }]

    def _get_episode_image(self, episode):
        """Return the 'standard' variant of the episode's images."""
        return self._get_default(episode, 'images', 'standard')

    def _get_episode_field(self, episode, field):
        """Fields are read directly from the episode object here."""
        return episode.get(field)

    @staticmethod
    def _get_elements(data):
        """Return the element list of a group API response."""
        return data['elements']

    @staticmethod
    def _get_episode(element):
        """Each element is used as the episode as-is."""
        return element

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """GET one page of group `pid` and return its 'group_episodes' object.

        `series_id` is accepted for signature parity with the sibling
        extractor but is not used by this request.
        """
        api_url = 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid
        paging = {
            'page': page,
            'per_page': per_page,
        }
        response = self._download_json(api_url, pid, query=paging)
        return response['group_episodes']

    @staticmethod
    def _get_playlist_data(data):
        """Return the 'group' object holding the playlist metadata."""
        return data['group']

    def _get_playlist_title(self, data):
        """Return the group's title, if any."""
        return data.get('title')
1596
1597
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """bbc.co.uk/programmes episodes, broadcasts and clips listings."""

    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Filled in with a video id to build the URL of each entry
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    # Filled in with the id pattern to locate video ids in the page markup
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return the (title, description) pair from the page's Open Graph tags."""
        og_title = self._og_search_title(webpage, fatal=False)
        og_description = self._og_search_description(webpage)
        return og_title, og_description