]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/bbc.py
[compat] Remove more functions
[yt-dlp.git] / yt_dlp / extractor / bbc.py
1 import functools
2 import itertools
3 import json
4 import re
5 import urllib.error
6 import xml.etree.ElementTree
7
8 from .common import InfoExtractor
9 from ..compat import compat_HTTPError, compat_str, compat_urlparse
10 from ..utils import (
11 ExtractorError,
12 OnDemandPagedList,
13 clean_html,
14 dict_get,
15 float_or_none,
16 get_element_by_class,
17 int_or_none,
18 js_to_json,
19 parse_duration,
20 parse_iso8601,
21 parse_qs,
22 strip_or_none,
23 try_get,
24 unescapeHTML,
25 unified_timestamp,
26 url_or_none,
27 urlencode_postdata,
28 urljoin,
29 )
30
31
32 class BBCCoUkIE(InfoExtractor):
33 IE_NAME = 'bbc.co.uk'
34 IE_DESC = 'BBC iPlayer'
35 _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
36 _VALID_URL = r'''(?x)
37 https?://
38 (?:www\.)?bbc\.co\.uk/
39 (?:
40 programmes/(?!articles/)|
41 iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
42 music/(?:clips|audiovideo/popular)[/#]|
43 radio/player/|
44 sounds/play/|
45 events/[^/]+/play/[^/]+/
46 )
47 (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
48 ''' % _ID_REGEX
49
50 _LOGIN_URL = 'https://account.bbc.com/signin'
51 _NETRC_MACHINE = 'bbc'
52
53 _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
54 _MEDIA_SETS = [
55 # Provides HQ HLS streams with even better quality than the pc mediaset but fails
56 # with geolocation in some cases when it's even not geo restricted at all (e.g.
57 # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
58 'iptv-all',
59 'pc',
60 ]
61
62 _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
63
64 _TESTS = [
65 {
66 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
67 'info_dict': {
68 'id': 'b039d07m',
69 'ext': 'flv',
70 'title': 'Kaleidoscope, Leonard Cohen',
71 'description': 'The Canadian poet and songwriter reflects on his musical career.',
72 },
73 'params': {
74 # rtmp download
75 'skip_download': True,
76 }
77 },
78 {
79 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
80 'info_dict': {
81 'id': 'b00yng1d',
82 'ext': 'flv',
83 'title': 'The Man in Black: Series 3: The Printed Name',
84 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
85 'duration': 1800,
86 },
87 'params': {
88 # rtmp download
89 'skip_download': True,
90 },
91 'skip': 'Episode is no longer available on BBC iPlayer Radio',
92 },
93 {
94 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
95 'info_dict': {
96 'id': 'b00yng1d',
97 'ext': 'flv',
98 'title': 'The Voice UK: Series 3: Blind Auditions 5',
99 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
100 'duration': 5100,
101 },
102 'params': {
103 # rtmp download
104 'skip_download': True,
105 },
106 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
107 },
108 {
109 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
110 'info_dict': {
111 'id': 'b03k3pb7',
112 'ext': 'flv',
113 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
114 'description': '2. Invasion',
115 'duration': 3600,
116 },
117 'params': {
118 # rtmp download
119 'skip_download': True,
120 },
121 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
122 }, {
123 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
124 'info_dict': {
125 'id': 'b04v209v',
126 'ext': 'flv',
127 'title': 'Pete Tong, The Essential New Tune Special',
128 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
129 'duration': 10800,
130 },
131 'params': {
132 # rtmp download
133 'skip_download': True,
134 },
135 'skip': 'Episode is no longer available on BBC iPlayer Radio',
136 }, {
137 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
138 'note': 'Audio',
139 'info_dict': {
140 'id': 'p022h44j',
141 'ext': 'flv',
142 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
143 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
144 'duration': 227,
145 },
146 'params': {
147 # rtmp download
148 'skip_download': True,
149 }
150 }, {
151 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
152 'note': 'Video',
153 'info_dict': {
154 'id': 'p025c103',
155 'ext': 'flv',
156 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
157 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
158 'duration': 226,
159 },
160 'params': {
161 # rtmp download
162 'skip_download': True,
163 }
164 }, {
165 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
166 'info_dict': {
167 'id': 'p02n76xf',
168 'ext': 'flv',
169 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
170 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
171 'duration': 3540,
172 },
173 'params': {
174 # rtmp download
175 'skip_download': True,
176 },
177 'skip': 'geolocation',
178 }, {
179 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
180 'info_dict': {
181 'id': 'b05zmgw1',
182 'ext': 'flv',
183 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
184 'title': 'Royal Academy Summer Exhibition',
185 'duration': 3540,
186 },
187 'params': {
188 # rtmp download
189 'skip_download': True,
190 },
191 'skip': 'geolocation',
192 }, {
193 # iptv-all mediaset fails with geolocation however there is no geo restriction
194 # for this programme at all
195 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
196 'info_dict': {
197 'id': 'b06rkms3',
198 'ext': 'flv',
199 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
200 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
201 },
202 'params': {
203 # rtmp download
204 'skip_download': True,
205 },
206 'skip': 'Now it\'s really geo-restricted',
207 }, {
208 # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
209 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
210 'info_dict': {
211 'id': 'p028bfkj',
212 'ext': 'flv',
213 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
214 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
215 },
216 'params': {
217 # rtmp download
218 'skip_download': True,
219 },
220 }, {
221 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
222 'note': 'Audio',
223 'info_dict': {
224 'id': 'm0007jz9',
225 'ext': 'mp4',
226 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
227 'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
228 'duration': 9840,
229 },
230 'params': {
231 # rtmp download
232 'skip_download': True,
233 }
234 }, {
235 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
236 'only_matching': True,
237 }, {
238 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
239 'only_matching': True,
240 }, {
241 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
242 'only_matching': True,
243 }, {
244 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
245 'only_matching': True,
246 }, {
247 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
248 'only_matching': True,
249 }, {
250 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
251 'only_matching': True,
252 }, {
253 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
254 'only_matching': True,
255 }, {
256 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
257 'only_matching': True,
258 }]
259
def _perform_login(self, username, password):
    """Sign in through the BBC account form with the given credentials.

    The signin page is fetched first so that its hidden inputs can be sent
    back together with the username and password. Login counts as failed
    when the final URL still contains the signin URL.

    Raises ExtractorError on failure; when the page carries a
    'form-message' element its text is included and the error is expected.
    """
    signin_url = self._LOGIN_URL
    signin_page = self._download_webpage(
        signin_url, None, 'Downloading signin page')

    form_fields = self._hidden_inputs(signin_page)
    form_fields['username'] = username
    form_fields['password'] = password

    # The form action may be relative; fall back to the signin URL itself
    form_action = self._search_regex(
        r'<form[^>]+action=(["\'])(?P<url>.+?)\1', signin_page,
        'post url', default=signin_url, group='url')

    reply, handle = self._download_webpage_handle(
        urljoin(signin_url, form_action), None, 'Logging in',
        data=urlencode_postdata(form_fields),
        headers={'Referer': signin_url})

    if signin_url not in handle.geturl():
        return

    message = clean_html(get_element_by_class('form-message', reply))
    if message:
        raise ExtractorError('Unable to login: %s' % message, expected=True)
    raise ExtractorError('Unable to log in')
285
class MediaSelectionError(Exception):
    """Error carrying the result code reported by a media selection response.

    The code is available as the ``id`` attribute.
    """

    def __init__(self, id):
        Exception.__init__(self, id)
        self.id = id
289
290 def _extract_asx_playlist(self, connection, programme_id):
291 asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
292 return [ref.get('href') for ref in asx.findall('./Entry/ref')]
293
294 def _extract_items(self, playlist):
295 return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
296
297 def _extract_medias(self, media_selection):
298 error = media_selection.get('result')
299 if error:
300 raise BBCCoUkIE.MediaSelectionError(error)
301 return media_selection.get('media') or []
302
303 def _extract_connections(self, media):
304 return media.get('connection') or []
305
306 def _get_subtitles(self, media, programme_id):
307 subtitles = {}
308 for connection in self._extract_connections(media):
309 cc_url = url_or_none(connection.get('href'))
310 if not cc_url:
311 continue
312 captions = self._download_xml(
313 cc_url, programme_id, 'Downloading captions', fatal=False)
314 if not isinstance(captions, xml.etree.ElementTree.Element):
315 continue
316 subtitles['en'] = [
317 {
318 'url': connection.get('href'),
319 'ext': 'ttml',
320 },
321 ]
322 break
323 return subtitles
324
def _raise_extractor_error(self, media_selection_error):
    """Raise an expected ExtractorError describing a media selection failure.

    The message combines the extractor's IE_NAME with the error's ``id``.
    """
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
329
330 def _download_media_selector(self, programme_id):
331 last_exception = None
332 for media_set in self._MEDIA_SETS:
333 try:
334 return self._download_media_selector_url(
335 self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
336 except BBCCoUkIE.MediaSelectionError as e:
337 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
338 last_exception = e
339 continue
340 self._raise_extractor_error(e)
341 self._raise_extractor_error(last_exception)
342
343 def _download_media_selector_url(self, url, programme_id=None):
344 media_selection = self._download_json(
345 url, programme_id, 'Downloading media selection JSON',
346 expected_status=(403, 404))
347 return self._process_media_selector(media_selection, programme_id)
348
349 def _process_media_selector(self, media_selection, programme_id):
350 formats = []
351 subtitles = None
352 urls = []
353
354 for media in self._extract_medias(media_selection):
355 kind = media.get('kind')
356 if kind in ('video', 'audio'):
357 bitrate = int_or_none(media.get('bitrate'))
358 encoding = media.get('encoding')
359 width = int_or_none(media.get('width'))
360 height = int_or_none(media.get('height'))
361 file_size = int_or_none(media.get('media_file_size'))
362 for connection in self._extract_connections(media):
363 href = connection.get('href')
364 if href in urls:
365 continue
366 if href:
367 urls.append(href)
368 conn_kind = connection.get('kind')
369 protocol = connection.get('protocol')
370 supplier = connection.get('supplier')
371 transfer_format = connection.get('transferFormat')
372 format_id = supplier or conn_kind or protocol
373 # ASX playlist
374 if supplier == 'asx':
375 for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
376 formats.append({
377 'url': ref,
378 'format_id': 'ref%s_%s' % (i, format_id),
379 })
380 elif transfer_format == 'dash':
381 formats.extend(self._extract_mpd_formats(
382 href, programme_id, mpd_id=format_id, fatal=False))
383 elif transfer_format == 'hls':
384 # TODO: let expected_status be passed into _extract_xxx_formats() instead
385 try:
386 fmts = self._extract_m3u8_formats(
387 href, programme_id, ext='mp4', entry_protocol='m3u8_native',
388 m3u8_id=format_id, fatal=False)
389 except ExtractorError as e:
390 if not (isinstance(e.exc_info[1], urllib.error.HTTPError)
391 and e.exc_info[1].code in (403, 404)):
392 raise
393 fmts = []
394 formats.extend(fmts)
395 elif transfer_format == 'hds':
396 formats.extend(self._extract_f4m_formats(
397 href, programme_id, f4m_id=format_id, fatal=False))
398 else:
399 if not supplier and bitrate:
400 format_id += '-%d' % bitrate
401 fmt = {
402 'format_id': format_id,
403 'filesize': file_size,
404 }
405 if kind == 'video':
406 fmt.update({
407 'width': width,
408 'height': height,
409 'tbr': bitrate,
410 'vcodec': encoding,
411 })
412 else:
413 fmt.update({
414 'abr': bitrate,
415 'acodec': encoding,
416 'vcodec': 'none',
417 })
418 if protocol in ('http', 'https'):
419 # Direct link
420 fmt.update({
421 'url': href,
422 })
423 elif protocol == 'rtmp':
424 application = connection.get('application', 'ondemand')
425 auth_string = connection.get('authString')
426 identifier = connection.get('identifier')
427 server = connection.get('server')
428 fmt.update({
429 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
430 'play_path': identifier,
431 'app': '%s?%s' % (application, auth_string),
432 'page_url': 'http://www.bbc.co.uk',
433 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
434 'rtmp_live': False,
435 'ext': 'flv',
436 })
437 else:
438 continue
439 formats.append(fmt)
440 elif kind == 'captions':
441 subtitles = self.extract_subtitles(media, programme_id)
442 return formats, subtitles
443
def _download_playlist(self, playlist_id):
    """Collect programme metadata and formats from the playlist JSON.

    Every available version contributes the formats and subtitles of its
    'programme' / 'radioProgramme' items; formats are annotated with the
    version types, and audio described versions get a lowered language
    preference. Title and description come from the last version, programme
    id and duration from the last playable item.

    Falls back to the legacy XML playlist when the JSON request fails with
    HTTP 404.

    Returns a tuple
    ``(programme_id, title, description, duration, formats, subtitles)``.
    Raises ExtractorError when the JSON playlist has no playable item.
    """
    try:
        playlist = self._download_json(
            'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
            playlist_id, 'Downloading playlist JSON')
        formats = []
        subtitles = {}
        programme_found = False

        # `or []` also covers an explicit null value for the key
        for version in playlist.get('allAvailableVersions') or []:
            smp_config = version['smpConfig']
            title = smp_config['title']
            description = smp_config['summary']
            for item in smp_config['items']:
                kind = item['kind']
                if kind not in ('programme', 'radioProgramme'):
                    continue
                programme_found = True
                programme_id = item.get('vpid')
                duration = int_or_none(item.get('duration'))
                version_formats, version_subtitles = self._download_media_selector(programme_id)
                types = version['types']
                for f in version_formats:
                    f['format_note'] = ', '.join(types)
                    if any('AudioDescribed' in x for x in types):
                        f['language_preference'] = -10
                formats += version_formats
                for tag, subformats in (version_subtitles or {}).items():
                    subtitles.setdefault(tag, []).extend(subformats)

        if not programme_found:
            # Without this guard the return below would fail on unbound
            # locals; this error has no HTTP cause, so the handler re-raises it
            raise ExtractorError(
                'No playable programme found in playlist %s' % playlist_id,
                expected=True)

        return programme_id, title, description, duration, formats, subtitles
    except ExtractorError as ee:
        if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
            raise

    # fallback to legacy playlist
    return self._process_legacy_playlist(playlist_id)
479
480 def _process_legacy_playlist_url(self, url, display_id):
481 playlist = self._download_legacy_playlist_url(url, display_id)
482 return self._extract_from_legacy_playlist(playlist, display_id)
483
484 def _process_legacy_playlist(self, playlist_id):
485 return self._process_legacy_playlist_url(
486 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
487
488 def _download_legacy_playlist_url(self, url, playlist_id=None):
489 return self._download_xml(
490 url, playlist_id, 'Downloading legacy playlist XML')
491
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme data from a legacy EMP playlist element.

    A ``noItems`` element means the episode is unavailable and is reported
    as an expected error with a reason-specific message. Otherwise every
    'programme' / 'radioProgramme' item is processed and the data of the
    last one is returned.

    Returns a tuple
    ``(programme_id, title, description, duration, formats, subtitles)``.
    Raises ExtractorError when the episode is unavailable or the playlist
    has no programme item.
    """
    ns = self._EMP_PLAYLIST_NS

    no_items = playlist.find('./{%s}noItems' % ns)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First attribute that looks like a programme id, else None
        for attr in ('identifier', 'group'):
            value = element.get(attr)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value
        return None

    def get_programme_id(item):
        # The mediator stays the preferred source, as it was the only one
        # ever honoured; the item's own attributes are now a real fallback
        # instead of a lookup whose result was thrown away.
        mediator = item.find('./{%s}mediator' % ns)
        if mediator is not None:
            mediator_id = get_from_attributes(mediator)
            if mediator_id:
                return mediator_id
        return get_from_attributes(item)

    result = None
    for item in self._extract_items(playlist):
        kind = item.get('kind')
        if kind not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % ns).text
        description_el = playlist.find('./{%s}summary' % ns)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No usable id: treat the item itself as the media selection
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

        result = (programme_id, title, description, duration, formats, subtitles)

    if result is None:
        # Previously this fell through to a return of unbound locals
        raise ExtractorError(
            'No playable programme found in playlist %s' % playlist_id,
            expected=True)
    return result
535
536 def _real_extract(self, url):
537 group_id = self._match_id(url)
538
539 webpage = self._download_webpage(url, group_id, 'Downloading video page')
540
541 error = self._search_regex(
542 r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
543 webpage, 'error', default=None)
544 if error:
545 raise ExtractorError(error, expected=True)
546
547 programme_id = None
548 duration = None
549
550 tviplayer = self._search_regex(
551 r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
552 webpage, 'player', default=None)
553
554 if tviplayer:
555 player = self._parse_json(tviplayer, group_id).get('player', {})
556 duration = int_or_none(player.get('duration'))
557 programme_id = player.get('vpid')
558
559 if not programme_id:
560 programme_id = self._search_regex(
561 r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
562
563 if programme_id:
564 formats, subtitles = self._download_media_selector(programme_id)
565 title = self._og_search_title(webpage, default=None) or self._html_search_regex(
566 (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
567 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
568 description = self._search_regex(
569 (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
570 r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
571 webpage, 'description', default=None)
572 if not description:
573 description = self._html_search_meta('description', webpage)
574 else:
575 programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
576
577 self._sort_formats(formats)
578
579 return {
580 'id': programme_id,
581 'title': title,
582 'description': description,
583 'thumbnail': self._og_search_thumbnail(webpage, default=None),
584 'duration': duration,
585 'formats': formats,
586 'subtitles': subtitles,
587 }
588
589
590 class BBCIE(BBCCoUkIE):
591 IE_NAME = 'bbc'
592 IE_DESC = 'BBC'
593 _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
594
595 _MEDIA_SETS = [
596 'pc',
597 'mobile-tablet-main',
598 ]
599
600 _TESTS = [{
601 # article with multiple videos embedded with data-playable containing vpids
602 'url': 'http://www.bbc.com/news/world-europe-32668511',
603 'info_dict': {
604 'id': 'world-europe-32668511',
605 'title': 'Russia stages massive WW2 parade',
606 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
607 },
608 'playlist_count': 2,
609 }, {
610 # article with multiple videos embedded with data-playable (more videos)
611 'url': 'http://www.bbc.com/news/business-28299555',
612 'info_dict': {
613 'id': 'business-28299555',
614 'title': 'Farnborough Airshow: Video highlights',
615 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
616 },
617 'playlist_count': 9,
618 'skip': 'Save time',
619 }, {
620 # article with multiple videos embedded with `new SMP()`
621 # broken
622 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
623 'info_dict': {
624 'id': '3662a707-0af9-3149-963f-47bea720b460',
625 'title': 'BUGGER',
626 },
627 'playlist_count': 18,
628 }, {
629 # single video embedded with data-playable containing vpid
630 'url': 'http://www.bbc.com/news/world-europe-32041533',
631 'info_dict': {
632 'id': 'p02mprgb',
633 'ext': 'mp4',
634 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
635 'description': 'md5:2868290467291b37feda7863f7a83f54',
636 'duration': 47,
637 'timestamp': 1427219242,
638 'upload_date': '20150324',
639 },
640 'params': {
641 # rtmp download
642 'skip_download': True,
643 }
644 }, {
645 # article with single video embedded with data-playable containing XML playlist
646 # with direct video links as progressiveDownloadUrl (for now these are extracted)
647 # and playlist with f4m and m3u8 as streamingUrl
648 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
649 'info_dict': {
650 'id': '150615_telabyad_kentin_cogu',
651 'ext': 'mp4',
652 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
653 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
654 'timestamp': 1434397334,
655 'upload_date': '20150615',
656 },
657 'params': {
658 'skip_download': True,
659 }
660 }, {
661 # single video embedded with data-playable containing XML playlists (regional section)
662 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
663 'info_dict': {
664 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
665 'ext': 'mp4',
666 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
667 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
668 'timestamp': 1434713142,
669 'upload_date': '20150619',
670 },
671 'params': {
672 'skip_download': True,
673 }
674 }, {
675 # single video from video playlist embedded with vxp-playlist-data JSON
676 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
677 'info_dict': {
678 'id': 'p02w6qjc',
679 'ext': 'mp4',
680 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
681 'duration': 56,
682 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
683 },
684 'params': {
685 'skip_download': True,
686 }
687 }, {
688 # single video story with digitalData
689 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
690 'info_dict': {
691 'id': 'p02q6gc4',
692 'ext': 'flv',
693 'title': 'Sri Lanka’s spicy secret',
694 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
695 'timestamp': 1437674293,
696 'upload_date': '20150723',
697 },
698 'params': {
699 # rtmp download
700 'skip_download': True,
701 }
702 }, {
703 # single video story without digitalData
704 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
705 'info_dict': {
706 'id': 'p018zqqg',
707 'ext': 'mp4',
708 'title': 'Hyundai Santa Fe Sport: Rock star',
709 'description': 'md5:b042a26142c4154a6e472933cf20793d',
710 'timestamp': 1415867444,
711 'upload_date': '20141113',
712 },
713 'params': {
714 # rtmp download
715 'skip_download': True,
716 }
717 }, {
718 # single video embedded with Morph
719 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
720 'info_dict': {
721 'id': 'p041vhd0',
722 'ext': 'mp4',
723 'title': "Nigeria v Japan - Men's First Round",
724 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
725 'duration': 7980,
726 'uploader': 'BBC Sport',
727 'uploader_id': 'bbc_sport',
728 },
729 'params': {
730 # m3u8 download
731 'skip_download': True,
732 },
733 'skip': 'Georestricted to UK',
734 }, {
735 # single video with playlist.sxml URL in playlist param
736 'url': 'http://www.bbc.com/sport/0/football/33653409',
737 'info_dict': {
738 'id': 'p02xycnp',
739 'ext': 'mp4',
740 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
741 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
742 'duration': 140,
743 },
744 'params': {
745 # rtmp download
746 'skip_download': True,
747 }
748 }, {
749 # article with multiple videos embedded with playlist.sxml in playlist param
750 'url': 'http://www.bbc.com/sport/0/football/34475836',
751 'info_dict': {
752 'id': '34475836',
753 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
754 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
755 },
756 'playlist_count': 3,
757 }, {
758 # school report article with single video
759 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
760 'info_dict': {
761 'id': '35744779',
762 'title': 'School which breaks down barriers in Jerusalem',
763 },
764 'playlist_count': 1,
765 }, {
766 # single video with playlist URL from weather section
767 'url': 'http://www.bbc.com/weather/features/33601775',
768 'only_matching': True,
769 }, {
770 # custom redirection to www.bbc.com
771 # also, video with window.__INITIAL_DATA__
772 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
773 'info_dict': {
774 'id': 'p02xzws1',
775 'ext': 'mp4',
776 'title': "Pluto may have 'nitrogen glaciers'",
777 'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
778 'thumbnail': r're:https?://.+/.+\.jpg',
779 'timestamp': 1437785037,
780 'upload_date': '20150725',
781 },
782 }, {
783 # video with window.__INITIAL_DATA__ and value as JSON string
784 'url': 'https://www.bbc.com/news/av/world-europe-59468682',
785 'info_dict': {
786 'id': 'p0b71qth',
787 'ext': 'mp4',
788 'title': 'Why France is making this woman a national hero',
789 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
790 'thumbnail': r're:https?://.+/.+\.jpg',
791 'timestamp': 1638230731,
792 'upload_date': '20211130',
793 },
794 }, {
795 # single video article embedded with data-media-vpid
796 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
797 'only_matching': True,
798 }, {
799 # bbcthreeConfig
800 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
801 'info_dict': {
802 'id': 'p06556y7',
803 'ext': 'mp4',
804 'title': 'Things Not To Say to people that live on council estates',
805 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
806 'duration': 360,
807 'thumbnail': r're:https?://.+/.+\.jpg',
808 },
809 }, {
810 # window.__PRELOADED_STATE__
811 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
812 'info_dict': {
813 'id': 'b0b9z4vz',
814 'ext': 'mp4',
815 'title': 'Prom 6: An American in Paris and Turangalila',
816 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
817 'uploader': 'Radio 3',
818 'uploader_id': 'bbc_radio_three',
819 },
820 }, {
821 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
822 'info_dict': {
823 'id': 'p06w9tws',
824 'ext': 'mp4',
825 'title': 'md5:2fabf12a726603193a2879a055f72514',
826 'description': 'Learn English words and phrases from this story',
827 },
828 'add_ie': [BBCCoUkIE.ie_key()],
829 }, {
830 # BBC Reel
831 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
832 'info_dict': {
833 'id': 'p07c6sb9',
834 'ext': 'mp4',
835 'title': 'How positive thinking is harming your happiness',
836 'alt_title': 'The downsides of positive thinking',
837 'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
838 'duration': 235,
839 'thumbnail': r're:https?://.+/p07c9dsr.jpg',
840 'upload_date': '20190604',
841 'categories': ['Psychology'],
842 },
843 }]
844
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle ``url``.

    URLs accepted by any of the listed BBC extractors are rejected here;
    everything else is decided by the parent implementation.
    """
    more_specific = (
        BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE,
        BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
    for extractor in more_specific:
        if extractor.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
850
851 def _extract_from_media_meta(self, media_meta, video_id):
852 # Direct links to media in media metadata (e.g.
853 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
854 # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
855 source_files = media_meta.get('sourceFiles')
856 if source_files:
857 return [{
858 'url': f['url'],
859 'format_id': format_id,
860 'ext': f.get('encoding'),
861 'tbr': float_or_none(f.get('bitrate'), 1000),
862 'filesize': int_or_none(f.get('filesize')),
863 } for format_id, f in source_files.items() if f.get('url')], []
864
865 programme_id = media_meta.get('externalId')
866 if programme_id:
867 return self._download_media_selector(programme_id)
868
869 # Process playlist.sxml as legacy playlist
870 href = media_meta.get('href')
871 if href:
872 playlist = self._download_legacy_playlist_url(href)
873 _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
874 return formats, subtitles
875
876 return [], []
877
878 def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
879 programme_id, title, description, duration, formats, subtitles = \
880 self._process_legacy_playlist_url(url, playlist_id)
881 self._sort_formats(formats)
882 return {
883 'id': programme_id,
884 'title': title,
885 'description': description,
886 'duration': duration,
887 'timestamp': timestamp,
888 'formats': formats,
889 'subtitles': subtitles,
890 }
891
def _real_extract(self, url):
    """Extract the media embedded in a generic BBC page.

    The downloaded page is probed with a sequence of strategies, tried in
    this order; the first one that finds media produces the result:

      1. playlist.sxml and data-playable embeds (playlist)
      2. a `data-pid` video group (delegated to BBCCoUkIE)
      3. a single vpid in the markup
      4. the BBC Reel `initial-data` script
      5. a Morph payload
      6. `window.__PRELOADED_STATE__`
      7. `bbcthreeConfig`
      8. `window.__INITIAL_DATA__`
      9. SMP / setPlaylist embed URLs (delegated to BBCCoUkIE)
     10. `data-media-meta`, `mediaAssetPage.init` or `vxp-playlist-data`

    Returns a single-video info dict, a `url_result` or a
    `playlist_result`. The final `vxp-playlist-data` lookup is performed
    without a default, unlike the earlier probes.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title')
    if not playlist_title:
        playlist_title = (self._og_search_title(webpage, default=None)
                          or self._html_extract_title(webpage, 'playlist title', default=None))
        if playlist_title:
            # Drop the trailing " - BBC ..." site suffix
            playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
    if data_playables:
        for _, data_playable_json in data_playables:
            data_playable = self._parse_json(
                unescapeHTML(data_playable_json), playlist_id, fatal=False)
            if not data_playable:
                continue
            settings = data_playable.get('settings', {})
            if settings:
                # data-playable with video vpid in settings.playlistObject.items (e.g.
                # http://www.bbc.com/news/world-us-canada-34473351)
                playlist_object = settings.get('playlistObject', {})
                if playlist_object:
                    items = playlist_object.get('items')
                    if items and isinstance(items, list):
                        title = playlist_object['title']
                        description = playlist_object.get('summary')
                        duration = int_or_none(items[0].get('duration'))
                        programme_id = items[0].get('vpid')
                        formats, subtitles = self._download_media_selector(programme_id)
                        self._sort_formats(formats)
                        entries.append({
                            'id': programme_id,
                            'title': title,
                            'description': description,
                            'timestamp': timestamp,
                            'duration': duration,
                            'formats': formats,
                            'subtitles': subtitles,
                        })
                else:
                    # data-playable without vpid but with a playlist.sxml URLs
                    # in otherSettings.playlist (e.g.
                    # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
                    playlist = data_playable.get('otherSettings', {}).get('playlist', {})
                    if playlist:
                        entry = None
                        for key in ('streaming', 'progressiveDownload'):
                            playlist_url = playlist.get('%sUrl' % key)
                            if not playlist_url:
                                continue
                            try:
                                info = self._extract_from_playlist_sxml(
                                    playlist_url, playlist_id, timestamp)
                                if not entry:
                                    entry = info
                                else:
                                    # Merge the second playlist into the first entry
                                    entry['title'] = info['title']
                                    entry['formats'].extend(info['formats'])
                            except ExtractorError as e:
                                # Some playlist URL may fail with 500, at the same time
                                # the other one may work fine (e.g.
                                # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
                                    continue
                                raise
                        if entry:
                            self._sort_formats(entry['formats'])
                            entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
    group_id = self._search_regex(
        r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
        webpage, 'group id', default=None)
    if group_id:
        return self.url_result(
            'https://www.bbc.co.uk/programmes/%s' % group_id,
            ie=BBCCoUkIE.ie_key())

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        self._sort_formats(formats)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # or unparsable; the non-fatal parse then yields None, so fall back to {}
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = digital_data.get('page', {}).get('pageInfo', {})
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
    initial_data = self._parse_json(self._html_search_regex(
        r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
        webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
    if initial_data:
        init_data = try_get(
            initial_data, lambda x: x['initData']['items'][0], dict) or {}
        smp_data = init_data.get('smpData') or {}
        clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
        version_id = clip_data.get('versionID')
        if version_id:
            title = smp_data['title']
            formats, subtitles = self._download_media_selector(version_id)
            self._sort_formats(formats)
            image_url = smp_data.get('holdingImageURL')
            display_date = init_data.get('displayDate')
            topic_title = init_data.get('topicTitle')

            return {
                'id': version_id,
                'title': title,
                'formats': formats,
                'alt_title': init_data.get('shortTitle'),
                'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
                'description': smp_data.get('summary') or init_data.get('shortSummary'),
                'upload_date': display_date.replace('-', '') if display_date else None,
                'subtitles': subtitles,
                'duration': int_or_none(clip_data.get('duration')),
                'categories': [topic_title] if topic_title else None,
            }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # There are several setPayload calls may be present but the video
    # seems to be always related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            # The first component that carries playable lead media wins
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    preload_state = self._parse_json(self._search_regex(
        r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
        'preload state', default='{}'), playlist_id, fatal=False)
    if preload_state:
        current_programme = preload_state.get('programmes', {}).get('current') or {}
        programme_id = current_programme.get('id')
        if current_programme and programme_id and current_programme.get('type') == 'playable_item':
            title = current_programme.get('titles', {}).get('tertiary') or playlist_title
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            synopses = current_programme.get('synopses') or {}
            network = current_programme.get('network') or {}
            duration = int_or_none(
                current_programme.get('duration', {}).get('value'))
            thumbnail = None
            image_url = current_programme.get('image_url')
            if image_url:
                thumbnail = image_url.replace('{recipe}', 'raw')
            return {
                'id': programme_id,
                'title': title,
                'description': dict_get(synopses, ('long', 'medium', 'short')),
                'thumbnail': thumbnail,
                'duration': duration,
                'uploader': network.get('short_title'),
                'uploader_id': network.get('id'),
                'formats': formats,
                'subtitles': subtitles,
            }

    bbc3_config = self._parse_json(
        self._search_regex(
            r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
            'bbcthree config', default='{}'),
        playlist_id, transform_source=js_to_json, fatal=False) or {}
    payload = bbc3_config.get('payload') or {}
    if payload:
        clip = payload.get('currentClip') or {}
        clip_vpid = clip.get('vpid')
        clip_title = clip.get('title')
        if clip_vpid and clip_title:
            formats, subtitles = self._download_media_selector(clip_vpid)
            self._sort_formats(formats)
            return {
                'id': clip_vpid,
                'title': clip_title,
                'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
                'description': clip.get('description'),
                'duration': parse_duration(clip.get('duration')),
                'formats': formats,
                'subtitles': subtitles,
            }
        bbc3_playlist = try_get(
            payload, lambda x: x['content']['bbcMedia']['playlist'],
            dict)
        if bbc3_playlist:
            playlist_title = bbc3_playlist.get('title') or playlist_title
            thumbnail = bbc3_playlist.get('holdingImageURL')
            entries = []
            for bbc3_item in bbc3_playlist['items']:
                programme_id = bbc3_item.get('versionID')
                if not programme_id:
                    continue
                formats, subtitles = self._download_media_selector(programme_id)
                self._sort_formats(formats)
                entries.append({
                    'id': programme_id,
                    'title': playlist_title,
                    'thumbnail': thumbnail,
                    'timestamp': timestamp,
                    'formats': formats,
                    'subtitles': subtitles,
                })
            return self.playlist_result(
                entries, playlist_id, playlist_title, playlist_description)

    initial_data = self._search_regex(
        r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
        'quoted preload state', default=None)
    if initial_data is None:
        # The default must be a JSON *string*: it is fed to _parse_json()
        # below, and json.loads() raises TypeError for a dict
        initial_data = self._search_regex(
            r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
            'preload state', default='{}')
    else:
        # The quoted form is JSON wrapped in a JSON string; unwrap it first.
        # A failed non-fatal unwrap yields None, so fall back to an empty object
        initial_data = self._parse_json(
            initial_data or '"{}"', playlist_id, fatal=False) or '{}'
    initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
    if initial_data:
        def parse_media(media):
            # Append one entry to `entries` for every playable item of a media block
            if not media:
                return
            for item in (try_get(media, lambda x: x['media']['items'], list) or []):
                item_id = item.get('id')
                item_title = item.get('title')
                if not (item_id and item_title):
                    continue
                formats, subtitles = self._download_media_selector(item_id)
                self._sort_formats(formats)
                item_desc = None
                blocks = try_get(media, lambda x: x['summary']['blocks'], list)
                if blocks:
                    summary = []
                    for block in blocks:
                        text = try_get(block, lambda x: x['model']['text'], compat_str)
                        if text:
                            summary.append(text)
                    if summary:
                        item_desc = '\n\n'.join(summary)
                item_time = None
                for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
                    if try_get(meta, lambda x: x['label']) == 'Published':
                        item_time = unified_timestamp(meta.get('timestamp'))
                        break
                entries.append({
                    'id': item_id,
                    'title': item_title,
                    'thumbnail': item.get('holdingImageUrl'),
                    'formats': formats,
                    'subtitles': subtitles,
                    'timestamp': item_time,
                    'description': strip_or_none(item_desc),
                })
        for resp in (initial_data.get('data') or {}).values():
            name = resp.get('name')
            if name == 'media-experience':
                parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
            elif name == 'article':
                for block in (try_get(resp,
                                      (lambda x: x['data']['blocks'],
                                       lambda x: x['data']['content']['model']['blocks'],),
                                      list) or []):
                    if block.get('type') != 'media':
                        continue
                    parse_media(block.get('model'))
        return self.playlist_result(
            entries, playlist_id, playlist_title, playlist_description)

    def extract_all(pattern):
        # Parse every regex match as JSON, dropping the ones that fail to parse
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # Non-fatal parse may yield None; treat that as "no videos"
            media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in media_asset_page.get('videos', {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats and not self.get_param('ignore_no_formats'):
            continue
        self._sort_formats(formats)

        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
1336
1337
class BBCCoUkArticleIE(InfoExtractor):
    # Article pages under bbc.co.uk/programmes/articles/: the result is a
    # playlist of the clips referenced from the article markup.
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        # Download the article and return its embedded clips as a playlist
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        # The og:description lookup may come back as None when the tag is
        # absent; strip_or_none() keeps that from raising AttributeError
        description = strip_or_none(self._og_search_description(webpage))

        # Each clip is announced by a <div typeof="Clip" resource="URL">
        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
1366
1367
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    # Shared logic for paginated bbc.co.uk listing pages. This class reads
    # self._VIDEO_ID_TEMPLATE and self._URL_TEMPLATE and calls
    # self._extract_title_and_description(); none of them is defined here,
    # so subclasses are expected to provide them.

    def _entries(self, webpage, url, playlist_id):
        # Yield a url_result for every video id on the page, then follow the
        # "next" pagination link until there is none. When the URL itself
        # carries a `page` query parameter only that single page is listed.
        single_page = 'page' in compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        # The first page is already downloaded, so further pages count from 2
        for page_num in itertools.count(2):
            for video_id in re.findall(
                    self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page:
                return
            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                break
            # The 4th positional argument of _download_webpage is the error
            # note, so pass a message there rather than the bare page number
            webpage = self._download_webpage(
                compat_urlparse.urljoin(url, next_page), playlist_id,
                'Downloading page %d' % page_num,
                'Unable to download page %d' % page_num)

    def _real_extract(self, url):
        # Download the first page and wrap the lazily produced entries in a playlist
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        return self.playlist_result(
            self._entries(webpage, url, playlist_id),
            playlist_id, title, description)
1398
1399
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
    # Common machinery for iPlayer listings. This class calls
    # _call_api/_get_elements/_get_episode/_get_episode_image/
    # _get_episode_field/_get_playlist_data/_get_playlist_title and reads
    # _PAGE_SIZE/_DESCRIPTION_KEY, none of which it defines itself.
    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    @staticmethod
    def _get_default(episode, key, default_key='default'):
        # Safe two-level lookup: episode[key][default_key], or None on any miss
        return try_get(episode, lambda x: x[key][default_key])

    def _get_description(self, data):
        # Pick the longest available synopsis stored under _DESCRIPTION_KEY
        synopses = data.get(self._DESCRIPTION_KEY) or {}
        return dict_get(synopses, ('large', 'medium', 'small'))

    def _fetch_page(self, programme_id, per_page, series_id, page):
        # `page` is zero-based here; the API call is made with page + 1
        api_response = self._call_api(programme_id, per_page, page + 1, series_id)
        for element in self._get_elements(api_response):
            episode = self._get_episode(element)
            episode_id = episode.get('id')
            if not episode_id:
                continue
            image = self._get_episode_image(episode)
            thumbnail = image.replace('{recipe}', 'raw') if image else None
            category = self._get_default(episode, 'labels', 'category')
            yield {
                '_type': 'url',
                'id': episode_id,
                'title': self._get_episode_field(episode, 'subtitle'),
                'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                'thumbnail': thumbnail,
                'description': self._get_description(episode),
                'categories': [category] if category else None,
                'series': self._get_episode_field(episode, 'title'),
                'ie_key': BBCCoUkIE.ie_key(),
            }

    def _real_extract(self, url):
        pid = self._match_id(url)
        query = parse_qs(url)
        series_id = query.get('seriesId', [None])[0]
        page = query.get('page', [None])[0]

        # An explicit ?page= selects that one page with 36 items per page;
        # otherwise all pages are listed on demand using _PAGE_SIZE
        if page:
            per_page = 36
        else:
            per_page = self._PAGE_SIZE
        fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id)
        if page:
            entries = fetch_page(int(page) - 1)
        else:
            entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)

        # A separate one-item request provides the playlist-level metadata
        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
        playlist_title = self._get_playlist_title(playlist_data)
        playlist_description = self._get_description(playlist_data)
        return self.playlist_result(entries, pid, playlist_title, playlist_description)
1448
1449
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
    # iPlayer "episodes" listings, backed by graph.ibl.api.bbc.co.uk
    IE_NAME = 'bbc.co.uk:iplayer:episodes'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
        },
        'playlist_mincount': 8,
    }, {
        # all seasons
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 10,
    }, {
        # explicit season
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 5,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 37,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 1,
    }]
    _PAGE_SIZE = 100
    _DESCRIPTION_KEY = 'synopsis'

    def _get_episode_image(self, episode):
        # Image URL is stored as episode['image']['default']
        return self._get_default(episode, 'image')

    def _get_episode_field(self, episode, field):
        # Text fields are nested as episode[field]['default']
        return self._get_default(episode, field)

    @staticmethod
    def _get_elements(data):
        # Result rows of one API page
        entities = data['entities']
        return entities['results']

    @staticmethod
    def _get_episode(element):
        # Each result row wraps the episode under the 'episode' key
        episode = element.get('episode')
        return episode if episode else {}

    def _call_api(self, pid, per_page, page=1, series_id=None):
        # Send the query id and paging variables as a JSON body; `sliceId`
        # is only included when a series id was given
        variables = {'id': pid, 'page': page, 'perPage': per_page}
        if series_id:
            variables['sliceId'] = series_id
        body = json.dumps({
            'id': '5692d93d5aac8d796a0305e895e61551',
            'variables': variables,
        }).encode('utf-8')
        response = self._download_json(
            'https://graph.ibl.api.bbc.co.uk/', pid,
            headers={'Content-Type': 'application/json'}, data=body)
        return response['data']['programme']

    @staticmethod
    def _get_playlist_data(data):
        # The programme object itself carries the playlist metadata
        return data

    def _get_playlist_title(self, data):
        # Title is nested as data['title']['default']
        return self._get_default(data, 'title')
1537
1538
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
    # iPlayer "group" listings, backed by the ibl.api.bbc.co.uk groups endpoint
    IE_NAME = 'bbc.co.uk:iplayer:group'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
    _TESTS = [{
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 47,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 11,
    }]
    _PAGE_SIZE = 200
    _DESCRIPTION_KEY = 'synopses'

    def _get_episode_image(self, episode):
        # Image URL is stored as episode['images']['standard']
        return self._get_default(episode, 'images', 'standard')

    def _get_episode_field(self, episode, field):
        # Text fields sit directly on the episode object
        return episode.get(field)

    @staticmethod
    def _get_elements(data):
        # Episode rows of one API page
        return data['elements']

    @staticmethod
    def _get_episode(element):
        # Rows are the episodes themselves, no unwrapping needed
        return element

    def _call_api(self, pid, per_page, page=1, series_id=None):
        # series_id is accepted for signature compatibility but not used here
        api_url = 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid
        paging = {
            'page': page,
            'per_page': per_page,
        }
        response = self._download_json(api_url, pid, query=paging)
        return response['group_episodes']

    @staticmethod
    def _get_playlist_data(data):
        # Playlist metadata lives under the 'group' key
        return data['group']

    def _get_playlist_title(self, data):
        # Title is a plain field of the group object
        return data.get('title')
1601
1602
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    # Programme episodes/broadcasts/clips listings on bbc.co.uk/programmes
    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        # Both values come from the page's Open Graph tags; the title lookup
        # is explicitly non-fatal
        og_title = self._og_search_title(webpage, fatal=False)
        og_description = self._og_search_description(webpage)
        return og_title, og_description