]>
jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/bbc.py
1 from __future__
import unicode_literals
3 import xml
. etree
. ElementTree
5 from . common
import InfoExtractor
11 from .. compat
import compat_HTTPError
15 class BBCCoUkIE ( InfoExtractor
):
17 IE_DESC
= 'BBC iPlayer'
18 _VALID_URL
= r
'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z] {8} )'
20 mediaselector_url
= 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/ %s '
24 'url' : 'http://www.bbc.co.uk/programmes/b039g8p7' ,
28 'title' : 'Kaleidoscope, Leonard Cohen' ,
29 'description' : 'The Canadian poet and songwriter reflects on his musical career.' ,
34 'skip_download' : True ,
38 'url' : 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/' ,
42 'title' : 'The Man in Black: Series 3: The Printed Name' ,
43 'description' : "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey." ,
48 'skip_download' : True ,
50 'skip' : 'Episode is no longer available on BBC iPlayer Radio' ,
53 'url' : 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/' ,
57 'title' : 'The Voice UK: Series 3: Blind Auditions 5' ,
58 'description' : "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone." ,
63 'skip_download' : True ,
65 'skip' : 'Currently BBC iPlayer TV programmes are available to play in the UK only' ,
68 'url' : 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion' ,
72 'title' : "Tomorrow's Worlds: The Unearthly History of Science Fiction" ,
73 'description' : '2. Invasion' ,
78 'skip_download' : True ,
80 'skip' : 'Currently BBC iPlayer TV programmes are available to play in the UK only' ,
82 'url' : 'http://www.bbc.co.uk/programmes/b04v20dw' ,
86 'title' : 'Pete Tong, The Essential New Tune Special' ,
87 'description' : "Pete has a very special mix - all of 2014's Essential New Tunes!" ,
92 'skip_download' : True ,
95 'url' : 'http://www.bbc.co.uk/music/clips/p02frcc3' ,
100 'title' : 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix' ,
101 'description' : 'French house superstar Madeon takes us out of the club and onto the after party.' ,
106 'skip_download' : True ,
109 'url' : 'http://www.bbc.co.uk/music/clips/p025c0zz' ,
114 'title' : 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)' ,
115 'description' : 'Rae Morris performs Closer for BBC Three at Reading 2014' ,
120 'skip_download' : True ,
123 'url' : 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls' ,
127 'title' : 'Natural World, 2015-2016: 2. Super Powered Owls' ,
128 'description' : 'md5:e4db5c937d0e95a7c6b5e654d429183d' ,
133 'skip_download' : True ,
135 'skip' : 'geolocation' ,
137 'url' : 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition' ,
141 'description' : 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.' ,
142 'title' : 'Royal Academy Summer Exhibition' ,
147 'skip_download' : True ,
149 'skip' : 'geolocation' ,
151 'url' : 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4' ,
152 'only_matching' : True ,
154 'url' : 'http://www.bbc.co.uk/music/clips#p02frcc3' ,
155 'only_matching' : True ,
157 'url' : 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo' ,
158 'only_matching' : True ,
162 def _extract_asx_playlist ( self
, connection
, programme_id
):
163 asx
= self
._ download
_ xml
( connection
. get ( 'href' ), programme_id
, 'Downloading ASX playlist' )
164 return [ ref
. get ( 'href' ) for ref
in asx
. findall ( './Entry/ref' )]
166 def _extract_connection ( self
, connection
, programme_id
):
168 protocol
= connection
. get ( 'protocol' )
169 supplier
= connection
. get ( 'supplier' )
170 if protocol
== 'http' :
171 href
= connection
. get ( 'href' )
173 if supplier
== 'asx' :
174 for i
, ref
in enumerate ( self
._ extract
_ asx
_ playlist
( connection
, programme_id
)):
177 'format_id' : 'ref %s _ %s ' % ( i
, supplier
),
183 'format_id' : supplier
,
185 elif protocol
== 'rtmp' :
186 application
= connection
. get ( 'application' , 'ondemand' )
187 auth_string
= connection
. get ( 'authString' )
188 identifier
= connection
. get ( 'identifier' )
189 server
= connection
. get ( 'server' )
191 'url' : ' %s :// %s / %s ? %s ' % ( protocol
, server
, application
, auth_string
),
192 'play_path' : identifier
,
193 'app' : ' %s ? %s ' % ( application
, auth_string
),
194 'page_url' : 'http://www.bbc.co.uk' ,
195 'player_url' : 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf' ,
198 'format_id' : supplier
,
202 def _extract_items ( self
, playlist
):
203 return playlist
. findall ( './ {http://bbc.co.uk/2008/emp/playlist} item' )
205 def _extract_medias ( self
, media_selection
):
206 error
= media_selection
. find ( './ {http://bbc.co.uk/2008/mp/mediaselection} error' )
207 if error
is not None :
208 raise ExtractorError (
209 ' %s returned error: %s ' % ( self
. IE_NAME
, error
. get ( 'id' )), expected
= True )
210 return media_selection
. findall ( './ {http://bbc.co.uk/2008/mp/mediaselection} media' )
212 def _extract_connections ( self
, media
):
213 return media
. findall ( './ {http://bbc.co.uk/2008/mp/mediaselection} connection' )
215 def _extract_video ( self
, media
, programme_id
):
217 vbr
= int ( media
. get ( 'bitrate' ))
218 vcodec
= media
. get ( 'encoding' )
219 service
= media
. get ( 'service' )
220 width
= int ( media
. get ( 'width' ))
221 height
= int ( media
. get ( 'height' ))
222 file_size
= int ( media
. get ( 'media_file_size' ))
223 for connection
in self
._ extract
_ connections
( media
):
224 conn_formats
= self
._ extract
_ connection
( connection
, programme_id
)
225 for format
in conn_formats
:
227 'format_id' : ' %s _ %s ' % ( service
, format
[ 'format_id' ]),
232 'filesize' : file_size
,
234 formats
. extend ( conn_formats
)
237 def _extract_audio ( self
, media
, programme_id
):
239 abr
= int ( media
. get ( 'bitrate' ))
240 acodec
= media
. get ( 'encoding' )
241 service
= media
. get ( 'service' )
242 for connection
in self
._ extract
_ connections
( media
):
243 conn_formats
= self
._ extract
_ connection
( connection
, programme_id
)
244 for format
in conn_formats
:
246 'format_id' : ' %s _ %s ' % ( service
, format
[ 'format_id' ]),
250 formats
. extend ( conn_formats
)
253 def _get_subtitles ( self
, media
, programme_id
):
255 for connection
in self
._ extract
_ connections
( media
):
256 captions
= self
._ download
_ xml
( connection
. get ( 'href' ), programme_id
, 'Downloading captions' )
257 lang
= captions
. get ( ' {http://www.w3.org/XML/1998/namespace} lang' , 'en' )
258 ps
= captions
. findall ( './ {0} body/ {0} div/ {0} p' . format ( ' {http://www.w3.org/2006/10/ttaf1} ' ))
261 def _extract_text ( p
):
262 if p
. text
is not None :
263 stripped_text
= p
. text
. strip ()
266 return ' ' . join ( span
. text
. strip () for span
in p
. findall ( ' {http://www.w3.org/2006/10/ttaf1} span' ))
267 for pos
, p
in enumerate ( ps
):
268 srt
+= ' %s \r\n %s --> %s \r\n %s \r\n\r\n ' % ( str ( pos
), p
. get ( 'begin' ), p
. get ( 'end' ), _extract_text ( p
))
271 'url' : connection
. get ( 'href' ),
281 def _download_media_selector ( self
, programme_id
):
283 media_selection
= self
._ download
_ xml
(
284 self
. mediaselector_url
% programme_id
,
285 programme_id
, 'Downloading media selection XML' )
286 except ExtractorError
as ee
:
287 if isinstance ( ee
. cause
, compat_HTTPError
) and ee
. cause
. code
== 403 :
288 media_selection
= xml
. etree
. ElementTree
. fromstring ( ee
. cause
. read (). decode ( 'utf-8' ))
295 for media
in self
._ extract
_ medias
( media_selection
):
296 kind
= media
. get ( 'kind' )
298 formats
. extend ( self
._ extract
_ audio
( media
, programme_id
))
299 elif kind
== 'video' :
300 formats
. extend ( self
._ extract
_ video
( media
, programme_id
))
301 elif kind
== 'captions' :
302 subtitles
= self
. extract_subtitles ( media
, programme_id
)
304 return formats
, subtitles
306 def _download_playlist ( self
, playlist_id
):
308 playlist
= self
._ download
_ json
(
309 'http://www.bbc.co.uk/programmes/ %s /playlist.json' % playlist_id
,
310 playlist_id
, 'Downloading playlist JSON' )
312 version
= playlist
. get ( 'defaultAvailableVersion' )
314 smp_config
= version
[ 'smpConfig' ]
315 title
= smp_config
[ 'title' ]
316 description
= smp_config
[ 'summary' ]
317 for item
in smp_config
[ 'items' ]:
319 if kind
!= 'programme' and kind
!= 'radioProgramme' :
321 programme_id
= item
. get ( 'vpid' )
322 duration
= int ( item
. get ( 'duration' ))
323 formats
, subtitles
= self
._ download
_ media
_ selector
( programme_id
)
324 return programme_id
, title
, description
, duration
, formats
, subtitles
325 except ExtractorError
as ee
:
326 if not ( isinstance ( ee
. cause
, compat_HTTPError
) and ee
. cause
. code
== 404 ):
329 # fallback to legacy playlist
330 playlist
= self
._ download
_ xml
(
331 'http://www.bbc.co.uk/iplayer/playlist/ %s ' % playlist_id
,
332 playlist_id
, 'Downloading legacy playlist XML' )
334 no_items
= playlist
. find ( './ {http://bbc.co.uk/2008/emp/playlist} noItems' )
335 if no_items
is not None :
336 reason
= no_items
. get ( 'reason' )
337 if reason
== 'preAvailability' :
338 msg
= 'Episode %s is not yet available' % playlist_id
339 elif reason
== 'postAvailability' :
340 msg
= 'Episode %s is no longer available' % playlist_id
341 elif reason
== 'noMedia' :
342 msg
= 'Episode %s is not currently available' % playlist_id
344 msg
= 'Episode %s is not available: %s ' % ( playlist_id
, reason
)
345 raise ExtractorError ( msg
, expected
= True )
347 for item
in self
._ extract
_ items
( playlist
):
348 kind
= item
. get ( 'kind' )
349 if kind
!= 'programme' and kind
!= 'radioProgramme' :
351 title
= playlist
. find ( './ {http://bbc.co.uk/2008/emp/playlist} title' ). text
352 description
= playlist
. find ( './ {http://bbc.co.uk/2008/emp/playlist} summary' ). text
353 programme_id
= item
. get ( 'identifier' )
354 duration
= int ( item
. get ( 'duration' ))
355 formats
, subtitles
= self
._ download
_ media
_ selector
( programme_id
)
357 return programme_id
, title
, description
, duration
, formats
, subtitles
359 def _real_extract ( self
, url
):
360 group_id
= self
._ match
_ id
( url
)
362 webpage
= self
._ download
_ webpage
( url
, group_id
, 'Downloading video page' )
366 tviplayer
= self
._ search
_ regex
(
367 r
'mediator\.bind\(( {.+?} )\s*,\s*document\.getElementById' ,
368 webpage
, 'player' , default
= None )
371 player
= self
._ parse
_ json
( tviplayer
, group_id
). get ( 'player' , {})
372 duration
= int_or_none ( player
. get ( 'duration' ))
373 programme_id
= player
. get ( 'vpid' )
376 programme_id
= self
._ search
_ regex
(
377 r
'"vpid"\s*:\s*"([\da-z] {8} )"' , webpage
, 'vpid' , fatal
= False , default
= None )
380 formats
, subtitles
= self
._ download
_ media
_ selector
( programme_id
)
381 title
= self
._ og
_ search
_ title
( webpage
)
382 description
= self
._ search
_ regex
(
383 r
'<p class="[^"]*medium-description[^"]*">([^<]+)</p>' ,
384 webpage
, 'description' , fatal
= False )
386 programme_id
, title
, description
, duration
, formats
, subtitles
= self
._ download
_ playlist
( group_id
)
388 self
._ sort
_ formats
( formats
)
393 'description' : description
,
394 'thumbnail' : self
._ og
_ search
_ thumbnail
( webpage
, default
= None ),
395 'duration' : duration
,
397 'subtitles' : subtitles
,
401 class BBCNewsIE ( BBCCoUkIE
):
404 _VALID_URL
= r
'https?://(?:www\.)?bbc\.com/.+?/(?P<id>[^/]+)$'
406 mediaselector_url
= 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/ %s '
409 'url' : 'http://www.bbc.com/news/world-europe-32668511' ,
411 'id' : 'world-europe-32668511' ,
412 'title' : 'Russia stages massive WW2 parade despite Western boycott' ,
416 'url' : 'http://www.bbc.com/news/business-28299555' ,
418 'id' : 'business-28299555' ,
419 'title' : 'Farnborough Airshow: Video highlights' ,
423 'url' : 'http://www.bbc.com/news/world-europe-32041533' ,
428 'title' : 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV' ,
429 'description' : 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV' ,
431 'upload_date' : '20150324' ,
432 'uploader' : 'BBC News' ,
435 'skip_download' : True ,
438 'url' : 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu' ,
443 'title' : 'YPG: Tel Abyad \' \u0131n tamam\u0131 kontrol \xfc m \xfc zde' ,
444 'description' : 'YPG: Tel Abyad \' \u0131n tamam\u0131 kontrol \xfc m \xfc zde' ,
446 'upload_date' : '20150615' ,
447 'uploader' : 'BBC News' ,
450 'skip_download' : True ,
453 'url' : 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw' ,
458 'title' : 'Honduras militariza sus hospitales por nuevo esc \xe1 ndalo de corrupci \xf3 n' ,
459 'description' : 'Honduras militariza sus hospitales por nuevo esc \xe1 ndalo de corrupci \xf3 n' ,
461 'upload_date' : '20150619' ,
462 'uploader' : 'BBC News' ,
465 'skip_download' : True ,
469 def _real_extract ( self
, url
):
470 list_id
= self
._ match
_ id
( url
)
471 webpage
= self
._ download
_ webpage
( url
, list_id
)
473 list_title
= self
._ html
_ search
_ regex
( r
'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>' , webpage
, 'list title' )
475 pubdate
= self
._ html
_ search
_ regex
( r
'"datePublished":\s*"(\d+-\d+-\d+)' , webpage
, 'date' , default
= None )
477 pubdate
= pubdate
. replace ( '-' , '' )
482 # works with bbc.com/news/something-something-123456 articles
484 lambda m
: self
._ parse
_ json
( m
, list_id
),
485 re
. findall ( r
"data-media-meta='( {[^']+} )'" , webpage
)
489 # http://www.bbc.com/news/video_and_audio/international
490 # and single-video articles
491 masset
= self
._ html
_ search
_ regex
( r
'mediaAssetPage\.init\(\s*( {.+?} ), "/' , webpage
, 'mediaassets' , default
= None )
493 jmasset
= self
._ parse
_ json
( masset
, list_id
)
494 for key
, val
in jmasset
. get ( 'videos' ,{}). items ():
495 for skey
, sval
in val
. items ():
500 # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc}
501 # in http://www.bbc.com/news/video_and_audio/international
502 # prone to breaking if entries have sourceFiles list
504 lambda m
: self
._ parse
_ json
( m
, list_id
),
505 re
. findall ( r
"( {[^{} ]+image\" : {[^}
]+}[ ^
}]+}) ", webpage)
509 raise ExtractorError('No video found', expected=True)
512 programme_id = jent.get('externalId')
513 xml_url = jent.get('href')
515 title = jent.get('caption',list_title)
517 duration = parse_duration(jent.get('duration'))
518 description = list_title
519 if jent.get('caption'):
520 description += ' - ' + jent.get('caption')
522 if jent.has_key('image'):
523 thumbnail=jent['image'].get('href')
529 formats, subtitles = self._download_media_selector(programme_id)
530 elif jent.has_key('sourceFiles'):
531 # mediaselector not used at
532 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu
533 for key, val in jent['sourceFiles'].items():
535 'ext': val.get('encoding'),
536 'url': val.get('url'),
537 'filesize': int(val.get('filesize')),
542 # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml
543 xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)')
544 programme_id = self._search_regex(r'<mediator [^>]*identifier=" (.+ ?
) "', xml, 'playlist.sxml (externalId fallback)')
545 formats, subtitles = self._download_media_selector(programme_id)
547 if len(formats) == 0:
548 raise ExtractorError('unsupported json media entry. \n '+str(jent)+' \n ')
550 self._sort_formats(formats)
552 id = jent.get('id') if programme_id == None else programme_id
558 'uploader': 'BBC News',
559 'upload_date': pubdate,
561 'description': description,
562 'thumbnail': thumbnail,
563 'duration': duration,
565 'subtitles': subtitles,
569 return self.playlist_result(ret, list_id, list_title)
570 raise ExtractorError('No video found', expected=True)