X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/2098aee7d6facb9de2253f9b71a2dfa9b932b4cb..5886b38d73c54239c85c3e0d8e7c1585d1bbb7da:/youtube_dl/extractor/bbc.py
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index ce99a34ab..2dfcee98d 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -10,7 +10,6 @@
int_or_none,
parse_duration,
parse_iso8601,
- remove_end,
unescapeHTML,
)
from ..compat import (
@@ -86,7 +85,7 @@ class BBCCoUkIE(InfoExtractor):
'id': 'b00yng1d',
'ext': 'flv',
'title': 'The Voice UK: Series 3: Blind Auditions 5',
- 'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.",
+ 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
'duration': 5100,
},
'params': {
@@ -193,6 +192,19 @@ class BBCCoUkIE(InfoExtractor):
# rtmp download
'skip_download': True,
},
+ }, {
+ # compact player (https://github.com/rg3/youtube-dl/issues/8147)
+ 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
+ 'info_dict': {
+ 'id': 'p028bfkj',
+ 'ext': 'flv',
+ 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
+ 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
}, {
'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
'only_matching': True,
@@ -482,9 +494,11 @@ def _real_extract(self, url):
if programme_id:
formats, subtitles = self._download_media_selector(programme_id)
title = self._og_search_title(webpage, default=None) or self._html_search_regex(
- r'
]+class="info"[^>]*>\s*
(.+?)
'), webpage, 'title')
description = self._search_regex(
- r'
([^<]+)
',
+ (r'
([^<]+)
',
+ r'
]+class="info_+synopsis"[^>]*>([^<]+)
'),
webpage, 'description', default=None)
if not description:
description = self._html_search_meta('description', webpage)
@@ -546,7 +560,7 @@ class BBCIE(BBCCoUkIE):
'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
'info_dict': {
'id': '3662a707-0af9-3149-963f-47bea720b460',
- 'title': 'BBC Blogs - Adam Curtis - BUGGER',
+ 'title': 'BUGGER',
},
'playlist_count': 18,
}, {
@@ -655,9 +669,17 @@ class BBCIE(BBCCoUkIE):
'url': 'http://www.bbc.com/sport/0/football/34475836',
'info_dict': {
'id': '34475836',
- 'title': 'What Liverpool can expect from Klopp',
+ 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
},
'playlist_count': 3,
+ }, {
+ # school report article with single video
+ 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
+ 'info_dict': {
+ 'id': '35744779',
+ 'title': 'School which breaks down barriers in Jerusalem',
+ },
+ 'playlist_count': 1,
}, {
# single video with playlist URL from weather section
'url': 'http://www.bbc.com/weather/features/33601775',
@@ -718,19 +740,19 @@ def _real_extract(self, url):
webpage = self._download_webpage(url, playlist_id)
- timestamp = None
- playlist_title = None
- playlist_description = None
+ json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)
+ timestamp = json_ld_info.get('timestamp')
+
+ playlist_title = json_ld_info.get('title')
+ if not playlist_title:
+ playlist_title = self._og_search_title(
+ webpage, default=None) or self._html_search_regex(
+ r'
(.+?)', webpage, 'playlist title', default=None)
+ if playlist_title:
+ playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
- ld = self._parse_json(
- self._search_regex(
- r'(?s)',
- webpage, 'ld json', default='{}'),
- playlist_id, fatal=False)
- if ld:
- timestamp = parse_iso8601(ld.get('datePublished'))
- playlist_title = ld.get('headline')
- playlist_description = ld.get('articleBody')
+ playlist_description = json_ld_info.get(
+ 'description') or self._og_search_description(webpage, default=None)
if not timestamp:
timestamp = parse_iso8601(self._search_regex(
@@ -791,8 +813,6 @@ def _real_extract(self, url):
playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))
if entries:
- playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News')
- playlist_description = playlist_description or self._og_search_description(webpage, default=None)
return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
# single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
@@ -823,10 +843,6 @@ def _real_extract(self, url):
'subtitles': subtitles,
}
- playlist_title = self._html_search_regex(
- r'
(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'playlist title')
- playlist_description = self._og_search_description(webpage, default=None)
-
def extract_all(pattern):
return list(filter(None, map(
lambda s: self._parse_json(s, playlist_id, fatal=False),
@@ -926,7 +942,7 @@ def extract_all(pattern):
class BBCCoUkArticleIE(InfoExtractor):
- _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P
[a-zA-Z0-9]+)'
+ _VALID_URL = r'https?://www.bbc.co.uk/programmes/articles/(?P[a-zA-Z0-9]+)'
IE_NAME = 'bbc.co.uk:article'
IE_DESC = 'BBC articles'