-from __future__ import unicode_literals
-
import re
from .common import InfoExtractor
+from .senategov import SenateISVPIE
+from .ustream import UstreamIE
+from ..compat import compat_HTMLParseError
from ..utils import (
- determine_ext,
ExtractorError,
+ determine_ext,
extract_attributes,
find_xpath_attr,
get_element_by_attribute,
get_element_by_class,
int_or_none,
+ join_nonempty,
js_to_json,
merge_dicts,
parse_iso8601,
+ parse_qs,
smuggle_url,
str_to_int,
unescapeHTML,
)
-from .senateisvp import SenateISVPIE
-from .ustream import UstreamIE
class CSpanIE(InfoExtractor):
'ext': 'mp4',
'title': 'CSPAN - International Health Care Models',
'description': 'md5:7a985a2d595dba00af3d9c9f0783c967',
- }
+ },
}, {
'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall',
'info_dict': {
},
'params': {
'skip_download': True, # m3u8 downloads
- }
+ },
}, {
# Ustream embedded video
'url': 'https://www.c-span.org/video/?114917-1/armed-services',
ext = 'vtt'
subtitle['ext'] = ext
ld_info = self._search_json_ld(webpage, video_id, default={})
- title = get_element_by_class('video-page-title', webpage) or \
- self._og_search_title(webpage)
+ try:
+ title = get_element_by_class('video-page-title', webpage)
+ except compat_HTMLParseError:
+ title = None
+ if title is None:
+ title = self._og_search_title(webpage)
description = get_element_by_attribute('itemprop', 'description', webpage) or \
self._html_search_meta(['og:description', 'description'], webpage)
return merge_dicts(info, ld_info, {
# Obsolete
# We first look for clipid, because clipprog always appears before
- patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')]
+ patterns = [rf'id=\'clip({t})\'\s*value=\'([0-9]+)\'' for t in ('id', 'prog')]
results = list(filter(None, (re.search(p, webpage) for p in patterns)))
if results:
matches = results[0]
video_id = m.group('id')
video_type = 'program' if m.group('type') == 'prog' else 'clip'
else:
- senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
+ senate_isvp_url = SenateISVPIE._extract_url(webpage)
if senate_isvp_url:
title = self._og_search_title(webpage)
surl = smuggle_url(senate_isvp_url, {'force_title': title})
return d.get(attr, {}).get('#text')
data = self._download_json(
- 'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id),
+ f'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5={video_type}&id={video_id}',
video_id)['video']
if data['@status'] != 'Success':
- raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True)
+ raise ExtractorError('{} said: {}'.format(self.IE_NAME, get_text_attr(data, 'error')), expected=True)
doc = self._download_xml(
- 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id),
+ f'http://www.c-span.org/common/services/flashXml.php?{video_type}id={video_id}',
video_id)
description = self._html_search_meta('description', webpage)
formats = []
for quality in f.get('qualities', []):
formats.append({
- 'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')),
+ 'format_id': '{}-{}p'.format(get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')),
'url': unescapeHTML(get_text_attr(quality, 'file')),
'height': int_or_none(get_text_attr(quality, 'height')),
'tbr': int_or_none(get_text_attr(quality, 'bitrate')),
continue
formats = self._extract_m3u8_formats(
path, video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path, }]
+ m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path}]
add_referer(formats)
- self._sort_formats(formats)
entries.append({
- 'id': '%s_%d' % (video_id, partnum + 1),
+ 'id': f'{video_id}_{partnum + 1}',
'title': (
title if len(files) == 1 else
- '%s part %d' % (title, partnum + 1)),
+ f'{title} part {partnum + 1}'),
'formats': formats,
'description': description,
'thumbnail': thumbnail,
'subtitles': {
'en': [{
'url': capfile,
- 'ext': determine_ext(capfile, 'dfxp')
+ 'ext': determine_ext(capfile, 'dfxp'),
}],
} if capfile else None,
})
'title': title,
'id': 'c' + video_id if video_type == 'clip' else video_id,
}
+
+
+class CSpanCongressIE(InfoExtractor):
+ # Extractor for C-SPAN "congress" pages (c-span.org/congress/...).
+ # The player configuration is read from the inline `jwsetup` JavaScript
+ # object embedded in the page and handed to the JW Player parsing helper.
+ # Matches http/https, with or without the `www.` prefix; everything after
+ # `/congress/` (including the query string) is left unconstrained.
+ _VALID_URL = r'https?://(?:www\.)?c-span\.org/congress/'
+ _TESTS = [{
+ 'url': 'https://www.c-span.org/congress/?chamber=house&date=2017-12-13&t=1513208380',
+ 'info_dict': {
+ 'id': 'house_2017-12-13',
+ 'title': 'Congressional Chronicle - Members of Congress, Hearings and More',
+ 'description': 'md5:54c264b7a8f219937987610243305a84',
+ 'thumbnail': r're:https://ximage.c-spanvideo.org/.+',
+ 'ext': 'mp4',
+ },
+ }]
+
+ def _real_extract(self, url):
+ # Build the info dict for a congress page.
+ #
+ # The video id is "<chamber>_<date>": both parts come from the URL query
+ # string, with the chamber defaulting to 'senate' and the date falling
+ # back to the value found in the page's player script when the URL has
+ # no `date` parameter.
+ query = parse_qs(url)
+ # Each query value is a list; take the first `date` or None if absent.
+ video_date = query.get('date', [None])[0]
+ # presumably join_nonempty drops the missing date so the id is just the
+ # chamber in that case - confirm against the helper's implementation.
+ video_id = join_nonempty(query.get('chamber', ['senate'])[0], video_date, delim='_')
+ webpage = self._download_webpage(url, video_id)
+ if not video_date:
+ # No date in the URL: look for `jwsetup.clipprogdate = 'YYYY-MM-DD';`
+ # in the page and append it to the id when present.
+ # NOTE(review): the '.' after `jwsetup` is unescaped, so it matches
+ # any single character, not only a literal dot.
+ jwp_date = re.search(r'jwsetup.clipprogdate = \'(?P<date>\d{4}-\d{2}-\d{2})\';', webpage)
+ if jwp_date:
+ video_id = f'{video_id}_{jwp_date.group("date")}'
+ # Capture the `jwsetup = {...};` object literal and convert it from
+ # JavaScript syntax to JSON before parsing. The `[^;]+` part of the
+ # pattern cannot cross a ';', so the captured object is assumed to
+ # contain no semicolons.
+ jwplayer_data = self._parse_json(
+ self._search_regex(r'jwsetup\s*=\s*({(?:.|\n)[^;]+});', webpage, 'player config'),
+ video_id, transform_source=js_to_json)
+
+ title = self._generic_title('', webpage)
+ # Prefer the Open Graph description, falling back to the plain
+ # `description` meta tag; both lookups yield None instead of raising.
+ description = (self._og_search_description(webpage, default=None)
+ or self._html_search_meta('description', webpage, 'description', default=None))
+
+ # Keys listed after the unpacked JW Player result override any values of
+ # the same name that the helper produced.
+ # NOTE(review): the third positional argument (False) is presumably the
+ # helper's "require title" flag - confirm against its signature.
+ return {
+ **self._parse_jwplayer_data(jwplayer_data, video_id, False),
+ # Keep only the text before the first '|' and collapse whitespace runs.
+ 'title': re.sub(r'\s+', ' ', title.split('|')[0]).strip(),
+ 'description': description,
+ 'http_headers': {'Referer': 'https://www.c-span.org/'},
+ }