]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/cspan.py
[br] Simplify
[yt-dlp.git] / youtube_dl / extractor / cspan.py
CommitLineData
ca9e7922
PH
1from __future__ import unicode_literals
2
aa0c8739
JMF
3import re
4
5from .common import InfoExtractor
6from ..utils import (
ca9e7922 7 unescapeHTML,
009a3408 8 find_xpath_attr,
aa0c8739
JMF
9)
10
ca9e7922 11
class CSpanIE(InfoExtractor):
    """Information extractor for program pages on c-span.org.

    The page URL carries a page id; the actual program id used by the
    player and metadata services is read from the page markup.
    """

    _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P<id>\d+)'
    IE_DESC = 'C-SPAN'
    _TEST = {
        'url': 'http://www.c-span.org/video/?313572-1/HolderonV',
        'md5': '8e44ce11f0f725527daccc453f553eb0',
        'info_dict': {
            'id': '315139',
            'ext': 'mp4',
            'title': 'Attorney General Eric Holder on Voting Rights Act Decision',
            'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in Shelby County v. Holder in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.',
        },
        'skip': 'Regularly fails on travis, for unknown reasons',
    }

    def _real_extract(self, url):
        """Extract the info dict for the C-SPAN program at *url*.

        Downloads the web page, the JSON player data and the flash XML
        document, and returns a dict with the keys ``id``, ``title``,
        ``url``, ``description`` and ``thumbnail``.  ``description`` and
        ``thumbnail`` are optional and are ``None`` when they cannot be
        found; every other lookup failure still aborts the extraction.
        """
        mobj = re.match(self._VALID_URL, url)
        page_id = mobj.group('id')
        webpage = self._download_webpage(url, page_id)
        # The id in the page URL is not the program id; the latter is
        # embedded in the page markup.
        video_id = self._search_regex(
            r'data-progid=\'(\d+)\'>', webpage, 'video id')

        # The description is optional metadata, so a failed match must not
        # abort the whole extraction (hence fatal=False).
        description = self._html_search_regex(
            [
                # The full description
                r'<div class=\'expandable\'>(.*?)<a href=\'#\'',
                # If the description is small enough the other div is not
                # present, otherwise this is a stripped version
                r'<p class=\'initial\'>(.*?)</p>'
            ],
            webpage, 'description', flags=re.DOTALL, fatal=False)

        info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id
        data = self._download_json(info_url, video_id)

        # Kept in its own name so the ``url`` parameter is not shadowed.
        video_url = unescapeHTML(data['video']['files'][0]['path']['#text'])

        doc = self._download_xml(
            'http://www.c-span.org/common/services/flashXml.php?programid=' + video_id,
            video_id)

        def find_string(name, optional=False):
            """Return the text of the ``<string name=...>`` element in *doc*.

            With ``optional=True`` a missing element yields ``None``
            instead of failing on the attribute access.
            """
            node = find_xpath_attr(doc, './/string', 'name', name)
            if node is None and optional:
                return None
            return node.text

        return {
            'id': video_id,
            'title': find_string('title'),
            'url': video_url,
            'description': description,
            'thumbnail': find_string('poster', optional=True),
        }