]>
Commit | Line | Data |
---|---|---|
7beb36a5 | 1 | import re |
7beb36a5 PH |
2 | import xml.etree.ElementTree |
3 | ||
4 | from .common import InfoExtractor | |
5 | from ..utils import ( | |
7beb36a5 | 6 | compat_urllib_parse_urlparse, |
87f78946 | 7 | determine_ext, |
7beb36a5 PH |
8 | |
9 | ExtractorError, | |
10 | ) | |
11 | ||
12 | ||
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com video, embed and /e/ URLs."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P<videoid>[0-9]+)/?(?P<shorttitle>.*)$'

    _TESTS = [{
        u'url': u'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe',
        u'file': u'6902724.mp4',
        u'md5': u'1264c12ad95dca142a9f0bf7968105a0',
        u'info_dict': {
            u'title': u'Comic-Con Cosplay Catastrophe',
            u'description': u'Fans get creative this year at San Diego. Too creative. And yes, that\'s really Joss Whedon.',
        },
    },
    {
        u'url': u'http://www.collegehumor.com/video/3505939/font-conference',
        u'file': u'3505939.mp4',
        u'md5': u'c51ca16b82bb456a4397987791a835f5',
        u'info_dict': {
            u'title': u'Font Conference',
            u'description': u'This video wasn\'t long enough, so we made it double-spaced.',
        },
    }]

    def _real_extract(self, url):
        """Extract the video information for a collegehumor.com URL.

        Downloads the "moogaloop" metadata XML for the video id found in
        *url*.  If the metadata carries a ``youtubeID`` element, the result
        of ``self.url_result(<that id>, 'Youtube')`` is returned instead of
        an info dict.  Otherwise the description, title (``caption``),
        thumbnail and media location (``file``) are read from the metadata.
        When the media location is an F4M manifest, the manifest is
        downloaded and the final URL is assembled from the thumbnail's
        scheme/host and the manifest's ``id``; otherwise the location is
        used directly.

        Returns a dict with the keys ``id``, ``uploader``, ``upload_date``,
        ``description``, ``title``, ``thumbnail``, ``url`` and ``ext``.

        Raises ExtractorError if the URL does not match ``_VALID_URL`` or
        if the metadata or the manifest lacks a required element.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        metaXml = self._download_webpage(
            xmlUrl, video_id,
            u'Downloading info XML',
            u'Unable to download video info XML')

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            youtubeIdNode = videoNode.find('./youtubeID')
            if youtubeIdNode is not None:
                return self.url_result(youtubeIdNode.text, 'Youtube')
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            next_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # An empty <file/> element yields None (or ''), which would otherwise
        # crash on .endswith() or produce an info dict without a usable URL.
        if not next_url:
            raise ExtractorError(u'Invalid metadata XML file')

        if next_url.endswith(u'manifest.f4m'):
            manifest_url = next_url + '?hdcore=2.10.3'
            manifestXml = self._download_webpage(
                manifest_url, video_id,
                u'Downloading XML manifest',
                u'Unable to download XML manifest')

            adoc = xml.etree.ElementTree.fromstring(manifestXml)
            media_nodes = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')
            id_nodes = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')
            # Require the same pieces the manifest was always expected to
            # have (a <media url=...> node and a non-empty <id>), but report
            # their absence as an ExtractorError instead of a raw
            # IndexError/KeyError/TypeError.
            if (not media_nodes
                    or 'url' not in media_nodes[0].attrib
                    or not id_nodes
                    or not id_nodes[0].text):
                raise ExtractorError(u'Invalid manifest file')
            # Kept separate from video_id so the page's id is not shadowed.
            manifest_id = id_nodes[0].text

            # The final URL borrows scheme and host from the thumbnail URL,
            # so an empty <thumbnail/> makes the metadata unusable here.
            if not info['thumbnail']:
                raise ExtractorError(u'Invalid metadata XML file')
            url_pr = compat_urllib_parse_urlparse(info['thumbnail'])
            # NOTE(review): the manifest id has its last two characters
            # dropped and '.csmil' and commas removed; presumably this turns
            # a multi-bitrate manifest path into a single-file path - confirm
            # against a live manifest.
            media_path = manifest_id[:-2].replace('.csmil', '').replace(',', '')
            info['url'] = url_pr.scheme + '://' + url_pr.netloc + media_path
            info['ext'] = 'mp4'
        else:
            # Old-style direct links
            info['url'] = next_url
            info['ext'] = determine_ext(info['url'])

        return info