]>
Commit | Line | Data |
---|---|---|
38cbc40a | 1 | import re |
38cbc40a PH |
2 | |
3 | from .common import InfoExtractor | |
4 | from ..utils import ( | |
38cbc40a | 5 | compat_parse_qs, |
38cbc40a PH |
6 | compat_urllib_parse, |
7 | compat_urllib_request, | |
896d5b63 | 8 | determine_ext, |
38cbc40a PH |
9 | ExtractorError, |
10 | ) | |
11 | ||
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    Watch-page URLs are matched by ``_VALID_URL``; the first captured group is
    used as the video id.  Ids of the form ``yt-<id>`` and ``cb-<id>`` are
    delegated to the ``Youtube`` and ``ThePlatform`` extractors respectively;
    every other id is resolved by downloading the Metacafe watch page and
    scraping the media URL out of it.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
    _TESTS = [
        # Youtube video
        {
            u"add_ie": ["Youtube"],
            u"url": u"http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/",
            u"file": u"_aUehQsCQtM.mp4",
            u"info_dict": {
                u"upload_date": u"20090102",
                u"title": u"The Electric Company | \"Short I\" | PBS KIDS GO!",
                u"description": u"md5:2439a8ef6d5a70e380c22f5ad323e5a8",
                u"uploader": u"PBS",
                u"uploader_id": u"PBS"
            }
        },
        # Normal metacafe video
        {
            u'url': u'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/',
            u'md5': u'6e0bca200eaad2552e6915ed6fd4d9ad',
            u'info_dict': {
                u'id': u'11121940',
                u'ext': u'mp4',
                u'title': u'News: Stuff You Won\'t Do with Your PlayStation 4',
                u'uploader': u'ign',
                u'description': u'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
            },
        },
        # AnyClip video
        {
            u"url": u"http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/",
            u"file": u"an-dVVXnuY7Jh77J.mp4",
            u"info_dict": {
                u"title": u"The Andromeda Strain (1971): Stop the Bomb Part 3",
                u"uploader": u"anyclip",
                u"description": u"md5:38c711dd98f5bb87acf973d573442e67",
            },
        },
        # age-restricted video
        {
            u'url': u'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/',
            u'md5': u'98dde7c1a35d02178e8ab7560fe8bd09',
            u'info_dict': {
                u'id': u'5186653',
                u'ext': u'mp4',
                u'title': u'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.',
                u'uploader': u'Dwayne Pipe',
                u'description': u'md5:950bf4c581e2c059911fa3ffbe377e4b',
                u'age_limit': 18,
            },
        },
        # cbs video
        {
            u'url': u'http://www.metacafe.com/watch/cb-0rOxMBabDXN6/samsung_galaxy_note_2_samsungs_next_generation_phablet/',
            u'info_dict': {
                u'id': u'0rOxMBabDXN6',
                u'ext': u'flv',
                u'title': u'Samsung Galaxy Note 2: Samsung\'s next-generation phablet',
                u'description': u'md5:54d49fac53d26d5a0aaeccd061ada09d',
                u'duration': 129,
            },
            u'params': {
                # rtmp download
                u'skip_download': True,
            },
        },
    ]

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and submit the age confirmation form.

        Both requests are issued through ``self._download_webpage``; their
        response bodies are discarded.
        """
        # Retrieve disclaimer
        self.report_disclaimer()
        self._download_webpage(self._DISCLAIMER, None, False, u'Unable to retrieve disclaimer')

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        # The request body must be bytes on Python 3 (urllib rejects a str
        # payload).  urlencode() output is plain ASCII, so encoding it is
        # also harmless on Python 2.
        post_data = compat_urllib_parse.urlencode(disclaimer_form).encode('utf-8')
        request = compat_urllib_request.Request(self._FILTER_POST, post_data)
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        # report_age_confirmation is not defined in this class; presumably
        # inherited from InfoExtractor -- confirm against the base class.
        self.report_age_confirmation()
        self._download_webpage(request, None, False, u'Unable to confirm age')

    def _extract_media(self, webpage):
        """Locate the media URL inside a watch page.

        Three page layouts are tried in order: ``&mediaURL=`` query-style
        parameters (with an optional ``&gdaKey=``), an HTML5 ``<video src>``
        tag, and finally a ``flashvars`` attribute carrying a ``mediaData``
        entry.

        Returns a ``(video_url, video_ext)`` tuple.
        Raises ExtractorError when none of the layouts matches.
        """
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            media_url = compat_urllib_parse.unquote(mobj.group(1))
            # Use the shared helper rather than slicing the last three
            # characters, matching the flashvars branch below.
            video_ext = determine_ext(media_url)

            # Append the gdaKey if available
            key_match = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if key_match is None:
                return media_url, video_ext
            return '%s?__gda__=%s' % (media_url, key_match.group(1)), video_ext

        mobj = re.search(r'<video src="([^"]+)"', webpage)
        if mobj:
            return mobj.group(1), 'mp4'

        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            raise ExtractorError(u'Unable to extract media URL')
        mobj = re.search(
            r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"',
            vardict['mediaData'][0])
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        # Slashes in the captured URL are backslash-escaped; undo that.
        media_url = mobj.group('mediaURL').replace('\\/', '/')
        video_url = '%s?__gda__=%s' % (media_url, mobj.group('key'))
        return video_url, determine_ext(video_url)

    def _real_extract(self, url):
        """Extract video information for a Metacafe watch URL.

        Returns either a ``url_result`` delegating to another extractor (for
        ``yt-`` and ``cb-`` prefixed ids) or a ``'video'`` info dictionary
        built from the watch page.

        Raises ExtractorError when *url* does not match ``_VALID_URL`` or when
        no media URL can be found in the page.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # the video may come from an external site
        # (raw string: '\w' is not a valid escape in a plain string literal)
        m_external = re.match(r'^(\w{2})-(.*)$', video_id)
        if m_external is not None:
            prefix, ext_id = m_external.groups()
            # Check if video comes from YouTube
            if prefix == 'yt':
                return self.url_result('http://www.youtube.com/watch?v=%s' % ext_id, 'Youtube')
            # CBS videos use theplatform.com
            if prefix == 'cb':
                return self.url_result('theplatform:%s' % ext_id, 'ThePlatform')

        # Retrieve video webpage to extract further information
        req = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)

        # AnyClip videos require the flashversion cookie so that we get the link
        # to the mp4 file
        if video_id.startswith('an-'):
            req.headers['Cookie'] = 'flashVersion=0;'
        webpage = self._download_webpage(req, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        video_url, video_ext = self._extract_media(webpage)

        video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, u'title')
        description = self._og_search_description(webpage)
        video_uploader = self._html_search_regex(
            r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
            webpage, u'uploader nickname', fatal=False)

        # Pages flagged as restricted are reported with an age limit of 18.
        if re.search(r'"contentRating":"restricted"', webpage) is not None:
            age_limit = 18
        else:
            age_limit = 0

        return {
            '_type': 'video',
            'id': video_id,
            'url': video_url,
            'description': description,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_ext,
            'age_limit': age_limit,
        }