]>
Commit | Line | Data |
---|---|---|
cfe50f04 JMF |
1 | # encoding: utf-8 |
2 | ||
9b122384 PH |
3 | import os |
4 | import re | |
5 | ||
6 | from .common import InfoExtractor | |
7 | from ..utils import ( | |
8 | compat_urllib_error, | |
9 | compat_urllib_parse, | |
10 | compat_urllib_request, | |
a5caba1e | 11 | compat_urlparse, |
9b122384 PH |
12 | |
13 | ExtractorError, | |
aa94a6d3 | 14 | HEADRequest, |
9d4660ca PH |
15 | smuggle_url, |
16 | unescapeHTML, | |
42393ce2 PH |
17 | unified_strdate, |
18 | url_basename, | |
9b122384 | 19 | ) |
cfe50f04 | 20 | from .brightcove import BrightcoveIE |
c0d0b01f | 21 | from .ooyala import OoyalaIE |
9b122384 | 22 | |
0838239e | 23 | |
class GenericIE(InfoExtractor):
    """Last-resort extractor that tries to find a video on an arbitrary page.

    ``_VALID_URL`` matches every URL.  Extraction proceeds in three stages:

    1. Probe the URL with a HEAD request, to follow redirects and to
       recognise direct links to audio/video files by their Content-Type.
    2. Download the page and look for known embedded players (Brightcove,
       Vimeo, YouTube, Dailymotion, Wistia, blip.tv, Bandcamp, Vevo, Ooyala,
       Aparat) and hand over to the result they point at.
    3. Fall back to a list of regular-expression heuristics that look for a
       bare media URL in the page source.
    """

    IE_DESC = u'Generic downloader that works on some sites'
    _VALID_URL = r'.*'
    IE_NAME = u'generic'
    _TESTS = [
        {
            u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
            u'file': u'13601338388002.mp4',
            u'md5': u'6e15c93721d7ec9e9ca3fdbf07982cfd',
            u'info_dict': {
                u"uploader": u"www.hodiho.fr",
                u"title": u"R\u00e9gis plante sa Jeep"
            }
        },
        # embedded vimeo video
        {
            u'add_ie': ['Vimeo'],
            u'url': u'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references',
            u'file': u'22444065.mp4',
            u'md5': u'2903896e23df39722c33f015af0666e2',
            u'info_dict': {
                u'title': u'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011',
                u"uploader_id": u"skillsmatter",
                u"uploader": u"Skills Matter",
            }
        },
        # bandcamp page with custom domain
        {
            u'add_ie': ['Bandcamp'],
            u'url': u'http://bronyrock.com/track/the-pony-mash',
            u'file': u'3235767654.mp3',
            u'info_dict': {
                u'title': u'The Pony Mash',
                u'uploader': u'M_Pallante',
            },
            u'skip': u'There is a limit of 200 free downloads / month for the test song',
        },
        # embedded brightcove video
        # it also tests brightcove videos that need to set the 'Referer' in the
        # http requests
        {
            u'add_ie': ['Brightcove'],
            u'url': u'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
            u'info_dict': {
                u'id': u'2765128793001',
                u'ext': u'mp4',
                u'title': u'Le cours de bourse : l’analyse technique',
                u'description': u'md5:7e9ad046e968cb2d1114004aba466fd9',
                u'uploader': u'BFM BUSINESS',
            },
            u'params': {
                u'skip_download': True,
            },
        },
        # Direct link to a video
        {
            u'url': u'http://media.w3.org/2010/05/sintel/trailer.mp4',
            u'file': u'trailer.mp4',
            u'md5': u'67d406c2bcb6af27fa886f31aa934bbe',
            u'info_dict': {
                u'id': u'trailer',
                u'title': u'trailer',
                u'upload_date': u'20100513',
            }
        },
        # ooyala video
        {
            u'url': u'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
            u'md5': u'5644c6ca5d5782c1d0d350dad9bd840c',
            u'info_dict': {
                u'id': u'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
                u'ext': u'mp4',
                u'title': u'2cc213299525360.mov',  # that's what we get
            },
        },
    ]

    def report_download_webpage(self, video_id):
        """Report webpage download.

        Outside of test runs (the 'test' downloader param), first warn that
        the generic extractor is being used as a fallback.
        """
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect to new_url is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _send_head(self, url):
        """Send a HEAD request for url and return the response object.

        Redirects with status 301, 302, 303 or 307 are followed with further
        HEAD requests.  If the server answers 405, the request is repeated as
        a regular (GET) request.  The caller is responsible for closing the
        returned response.

        Raises ExtractorError if the opener does not produce a response.
        """

        def headers_without_body_info(req):
            # The follow-up request carries no body, so headers that
            # describe one must not be forwarded.
            return dict((k, v) for k, v in req.headers.items()
                        if k.lower() not in ("content-length", "content-type"))

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HEADRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # The origin_req_host attribute is used instead of
                    # get_origin_req_host(), which was removed in Python 3.4.
                    return HEADRequest(newurl,
                                       headers=headers_without_body_info(req),
                                       origin_req_host=req.origin_req_host,
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and release the error response before retrying.
                fp.read()
                fp.close()

                return self.parent.open(compat_urllib_request.Request(
                    req.get_full_url(),
                    headers=headers_without_body_info(req),
                    origin_req_host=req.origin_req_host,
                    unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HEADRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        return response

    def _probe_with_head(self, url, video_id):
        """Inspect url with a HEAD request before downloading the page.

        Returns a url_result for the redirect target if the request was
        redirected, an info dict if the Content-Type denotes an audio/video
        file (or application/ogg), and None otherwise, including when the
        probe fails with an HTTP error.
        """
        try:
            response = self._send_head(url)
        except compat_urllib_error.HTTPError:
            # Some servers reject HEAD requests or our user agent; carry on
            # with the regular page download in that case.
            return None

        # Only the final URL and the headers are needed; release the
        # connection right away (after a 405 fallback this is a full GET
        # response whose body is never read).
        try:
            new_url = response.geturl()
            content_type = response.headers.get('Content-Type', '')
            upload_date = response.headers.get('Last-Modified')
        finally:
            response.close()

        # Check for redirect
        if url != new_url:
            self.report_following_redirect(new_url)
            return self.url_result(new_url)

        # Check for direct link to a video
        m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
        if m is None:
            return None
        if upload_date:
            upload_date = unified_strdate(upload_date)
        return {
            'id': video_id,
            'title': os.path.splitext(url_basename(url))[0],
            'formats': [{
                'format_id': m.group('format_id'),
                'url': url,
                'vcodec': u'none' if m.group('type') == 'audio' else None
            }],
            'upload_date': upload_date,
        }

    def _find_embedded_player(self, webpage, url, video_id, video_title, video_uploader):
        """Look for known third-party players embedded in webpage.

        Returns the result to hand back from _real_extract for the first
        player found, or None if none of the known embeds is present.
        """
        # Look for BrightCove:
        bc_url = BrightcoveIE._extract_brightcove_url(webpage)
        if bc_url is not None:
            self.to_screen(u'Brightcove video detected.')
            return self.url_result(bc_url, 'Brightcove')

        # Look for embedded Vimeo player
        mobj = re.search(
            r'<iframe[^>]+?src="(https?://player\.vimeo\.com/video/.+?)"', webpage)
        if mobj:
            player_url = unescapeHTML(mobj.group(1))
            # Pass the embedding page along as the Referer
            surl = smuggle_url(player_url, {'Referer': url})
            return self.url_result(surl, 'Vimeo')

        # Look for embedded YouTube player
        matches = re.findall(r'''(?x)
            (?:<iframe[^>]+?src=|embedSWF\(\s*)
            (["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/
                (?:embed|v)/.+?)
            \1''', webpage)
        if matches:
            # Each findall item is (quote, url)
            urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')
                     for tuppl in matches]
            return self.playlist_result(
                urlrs, playlist_id=video_id, playlist_title=video_title)

        # Look for embedded Dailymotion player
        matches = re.findall(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
        if matches:
            urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion')
                     for tuppl in matches]
            return self.playlist_result(
                urlrs, playlist_id=video_id, playlist_title=video_title)

        # Look for embedded Wistia player
        match = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
        if match:
            return {
                '_type': 'url_transparent',
                'url': unescapeHTML(match.group('url')),
                'ie_key': 'Wistia',
                'uploader': video_uploader,
                'title': video_title,
                'id': video_id,
            }

        # Look for embedded blip.tv player
        mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
        if mobj:
            return self.url_result('http://blip.tv/seo/-' + mobj.group(1), 'BlipTV')
        mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)([a-zA-Z0-9]+)', webpage)
        if mobj:
            # The embed only carries a player token; the episode id has to
            # be read from the player page.
            player_url = 'http://blip.tv/play/%s.x?p=1' % mobj.group(1)
            player_page = self._download_webpage(player_url, mobj.group(1))
            blip_video_id = self._search_regex(r'data-episode-id="(\d+)', player_page, u'blip_video_id', fatal=False)
            if blip_video_id:
                return self.url_result('http://blip.tv/seo/-' + blip_video_id, 'BlipTV')

        # Look for Bandcamp pages with custom domain
        mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
        if mobj is not None:
            burl = unescapeHTML(mobj.group(1))
            # Don't set the extractor because it can be a track url or an album
            return self.url_result(burl)

        # Look for embedded Vevo player
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
        if mobj is not None:
            return self.url_result(unescapeHTML(mobj.group('url')))

        # Look for Ooyala videos
        mobj = re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=([^"&]+)', webpage)
        if mobj is not None:
            return OoyalaIE._build_url_result(mobj.group(1))

        # Look for Aparat videos
        mobj = re.search(r'<iframe src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
        if mobj is not None:
            return self.url_result(unescapeHTML(mobj.group(1)), 'Aparat')

        return None

    @staticmethod
    def _search_media_url(webpage):
        """Search webpage for a bare media URL using a chain of heuristics.

        Returns the match object of the first heuristic that matches (the
        URL is in group 1), or None if none of them matches.
        """
        simple_patterns = (
            # Start with something easy: JW Player in SWFObject
            r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)',
            # Broaden the search a little bit
            r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)',
            # Broaden the search a little bit: JWPlayer JS loader
            r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"]*)',
            # Try to find twitter cards info
            r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"',
        )
        for pattern in simple_patterns:
            mobj = re.search(pattern, webpage)
            if mobj is not None:
                return mobj

        # We look for Open Graph info:
        # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
        m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
        # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
        if m_video_type is not None:
            mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
            if mobj is not None:
                return mobj

        # HTML5 video
        return re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)

    def _real_extract(self, url):
        """Extract video information from an arbitrary URL.

        Returns a url_result, playlist result or info dict depending on what
        is found.  Raises ExtractorError if the page cannot be downloaded or
        no video can be located in it.
        """
        parsed_url = compat_urlparse.urlparse(url)
        if not parsed_url.scheme:
            self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
            return self.url_result('http://' + url)
        # Provisional id: last URL segment without its extension
        video_id = os.path.splitext(url.split('/')[-1])[0]

        head_result = self._probe_with_head(url, video_id)
        if head_result is not None:
            return head_result

        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Failed to download URL: %s' % url)

        self.report_extraction(video_id)

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(
            r'(?s)<title>(.*?)</title>', webpage, u'video title',
            default=u'video')

        # video uploader is domain name; taken from the parsed URL so that
        # URLs without a path or with a non-http scheme do not yield the
        # scheme itself (e.g. "http:") as uploader
        video_uploader = parsed_url.netloc

        embed_result = self._find_embedded_player(
            webpage, url, video_id, video_title, video_uploader)
        if embed_result is not None:
            return embed_result

        mobj = self._search_media_url(webpage)
        if mobj is None:
            raise ExtractorError(u'Unsupported URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Did not find a valid video URL at %s' % url)

        # Resolve relative media URLs against the page URL
        video_url = compat_urlparse.urljoin(url, mobj.group(1))

        # The final id is the unquoted file name of the media URL without
        # its extension
        video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
        video_id = os.path.splitext(video_id)[0]

        return {
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'title': video_title,
        }