]>
Commit | Line | Data |
---|---|---|
2bfeee69 | 1 | # coding: utf-8 |
6c20a0bb | 2 | from __future__ import unicode_literals |
95ad9ce5 | 3 | |
4c54b89e S |
4 | import json |
5 | import os | |
4c54b89e S |
6 | import subprocess |
7 | import tempfile | |
17f8deeb | 8 | |
011da618 S |
9 | from ..compat import ( |
10 | compat_urlparse, | |
11 | compat_kwargs, | |
12 | ) | |
9e3c2f1d | 13 | from ..utils import ( |
4c54b89e | 14 | check_executable, |
4c54b89e | 15 | encodeArgument, |
9e3c2f1d | 16 | ExtractorError, |
4c54b89e S |
17 | get_exe_version, |
18 | is_outdated_version, | |
19 | std_headers, | |
f5b1bca9 | 20 | process_communicate_or_kill, |
9e3c2f1d | 21 | ) |
2bfeee69 YCH |
22 | |
23 | ||
4c54b89e S |
def cookie_to_dict(cookie):
    """Convert a cookiejar-style cookie object into a plain dict.

    `name` and `value` are always copied.  `port`, `domain` and `path` are
    copied only when the matching `*_specified` flag on the cookie is truthy;
    `expires`, `secure` and `discard` are copied only when they are not None.
    `httponly` is set to True when the cookie reports a non-standard
    attribute spelled `httpOnly`, `httponly` or `HttpOnly`.
    """
    result = {
        'name': cookie.name,
        'value': cookie.value,
    }

    # Attributes guarded by an explicit "<attr>_specified" flag.
    for key in ('port', 'domain', 'path'):
        if getattr(cookie, key + '_specified'):
            result[key] = getattr(cookie, key)

    # Attributes that are simply skipped when unset (None).
    for key in ('expires', 'secure', 'discard'):
        attr = getattr(cookie, key)
        if attr is not None:
            result[key] = attr

    try:
        http_only = any(
            cookie.has_nonstandard_attr(spelling)
            for spelling in ('httpOnly', 'httponly', 'HttpOnly'))
    except TypeError:
        # The lookup itself failed; treat the flag as absent.
        http_only = False
    if http_only:
        result['httponly'] = True

    return result
49 | ||
50 | ||
def cookie_jar_to_list(cookie_jar):
    """Return a list with one `cookie_to_dict` result per cookie in `cookie_jar`."""
    converted = []
    for entry in cookie_jar:
        converted.append(cookie_to_dict(entry))
    return converted
53 | ||
54 | ||
class PhantomJSwrapper(object):
    """PhantomJS wrapper class

    This class is experimental.

    It renders a small PhantomJS driver script from `_TEMPLATE`, runs it in
    a `phantomjs` subprocess and exchanges the page HTML and the cookies with
    that process through temporary files.
    """

    # JavaScript driver script.  It is rendered with `str.format`, hence the
    # doubled braces.  `{cookies}`, `{html}`, `{ua}` and `{url}` are placed
    # inside double-quoted JS string literals and must be escaped accordingly
    # (see `_js_string_escape`); `{jscode}` is inserted verbatim.
    _TEMPLATE = r'''
        phantom.onError = function(msg, trace) {{
          var msgStack = ['PHANTOM ERROR: ' + msg];
          if(trace && trace.length) {{
            msgStack.push('TRACE:');
            trace.forEach(function(t) {{
              msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line
                + (t.function ? ' (in function ' + t.function +')' : ''));
            }});
          }}
          console.error(msgStack.join('\n'));
          phantom.exit(1);
        }};
        var page = require('webpage').create();
        var fs = require('fs');
        var read = {{ mode: 'r', charset: 'utf-8' }};
        var write = {{ mode: 'w', charset: 'utf-8' }};
        JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{
          phantom.addCookie(x);
        }});
        page.settings.resourceTimeout = {timeout};
        page.settings.userAgent = "{ua}";
        page.onLoadStarted = function() {{
          page.evaluate(function() {{
            delete window._phantom;
            delete window.callPhantom;
          }});
        }};
        var saveAndExit = function() {{
          fs.write("{html}", page.content, write);
          fs.write("{cookies}", JSON.stringify(phantom.cookies), write);
          phantom.exit();
        }};
        page.onLoadFinished = function(status) {{
          if(page.url === "") {{
            page.setContent(fs.read("{html}", read), "{url}");
          }}
          else {{
            {jscode}
          }}
        }};
        page.open("");
    '''

    # Logical names of the temporary files shared with the PhantomJS process.
    _TMP_FILE_NAMES = ['script', 'html', 'cookies']

    @staticmethod
    def _version():
        """Return the version reported by the `phantomjs` executable."""
        return get_exe_version('phantomjs', version_re=r'([0-9.]+)')

    def __init__(self, extractor, required_version=None, timeout=10000):
        """Locate PhantomJS and create the temporary exchange files.

        Params:
            extractor: extractor used for downloading, cookies and output
            required_version: optional; a warning is reported when the
                              installed PhantomJS is older than this
            timeout: value written to `page.settings.resourceTimeout`

        Raises ExtractorError when no `phantomjs` executable is found.
        """
        # Assigned first so that __del__ works even if the constructor fails.
        self._TMP_FILES = {}

        self.exe = check_executable('phantomjs', ['-v'])
        if not self.exe:
            raise ExtractorError('PhantomJS executable not found in PATH, '
                                 'download it from http://phantomjs.org',
                                 expected=True)

        self.extractor = extractor

        if required_version:
            version = self._version()
            if is_outdated_version(version, required_version):
                self.extractor._downloader.report_warning(
                    'Your copy of PhantomJS is outdated, update it to version '
                    '%s or newer if you encounter any errors.' % required_version)

        self.options = {
            'timeout': timeout,
        }
        for name in self._TMP_FILE_NAMES:
            # delete=False: the file must outlive this handle because it is
            # reopened by name, both here and by the PhantomJS process.
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()
            self._TMP_FILES[name] = tmp

    def __del__(self):
        """Best-effort removal of the temporary files."""
        for name in self._TMP_FILE_NAMES:
            try:
                os.remove(self._TMP_FILES[name].name)
            except (IOError, OSError, KeyError, AttributeError):
                # Already removed, never created, or the instance was only
                # partially constructed; nothing left to clean up.
                pass

    def _save_cookies(self, url):
        """Dump the downloader's cookie jar as JSON into the cookies file.

        Cookies without a path get '/', cookies without a domain get the
        network location of `url`.
        """
        cookies = cookie_jar_to_list(self.extractor._downloader.cookiejar)
        for cookie in cookies:
            if 'path' not in cookie:
                cookie['path'] = '/'
            if 'domain' not in cookie:
                cookie['domain'] = compat_urlparse.urlparse(url).netloc
        with open(self._TMP_FILES['cookies'].name, 'wb') as f:
            f.write(json.dumps(cookies).encode('utf-8'))

    def _load_cookies(self):
        """Read the cookies file written by PhantomJS and hand every cookie
        to the extractor via `_set_cookie`."""
        with open(self._TMP_FILES['cookies'].name, 'rb') as f:
            cookies = json.loads(f.read().decode('utf-8'))
        for cookie in cookies:
            # .get(): do not fail on a cookie entry that lacks the key.
            if cookie.get('httponly') is True:
                cookie['rest'] = {'httpOnly': None}
            if 'expiry' in cookie:
                cookie['expire_time'] = cookie['expiry']
            self.extractor._set_cookie(**compat_kwargs(cookie))

    @staticmethod
    def _js_string_escape(value):
        """Escape `value` for use between double quotes in the JS template.

        A JSON string literal is also a valid JavaScript string literal, so
        the JSON encoder is used and its surrounding quotes are stripped.
        """
        return json.dumps(value)[1:-1]

    def _render_script(self, url, user_agent, jscode):
        """Return the PhantomJS driver script for one `get` call."""
        # Work on a copy so the per-call values never leak into self.options.
        replaces = dict(self.options)
        replaces['url'] = self._js_string_escape(url)
        replaces['ua'] = self._js_string_escape(user_agent)
        replaces['jscode'] = jscode
        for name in self._TMP_FILE_NAMES:
            replaces[name] = self._js_string_escape(self._TMP_FILES[name].name)
        return self._TEMPLATE.format(**replaces)

    def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers=None, jscode='saveAndExit();'):
        """
        Downloads webpage (if needed) and executes JS

        Params:
            url: website url
            html: optional, html code of website
            video_id: video id
            note: optional, displayed when downloading webpage
            note2: optional, displayed when executing JS
            headers: custom http headers
            jscode: code to be executed when page is loaded

        Returns tuple with:
            * downloaded website (after JS execution)
            * anything you print with `console.log` (but not inside `page.execute`!)

        Raises ExtractorError if `jscode` lacks `saveAndExit();` or if
        PhantomJS exits with a non-zero status.

        In most cases you don't need to add any `jscode`.
        It is executed in `page.onLoadFinished`.
        `saveAndExit();` is mandatory, use it instead of `phantom.exit()`
        It is possible to wait for some element on the webpage, for example:
            var check = function() {
              var elementFound = page.evaluate(function() {
                return document.querySelector('#b.done') !== null;
              });
              if(elementFound)
                saveAndExit();
              else
                window.setTimeout(check, 500);
            }

            page.evaluate(function(){
              document.querySelector('#a').click();
            });
            check();
        """
        if 'saveAndExit();' not in jscode:
            raise ExtractorError('`saveAndExit();` not found in `jscode`')
        if headers is None:
            headers = {}
        if not html:
            html = self.extractor._download_webpage(url, video_id, note=note, headers=headers)
        with open(self._TMP_FILES['html'].name, 'wb') as f:
            f.write(html.encode('utf-8'))

        self._save_cookies(url)

        user_agent = headers.get('User-Agent') or std_headers['User-Agent']
        script = self._render_script(url, user_agent, jscode)
        with open(self._TMP_FILES['script'].name, 'wb') as f:
            f.write(script.encode('utf-8'))

        if video_id is None:
            self.extractor.to_screen('%s' % (note2,))
        else:
            self.extractor.to_screen('%s: %s' % (video_id, note2))

        p = subprocess.Popen([
            self.exe, '--ssl-protocol=any',
            self._TMP_FILES['script'].name
        ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = process_communicate_or_kill(p)
        if p.returncode != 0:
            raise ExtractorError(
                'Executing JS failed\n:' + encodeArgument(err))
        # The script overwrote the html file with the rendered page content.
        with open(self._TMP_FILES['html'].name, 'rb') as f:
            html = f.read().decode('utf-8')

        self._load_cookies()

        return (html, encodeArgument(out))
239 | return (html, encodeArgument(out)) |