]>
Commit | Line | Data |
---|---|---|
587021cd | 1 | import collections |
19a03940 | 2 | import contextlib |
4c54b89e S |
3 | import json |
4 | import os | |
4c54b89e S |
5 | import subprocess |
6 | import tempfile | |
17f8deeb | 7 | |
f8271158 | 8 | from ..compat import compat_urlparse |
9e3c2f1d | 9 | from ..utils import ( |
f8271158 | 10 | ExtractorError, |
11 | Popen, | |
4c54b89e | 12 | check_executable, |
587021cd | 13 | format_field, |
4c54b89e S |
14 | get_exe_version, |
15 | is_outdated_version, | |
587021cd | 16 | shell_quote, |
9e3c2f1d | 17 | ) |
2bfeee69 YCH |
18 | |
19 | ||
4c54b89e S |
def cookie_to_dict(cookie):
    """Convert a cookie object into a plain dict of its attributes.

    `name` and `value` are always copied. `port`, `domain` and `path` are
    copied only when the matching `*_specified` flag is truthy. `expires`,
    `secure` and `discard` are copied only when they are not None.
    `httponly` is set to True when the cookie carries that non-standard
    attribute under any of the spellings checked below.
    """
    result = {'name': cookie.name, 'value': cookie.value}

    # Attributes gated by an explicit "<attr>_specified" flag
    for key in ('port', 'domain', 'path'):
        if getattr(cookie, f'{key}_specified'):
            result[key] = getattr(cookie, key)

    # Attributes that are simply skipped when unset
    for key in ('expires', 'secure', 'discard'):
        attr = getattr(cookie, key)
        if attr is not None:
            result[key] = attr

    # A TypeError raised while probing the non-standard attributes is ignored
    with contextlib.suppress(TypeError):
        if any(cookie.has_nonstandard_attr(spelling)
               for spelling in ('httpOnly', 'httponly', 'HttpOnly')):
            result['httponly'] = True

    return result
43 | ||
44 | ||
def cookie_jar_to_list(cookie_jar):
    """Return a list with one `cookie_to_dict` result per cookie in `cookie_jar`."""
    converted = []
    for entry in cookie_jar:
        converted.append(cookie_to_dict(entry))
    return converted
47 | ||
48 | ||
class PhantomJSwrapper:
    """PhantomJS wrapper class

    Runs JavaScript through the external `phantomjs` executable, exchanging
    the script, the page HTML and the cookies with it via temporary files.

    This class is experimental.
    """

    # NOTE: within this class `_BASE_JS` is prepended verbatim (it is never
    # passed through `str.format`), so the doubled braces reach PhantomJS
    # as-is; they are still valid JS since `{{ ... }}` is just a nested block.
    _BASE_JS = R'''
        phantom.onError = function(msg, trace) {{
            var msgStack = ['PHANTOM ERROR: ' + msg];
            if(trace && trace.length) {{
                msgStack.push('TRACE:');
                trace.forEach(function(t) {{
                    msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line
                        + (t.function ? ' (in function ' + t.function +')' : ''));
                }});
            }}
            console.error(msgStack.join('\n'));
            phantom.exit(1);
        }};
    '''

    # Formatted in `get()`; placeholders: cookies, timeout, ua, html, url, jscode
    _TEMPLATE = R'''
        var page = require('webpage').create();
        var fs = require('fs');
        var read = {{ mode: 'r', charset: 'utf-8' }};
        var write = {{ mode: 'w', charset: 'utf-8' }};
        JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{
            phantom.addCookie(x);
        }});
        page.settings.resourceTimeout = {timeout};
        page.settings.userAgent = "{ua}";
        page.onLoadStarted = function() {{
            page.evaluate(function() {{
                delete window._phantom;
                delete window.callPhantom;
            }});
        }};
        var saveAndExit = function() {{
            fs.write("{html}", page.content, write);
            fs.write("{cookies}", JSON.stringify(phantom.cookies), write);
            phantom.exit();
        }};
        page.onLoadFinished = function(status) {{
            if(page.url === "") {{
                page.setContent(fs.read("{html}", read), "{url}");
            }}
            else {{
                {jscode}
            }}
        }};
        page.open("");
    '''

    _TMP_FILE_NAMES = ['script', 'html', 'cookies']

    @staticmethod
    def _version():
        """Return the version reported by `get_exe_version` for `phantomjs`."""
        return get_exe_version('phantomjs', version_re=r'([0-9.]+)')

    @staticmethod
    def _js_quote(value):
        """Escape `value` for embedding inside a double-quoted JS string literal."""
        # Backslashes must be escaped first so the ones added for quotes survive
        return value.replace('\\', '\\\\').replace('"', '\\"')

    def __init__(self, extractor, required_version=None, timeout=10000):
        """Locate PhantomJS and create the temporary exchange files.

        Params:
            extractor: object used for downloading, cookie access and output
            required_version: optional; a warning is reported when the
                              installed PhantomJS is older than this
            timeout: used as `page.settings.resourceTimeout` and, divided by
                     1000, as the subprocess timeout in `execute`

        Raises ExtractorError when the `phantomjs` executable is not found.
        """
        # Assigned first so that __del__ always finds it
        self._TMP_FILES = {}

        self.exe = check_executable('phantomjs', ['-v'])
        if not self.exe:
            raise ExtractorError(
                'PhantomJS not found, Please download it from https://phantomjs.org/download.html', expected=True)

        self.extractor = extractor

        if required_version:
            version = self._version()
            if is_outdated_version(version, required_version):
                self.extractor._downloader.report_warning(
                    'Your copy of PhantomJS is outdated, update it to version '
                    '%s or newer if you encounter any errors.' % required_version)

        for name in self._TMP_FILE_NAMES:
            # delete=False: the file must outlive this handle; removed in __del__
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()
            self._TMP_FILES[name] = tmp

        # Base template values; the file paths are pre-escaped for use in JS strings
        self.options = collections.ChainMap({
            'timeout': timeout,
        }, {
            x: self._js_quote(self._TMP_FILES[x].name)
            for x in self._TMP_FILE_NAMES
        })

    def __del__(self):
        """Remove the temporary files (best effort)."""
        # getattr: the attribute is missing if the instance never ran __init__
        tmp_files = getattr(self, '_TMP_FILES', {})
        for name in self._TMP_FILE_NAMES:
            with contextlib.suppress(OSError, KeyError):
                os.remove(tmp_files[name].name)

    def _save_cookies(self, url):
        """Dump the extractor's cookies as JSON into the cookies temp file.

        Cookies without a path get '/'; cookies without a domain get the
        network location of `url`.
        """
        cookies = cookie_jar_to_list(self.extractor.cookiejar)
        for cookie in cookies:
            if 'path' not in cookie:
                cookie['path'] = '/'
            if 'domain' not in cookie:
                cookie['domain'] = compat_urlparse.urlparse(url).netloc
        with open(self._TMP_FILES['cookies'].name, 'wb') as f:
            f.write(json.dumps(cookies).encode('utf-8'))

    def _load_cookies(self):
        """Read the cookies temp file and pass each cookie to the extractor."""
        with open(self._TMP_FILES['cookies'].name, 'rb') as f:
            cookies = json.loads(f.read().decode('utf-8'))
        for cookie in cookies:
            # .get(): do not fail on a cookie entry lacking the key
            if cookie.get('httponly') is True:
                cookie['rest'] = {'httpOnly': None}
            if 'expiry' in cookie:
                cookie['expire_time'] = cookie['expiry']
            self.extractor._set_cookie(**cookie)

    def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers=None, jscode='saveAndExit();'):
        """
        Downloads webpage (if needed) and executes JS

        Params:
            url: website url
            html: optional, html code of website
            video_id: video id
            note: optional, displayed when downloading webpage
            note2: optional, displayed when executing JS
            headers: custom http headers
            jscode: code to be executed when page is loaded

        Returns tuple with:
            * downloaded website (after JS execution)
            * anything you print with `console.log` (but not inside `page.evaluate`!)

        Raises ExtractorError if `jscode` does not contain `saveAndExit();`

        In most cases you don't need to add any `jscode`.
        It is executed in `page.onLoadFinished`.
        `saveAndExit();` is mandatory, use it instead of `phantom.exit()`
        It is possible to wait for some element on the webpage, e.g.
            var check = function() {
              var elementFound = page.evaluate(function() {
                return document.querySelector('#b.done') !== null;
              });
              if(elementFound)
                saveAndExit();
              else
                window.setTimeout(check, 500);
            }

            page.evaluate(function(){
              document.querySelector('#a').click();
            });
            check();
        """
        if 'saveAndExit();' not in jscode:
            raise ExtractorError('`saveAndExit();` not found in `jscode`')
        headers = headers or {}
        if not html:
            html = self.extractor._download_webpage(url, video_id, note=note, headers=headers)
        with open(self._TMP_FILES['html'].name, 'wb') as f:
            f.write(html.encode('utf-8'))

        self._save_cookies(url)

        user_agent = headers.get('User-Agent') or self.extractor.get_param('http_headers')['User-Agent']
        # url and ua end up inside double-quoted JS string literals
        jscode = self._TEMPLATE.format_map(self.options.new_child({
            'url': self._js_quote(url),
            'ua': self._js_quote(user_agent),
            'jscode': jscode,
        }))

        # `note` is keyword-only in execute()
        stdout = self.execute(jscode, video_id, note=note2)

        with open(self._TMP_FILES['html'].name, 'rb') as f:
            html = f.read().decode('utf-8')
        self._load_cookies()

        return html, stdout

    def execute(self, jscode, video_id=None, *, note='Executing JS'):
        """Execute JS and return stdout

        `phantom.exit();` is appended when `jscode` does not contain it.
        Raises ExtractorError when PhantomJS cannot be run or exits with a
        non-zero return code (its stderr is included in the message).
        """
        if 'phantom.exit();' not in jscode:
            jscode += ';\nphantom.exit();'
        jscode = self._BASE_JS + jscode

        with open(self._TMP_FILES['script'].name, 'w', encoding='utf-8') as f:
            f.write(jscode)
        self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}')

        cmd = [self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name]
        self.extractor.write_debug(f'PhantomJS command line: {shell_quote(cmd)}')
        try:
            stdout, stderr, returncode = Popen.run(cmd, timeout=self.options['timeout'] / 1000,
                                                   text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except Exception as e:
            raise ExtractorError(f'{note} failed: Unable to run PhantomJS binary', cause=e)
        if returncode:
            raise ExtractorError(f'{note} failed:\n{stderr.strip()}')

        return stdout