]>
Commit | Line | Data |
---|---|---|
587021cd | 1 | import collections |
19a03940 | 2 | import contextlib |
4c54b89e S |
3 | import json |
4 | import os | |
4c54b89e S |
5 | import subprocess |
6 | import tempfile | |
17f8deeb | 7 | |
f8271158 | 8 | from ..compat import compat_urlparse |
9e3c2f1d | 9 | from ..utils import ( |
f8271158 | 10 | ExtractorError, |
11 | Popen, | |
4c54b89e | 12 | check_executable, |
587021cd | 13 | format_field, |
4c54b89e S |
14 | get_exe_version, |
15 | is_outdated_version, | |
587021cd | 16 | shell_quote, |
9e3c2f1d | 17 | ) |
2bfeee69 YCH |
18 | |
19 | ||
4c54b89e S |
def cookie_to_dict(cookie):
    """Convert a cookie object into a plain dict.

    `name` and `value` are always present.  `port`, `domain` and `path` are
    copied only when the matching `*_specified` flag is set; `expires`,
    `secure` and `discard` are copied only when they are not None.
    `httponly` is set to True when the cookie carries a non-standard
    attribute spelled `httpOnly`, `httponly` or `HttpOnly`.
    """
    result = {'name': cookie.name, 'value': cookie.value}

    # Attributes guarded by their own "<attr>_specified" flag
    for attr in ('port', 'domain', 'path'):
        if getattr(cookie, f'{attr}_specified'):
            result[attr] = getattr(cookie, attr)

    # Attributes that are copied whenever they hold a value
    for attr in ('expires', 'secure', 'discard'):
        value = getattr(cookie, attr)
        if value is not None:
            result[attr] = value

    # A TypeError raised by the lookup is ignored and simply leaves `httponly` unset
    with contextlib.suppress(TypeError):
        spellings = ('httpOnly', 'httponly', 'HttpOnly')
        if any(map(cookie.has_nonstandard_attr, spellings)):
            result['httponly'] = True

    return result
43 | ||
44 | ||
45 | def cookie_jar_to_list(cookie_jar): | |
46 | return [cookie_to_dict(cookie) for cookie in cookie_jar] | |
47 | ||
48 | ||
class PhantomJSwrapper:
    """PhantomJS wrapper class

    Runs JavaScript in an external `phantomjs` process.  The script, the page
    HTML and the cookies are exchanged with that process through three
    temporary files which are created in `__init__` and removed in `__del__`.

    This class is experimental.
    """

    INSTALL_HINT = 'Please download it from https://phantomjs.org/download.html'

    # Error handler prepended to every script by `execute`.
    # NOTE: this string is concatenated verbatim and never passed through
    # `str.format`, so the doubled braces reach PhantomJS as written
    # (i.e. as nested blocks).
    _BASE_JS = R'''
        phantom.onError = function(msg, trace) {{
            var msgStack = ['PHANTOM ERROR: ' + msg];
            if(trace && trace.length) {{
                msgStack.push('TRACE:');
                trace.forEach(function(t) {{
                    msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line
                        + (t.function ? ' (in function ' + t.function +')' : ''));
                }});
            }}
            console.error(msgStack.join('\n'));
            phantom.exit(1);
        }};
    '''

    # Script skeleton used by `get`; it IS formatted with `str.format_map`.
    # Placeholders: {cookies}, {html} (temporary file paths), {timeout},
    # {ua}, {url} and {jscode}.  Doubled braces are literal JS braces.
    _TEMPLATE = R'''
        var page = require('webpage').create();
        var fs = require('fs');
        var read = {{ mode: 'r', charset: 'utf-8' }};
        var write = {{ mode: 'w', charset: 'utf-8' }};
        JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{
            phantom.addCookie(x);
        }});
        page.settings.resourceTimeout = {timeout};
        page.settings.userAgent = "{ua}";
        page.onLoadStarted = function() {{
            page.evaluate(function() {{
                delete window._phantom;
                delete window.callPhantom;
            }});
        }};
        var saveAndExit = function() {{
            fs.write("{html}", page.content, write);
            fs.write("{cookies}", JSON.stringify(phantom.cookies), write);
            phantom.exit();
        }};
        page.onLoadFinished = function(status) {{
            if(page.url === "") {{
                page.setContent(fs.read("{html}", read), "{url}");
            }}
            else {{
                {jscode}
            }}
        }};
        page.open("");
    '''

    # Keys of `_TMP_FILES`; one temporary file is created per name
    _TMP_FILE_NAMES = ['script', 'html', 'cookies']

    @staticmethod
    def _version():
        """Return the version reported by the `phantomjs` executable."""
        return get_exe_version('phantomjs', version_re=r'([0-9.]+)')

    @staticmethod
    def _escape_js_string(value):
        """Escape `value` for embedding inside a double-quoted JS string literal.

        Backslashes are escaped first so that the backslashes introduced for
        the quotes are not doubled again.
        """
        return value.replace('\\', '\\\\').replace('"', '\\"')

    def __init__(self, extractor, required_version=None, timeout=10000):
        """
        Params:
            extractor: extractor used for downloading, cookies and logging
            required_version: optional, a warning is reported if the installed
                              PhantomJS is older than this version
            timeout: used as `page.settings.resourceTimeout` and, divided by
                     1000, as the timeout of the PhantomJS process

        Raises ExtractorError if the `phantomjs` executable is not found.
        """
        # Assigned first so that `__del__` works even if the checks below raise
        self._TMP_FILES = {}

        self.exe = check_executable('phantomjs', ['-v'])
        if not self.exe:
            raise ExtractorError(f'PhantomJS not found, {self.INSTALL_HINT}', expected=True)

        self.extractor = extractor

        if required_version:
            version = self._version()
            if is_outdated_version(version, required_version):
                self.extractor._downloader.report_warning(
                    'Your copy of PhantomJS is outdated, update it to version '
                    f'{required_version} or newer if you encounter any errors.')

        for name in self._TMP_FILE_NAMES:
            # Only the path is needed; the file is reopened by name when used
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()
            self._TMP_FILES[name] = tmp

        # Values substituted into `_TEMPLATE`; `get` layers per-call values on top
        self.options = collections.ChainMap({
            'timeout': timeout,
        }, {
            name: self._escape_js_string(self._TMP_FILES[name].name)
            for name in self._TMP_FILE_NAMES
        })

    def __del__(self):
        """Remove the temporary files (best effort)."""
        # Tolerate instances on which `__init__` never ran
        tmp_files = getattr(self, '_TMP_FILES', {})
        for name in self._TMP_FILE_NAMES:
            with contextlib.suppress(OSError, KeyError):
                os.remove(tmp_files[name].name)

    def _save_cookies(self, url):
        """Write the extractor's cookies as JSON to the cookies temporary file.

        Cookies without a path get '/', cookies without a domain get the
        network location of `url`.
        """
        cookies = cookie_jar_to_list(self.extractor.cookiejar)
        for cookie in cookies:
            cookie.setdefault('path', '/')
            if 'domain' not in cookie:
                cookie['domain'] = compat_urlparse.urlparse(url).netloc
        with open(self._TMP_FILES['cookies'].name, 'wb') as f:
            f.write(json.dumps(cookies).encode('utf-8'))

    def _load_cookies(self):
        """Read the cookies temporary file and pass each cookie to the extractor."""
        with open(self._TMP_FILES['cookies'].name, 'rb') as f:
            cookies = json.loads(f.read().decode('utf-8'))
        for cookie in cookies:
            # `.get`: a cookie without the `httponly` key must not raise KeyError
            if cookie.get('httponly') is True:
                cookie['rest'] = {'httpOnly': None}
            if 'expiry' in cookie:
                cookie['expire_time'] = cookie['expiry']
            self.extractor._set_cookie(**cookie)

    def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers=None, jscode='saveAndExit();'):
        """
        Downloads webpage (if needed) and executes JS

        Params:
            url: website url
            html: optional, html code of website
            video_id: video id
            note: optional, displayed when downloading webpage
            note2: optional, displayed when executing JS
            headers: optional, custom http headers
            jscode: code to be executed when page is loaded

        Returns tuple with:
            * downloaded website (after JS execution)
            * anything you print with `console.log` (but not inside `page.execute`!)

        Raises ExtractorError if `jscode` does not contain `saveAndExit();`.

        In most cases you don't need to add any `jscode`.
        It is executed in `page.onLoadFinished`.
        `saveAndExit();` is mandatory, use it instead of `phantom.exit()`
        It is possible to wait for some element on the webpage, e.g.
            var check = function() {
                var elementFound = page.evaluate(function() {
                    return document.querySelector('#b.done') !== null;
                });
                if(elementFound)
                    saveAndExit();
                else
                    window.setTimeout(check, 500);
            }

            page.evaluate(function(){
                document.querySelector('#a').click();
            });
            check();
        """
        if 'saveAndExit();' not in jscode:
            raise ExtractorError('`saveAndExit();` not found in `jscode`')
        # Avoid a shared mutable default argument
        if headers is None:
            headers = {}
        if not html:
            html = self.extractor._download_webpage(url, video_id, note=note, headers=headers)
        with open(self._TMP_FILES['html'].name, 'wb') as f:
            f.write(html.encode('utf-8'))

        self._save_cookies(url)

        user_agent = headers.get('User-Agent') or self.extractor.get_param('http_headers')['User-Agent']
        # `url` and `ua` end up inside double-quoted JS string literals,
        # so they are escaped the same way as the temporary file paths
        jscode = self._TEMPLATE.format_map(self.options.new_child({
            'url': self._escape_js_string(url),
            'ua': self._escape_js_string(user_agent),
            'jscode': jscode,
        }))

        stdout = self.execute(jscode, video_id, note=note2)

        # `saveAndExit` in the template wrote the resulting page and cookies back
        with open(self._TMP_FILES['html'].name, 'rb') as f:
            html = f.read().decode('utf-8')
        self._load_cookies()

        return html, stdout

    def execute(self, jscode, video_id=None, *, note='Executing JS'):
        """Execute JS and return stdout

        `phantom.exit();` is appended if `jscode` does not contain it.
        Raises ExtractorError if PhantomJS cannot be run or exits with a
        non-zero return code.
        """
        if 'phantom.exit();' not in jscode:
            jscode += ';\nphantom.exit();'
        jscode = self._BASE_JS + jscode

        with open(self._TMP_FILES['script'].name, 'w', encoding='utf-8') as f:
            f.write(jscode)
        self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}')

        cmd = [self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name]
        self.extractor.write_debug(f'PhantomJS command line: {shell_quote(cmd)}')
        try:
            # The stored timeout is divided by 1000 before being handed to Popen.run
            stdout, stderr, returncode = Popen.run(cmd, timeout=self.options['timeout'] / 1000,
                                                   text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except Exception as e:
            raise ExtractorError(f'{note} failed: Unable to run PhantomJS binary', cause=e)
        if returncode:
            raise ExtractorError(f'{note} failed with returncode {returncode}:\n{stderr.strip()}')

        return stdout