]>
Commit | Line | Data |
---|---|---|
19a03940 | 1 | import contextlib |
4c54b89e S |
2 | import json |
3 | import os | |
4c54b89e S |
4 | import subprocess |
5 | import tempfile | |
17f8deeb | 6 | |
f8271158 | 7 | from ..compat import compat_urlparse |
9e3c2f1d | 8 | from ..utils import ( |
f8271158 | 9 | ExtractorError, |
10 | Popen, | |
4c54b89e | 11 | check_executable, |
4c54b89e S |
12 | get_exe_version, |
13 | is_outdated_version, | |
9e3c2f1d | 14 | ) |
2bfeee69 YCH |
15 | |
16 | ||
4c54b89e S |
def cookie_to_dict(cookie):
    """Build a plain dict describing *cookie*.

    `name` and `value` are always present. `port`, `domain` and `path` are
    copied only when the matching `*_specified` flag is truthy; `expires`,
    `secure` and `discard` only when they are not None. `httponly` is set to
    True when the cookie carries a non-standard httpOnly attribute in any of
    the three spellings checked below.
    """
    result = {'name': cookie.name, 'value': cookie.value}

    # Attributes guarded by an explicit "<attr>_specified" flag
    for attr in ('port', 'domain', 'path'):
        if getattr(cookie, f'{attr}_specified'):
            result[attr] = getattr(cookie, attr)

    # Attributes that are simply skipped when unset
    for attr in ('expires', 'secure', 'discard'):
        if getattr(cookie, attr) is not None:
            result[attr] = getattr(cookie, attr)

    # A TypeError raised by the lookup just means "no httponly flag"
    with contextlib.suppress(TypeError):
        spellings = ('httpOnly', 'httponly', 'HttpOnly')
        if any(cookie.has_nonstandard_attr(spelling) for spelling in spellings):
            result['httponly'] = True

    return result
40 | ||
41 | ||
def cookie_jar_to_list(cookie_jar):
    """Return a list with one `cookie_to_dict` result per cookie in *cookie_jar*, in iteration order."""
    converted = []
    for entry in cookie_jar:
        converted.append(cookie_to_dict(entry))
    return converted
44 | ||
45 | ||
class PhantomJSwrapper:
    """PhantomJS wrapper class

    This class is experimental.

    The page HTML and the extractor's cookies are handed to the external
    `phantomjs` executable through temporary files, a script rendered from
    `_TEMPLATE` is executed, and the resulting HTML and cookies are read back
    from the same files.
    """

    # PhantomJS script. It is rendered with str.format(), which is why every
    # literal JavaScript brace is doubled.
    _TEMPLATE = r'''
        phantom.onError = function(msg, trace) {{
          var msgStack = ['PHANTOM ERROR: ' + msg];
          if(trace && trace.length) {{
            msgStack.push('TRACE:');
            trace.forEach(function(t) {{
              msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line
                + (t.function ? ' (in function ' + t.function +')' : ''));
            }});
          }}
          console.error(msgStack.join('\n'));
          phantom.exit(1);
        }};
        var page = require('webpage').create();
        var fs = require('fs');
        var read = {{ mode: 'r', charset: 'utf-8' }};
        var write = {{ mode: 'w', charset: 'utf-8' }};
        JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{
          phantom.addCookie(x);
        }});
        page.settings.resourceTimeout = {timeout};
        page.settings.userAgent = "{ua}";
        page.onLoadStarted = function() {{
          page.evaluate(function() {{
            delete window._phantom;
            delete window.callPhantom;
          }});
        }};
        var saveAndExit = function() {{
          fs.write("{html}", page.content, write);
          fs.write("{cookies}", JSON.stringify(phantom.cookies), write);
          phantom.exit();
        }};
        page.onLoadFinished = function(status) {{
          if(page.url === "") {{
            page.setContent(fs.read("{html}", read), "{url}");
          }}
          else {{
            {jscode}
          }}
        }};
        page.open("");
    '''

    # Keys of the temporary files every instance creates.
    _TMP_FILE_NAMES = ['script', 'html', 'cookies']

    @staticmethod
    def _version():
        """Return whatever `get_exe_version` reports for the `phantomjs` executable."""
        return get_exe_version('phantomjs', version_re=r'([0-9.]+)')

    @staticmethod
    def _js_escape(value):
        """Escape *value* for embedding in a double-quoted JavaScript string literal."""
        return (value.replace('\\', '\\\\').replace('"', '\\"')
                .replace('\n', '\\n').replace('\r', '\\r'))

    def __init__(self, extractor, required_version=None, timeout=10000):
        """Locate PhantomJS and create the temporary exchange files.

        Params:
            extractor: extractor used for downloading, cookies and messages
            required_version: optional, a warning is reported when the
                              installed PhantomJS is older than this
            timeout: value assigned to `page.settings.resourceTimeout`
                     in the generated script

        Raises ExtractorError when no `phantomjs` executable is found.
        """
        # Assigned first so that __del__ works even if construction fails below
        self._TMP_FILES = {}

        self.exe = check_executable('phantomjs', ['-v'])
        if not self.exe:
            raise ExtractorError(
                'PhantomJS executable not found in PATH, '
                'download it from http://phantomjs.org',
                expected=True)

        self.extractor = extractor

        if required_version:
            version = self._version()
            if is_outdated_version(version, required_version):
                self.extractor._downloader.report_warning(
                    'Your copy of PhantomJS is outdated, update it to version '
                    f'{required_version} or newer if you encounter any errors.')

        self.options = {
            'timeout': timeout,
        }
        for name in self._TMP_FILE_NAMES:
            # Only the path is needed; the file is reopened by name later on
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()
            self._TMP_FILES[name] = tmp

    def __del__(self):
        """Best-effort removal of the temporary files."""
        for name in self._TMP_FILE_NAMES:
            # AttributeError: the instance never got as far as setting _TMP_FILES
            with contextlib.suppress(OSError, KeyError, AttributeError):
                os.remove(self._TMP_FILES[name].name)

    def _save_cookies(self, url):
        """Write the extractor's cookies as JSON to the cookies temp file.

        Cookies lacking a path get '/', cookies lacking a domain get the
        network location of *url*.
        """
        cookies = cookie_jar_to_list(self.extractor.cookiejar)
        for cookie in cookies:
            cookie.setdefault('path', '/')
            if 'domain' not in cookie:
                cookie['domain'] = compat_urlparse.urlparse(url).netloc
        with open(self._TMP_FILES['cookies'].name, 'wb') as f:
            f.write(json.dumps(cookies).encode('utf-8'))

    def _load_cookies(self):
        """Read the cookies temp file and pass each entry to the extractor's `_set_cookie`."""
        with open(self._TMP_FILES['cookies'].name, 'rb') as f:
            cookies = json.loads(f.read().decode('utf-8'))
        for cookie in cookies:
            # .get(): an entry without the key must not abort the whole import
            if cookie.get('httponly') is True:
                cookie['rest'] = {'httpOnly': None}
            if 'expiry' in cookie:
                cookie['expire_time'] = cookie['expiry']
            self.extractor._set_cookie(**cookie)

    def _render_script(self, url, user_agent, jscode):
        """Return the PhantomJS script for one run without touching `self.options`."""
        # Work on a copy so per-call values never leak into instance state
        replaces = dict(self.options)
        replaces['url'] = self._js_escape(url)
        replaces['ua'] = self._js_escape(user_agent)
        replaces['jscode'] = jscode
        for name in self._TMP_FILE_NAMES:
            replaces[name] = self._js_escape(self._TMP_FILES[name].name)
        return self._TEMPLATE.format(**replaces)

    def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers=None, jscode='saveAndExit();'):
        """
        Downloads webpage (if needed) and executes JS

        Params:
            url: website url
            html: optional, html code of website
            video_id: video id
            note: optional, displayed when downloading webpage
            note2: optional, displayed when executing JS
            headers: optional, custom http headers
            jscode: code to be executed when page is loaded

        Returns tuple with:
            * downloaded website (after JS execution)
            * anything you print with `console.log` (but not inside `page.evaluate`!)

        Raises ExtractorError when `jscode` lacks `saveAndExit();` or when
        PhantomJS exits with a non-zero return code.

        In most cases you don't need to add any `jscode`.
        It is executed in `page.onLoadFinished`.
        `saveAndExit();` is mandatory, use it instead of `phantom.exit()`
        It is possible to wait for some element on the webpage, for example:
            var check = function() {
              var elementFound = page.evaluate(function() {
                return document.querySelector('#b.done') !== null;
              });
              if(elementFound)
                saveAndExit();
              else
                window.setTimeout(check, 500);
            }

            page.evaluate(function(){
              document.querySelector('#a').click();
            });
            check();
        """
        if 'saveAndExit();' not in jscode:
            raise ExtractorError('`saveAndExit();` not found in `jscode`')
        if headers is None:
            headers = {}
        if not html:
            html = self.extractor._download_webpage(url, video_id, note=note, headers=headers)
        with open(self._TMP_FILES['html'].name, 'wb') as f:
            f.write(html.encode('utf-8'))

        self._save_cookies(url)

        user_agent = headers.get('User-Agent') or self.extractor.get_param('http_headers')['User-Agent']
        with open(self._TMP_FILES['script'].name, 'wb') as f:
            f.write(self._render_script(url, user_agent, jscode).encode('utf-8'))

        if video_id is None:
            self.extractor.to_screen(f'{note2}')
        else:
            self.extractor.to_screen(f'{video_id}: {note2}')

        stdout, stderr, returncode = Popen.run(
            [self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if returncode:
            raise ExtractorError(f'Executing JS failed:\n{stderr}')
        # The script overwrote the html and cookies files on saveAndExit()
        with open(self._TMP_FILES['html'].name, 'rb') as f:
            html = f.read().decode('utf-8')

        self._load_cookies()

        return html, stdout