]>
Commit | Line | Data |
---|---|---|
2bfeee69 | 1 | # coding: utf-8 |
6c20a0bb | 2 | from __future__ import unicode_literals |
95ad9ce5 | 3 | |
4c54b89e S |
4 | import json |
5 | import os | |
17f8deeb | 6 | import re |
4c54b89e S |
7 | import subprocess |
8 | import tempfile | |
17f8deeb | 9 | |
2bfeee69 | 10 | from .common import InfoExtractor |
011da618 S |
11 | from ..compat import ( |
12 | compat_urlparse, | |
13 | compat_kwargs, | |
14 | ) | |
9e3c2f1d | 15 | from ..utils import ( |
4c54b89e | 16 | check_executable, |
594b0c4c | 17 | determine_ext, |
4c54b89e | 18 | encodeArgument, |
9e3c2f1d | 19 | ExtractorError, |
da57ebaf | 20 | get_element_by_id, |
4c54b89e S |
21 | get_exe_version, |
22 | is_outdated_version, | |
23 | std_headers, | |
9e3c2f1d | 24 | ) |
2bfeee69 YCH |
25 | |
26 | ||
4c54b89e S |
def cookie_to_dict(cookie):
    """Flatten a cookiejar cookie object into a plain dict.

    ``name`` and ``value`` are always copied.  ``port``, ``domain`` and
    ``path`` are copied only when the matching ``*_specified`` flag is set;
    ``expires``, ``secure`` and ``discard`` only when they are not ``None``.
    ``httponly`` is set to ``True`` when the cookie carries that
    non-standard attribute under any of the three spellings checked below.
    """
    result = {
        'name': cookie.name,
        'value': cookie.value,
    }

    # Attributes guarded by an explicit "<attr>_specified" flag.
    flagged = (
        ('port', cookie.port_specified),
        ('domain', cookie.domain_specified),
        ('path', cookie.path_specified),
    )
    for key, specified in flagged:
        if specified:
            result[key] = getattr(cookie, key)

    # Attributes that are simply skipped when unset.
    for key in ('expires', 'secure', 'discard'):
        value = getattr(cookie, key)
        if value is not None:
            result[key] = value

    try:
        http_only = any(
            cookie.has_nonstandard_attr(spelling)
            for spelling in ('httpOnly', 'httponly', 'HttpOnly'))
    except TypeError:
        # Best effort: a TypeError raised by the lookup is treated as
        # "attribute not present".
        http_only = False
    if http_only:
        result['httponly'] = True

    return result
52 | ||
53 | ||
def cookie_jar_to_list(cookie_jar):
    """Return a list with the ``cookie_to_dict`` form of every cookie.

    The cookies are taken in the iteration order of ``cookie_jar``.
    """
    converted = []
    for jar_cookie in cookie_jar:
        converted.append(cookie_to_dict(jar_cookie))
    return converted
56 | ||
57 | ||
class PhantomJSwrapper(object):
    """PhantomJS wrapper class

    Runs a page through the external ``phantomjs`` executable so that the
    JavaScript on it gets executed.  The page HTML, the cookies and the
    generated driver script are exchanged with the PhantomJS process through
    temporary files created in ``__init__`` and removed in ``__del__``.

    This class is experimental.
    """

    # Driver script executed by PhantomJS.  It is filled in with str.format(),
    # which is why every literal JavaScript brace is doubled.  Placeholders:
    # {cookies}, {html} (temporary file paths), {timeout}, {ua}, {url} and
    # {jscode}.
    _TEMPLATE = r'''
        phantom.onError = function(msg, trace) {{
          var msgStack = ['PHANTOM ERROR: ' + msg];
          if(trace && trace.length) {{
            msgStack.push('TRACE:');
            trace.forEach(function(t) {{
              msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line
                + (t.function ? ' (in function ' + t.function +')' : ''));
            }});
          }}
          console.error(msgStack.join('\n'));
          phantom.exit(1);
        }};
        var page = require('webpage').create();
        var fs = require('fs');
        var read = {{ mode: 'r', charset: 'utf-8' }};
        var write = {{ mode: 'w', charset: 'utf-8' }};
        JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{
          phantom.addCookie(x);
        }});
        page.settings.resourceTimeout = {timeout};
        page.settings.userAgent = "{ua}";
        page.onLoadStarted = function() {{
          page.evaluate(function() {{
            delete window._phantom;
            delete window.callPhantom;
          }});
        }};
        var saveAndExit = function() {{
          fs.write("{html}", page.content, write);
          fs.write("{cookies}", JSON.stringify(phantom.cookies), write);
          phantom.exit();
        }};
        page.onLoadFinished = function(status) {{
          if(page.url === "") {{
            page.setContent(fs.read("{html}", read), "{url}");
          }}
          else {{
            {jscode}
          }}
        }};
        page.open("");
    '''

    # One temporary file is created per name; the names double as
    # placeholders in _TEMPLATE ('script' is simply unused there).
    _TMP_FILE_NAMES = ['script', 'html', 'cookies']

    @staticmethod
    def _version():
        """Return the version reported for the ``phantomjs`` executable."""
        return get_exe_version('phantomjs', version_re=r'([0-9.]+)')

    @staticmethod
    def _escape_js_string(value):
        """Escape ``value`` for use inside a double-quoted JS string literal.

        Backslashes are escaped first so that the backslashes introduced for
        the double quotes are not escaped a second time.
        """
        return value.replace('\\', '\\\\').replace('"', '\\"')

    def __init__(self, extractor, required_version=None, timeout=10000):
        """Locate PhantomJS and allocate the temporary exchange files.

        Params:
            extractor: extractor used for downloading pages, reporting
                warnings / progress and storing cookies
            required_version: optional, a warning is reported through the
                extractor's downloader when the installed PhantomJS is
                older than this version
            timeout: value substituted for ``page.settings.resourceTimeout``
                in the driver script

        Raises ExtractorError when no ``phantomjs`` executable is found.
        """
        # Must exist before anything can raise: __del__ reads it.
        self._TMP_FILES = {}

        self.exe = check_executable('phantomjs', ['-v'])
        if not self.exe:
            raise ExtractorError('PhantomJS executable not found in PATH, '
                                 'download it from http://phantomjs.org',
                                 expected=True)

        self.extractor = extractor

        if required_version:
            version = self._version()
            if is_outdated_version(version, required_version):
                self.extractor._downloader.report_warning(
                    'Your copy of PhantomJS is outdated, update it to version '
                    '%s or newer if you encounter any errors.' % required_version)

        self.options = {
            'timeout': timeout,
        }
        for name in self._TMP_FILE_NAMES:
            # delete=False + close(): only the path is needed, the file is
            # reopened by name later (here and by PhantomJS).
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()
            self._TMP_FILES[name] = tmp

    def __del__(self):
        """Remove the temporary files; failures are ignored on purpose."""
        for name in self._TMP_FILE_NAMES:
            try:
                os.remove(self._TMP_FILES[name].name)
            except (IOError, OSError, KeyError):
                # Best-effort cleanup: the file may already be gone, or
                # __init__ may have failed before creating it.
                pass

    def _save_cookies(self, url):
        """Dump the downloader's cookie jar as JSON into the cookies file.

        Cookies lacking a path get '/', cookies lacking a domain get the
        network location of ``url``.
        """
        cookies = cookie_jar_to_list(self.extractor._downloader.cookiejar)
        for cookie in cookies:
            if 'path' not in cookie:
                cookie['path'] = '/'
            if 'domain' not in cookie:
                cookie['domain'] = compat_urlparse.urlparse(url).netloc
        with open(self._TMP_FILES['cookies'].name, 'wb') as f:
            f.write(json.dumps(cookies).encode('utf-8'))

    def _load_cookies(self):
        """Read the cookies file back and hand every cookie to the extractor."""
        with open(self._TMP_FILES['cookies'].name, 'rb') as f:
            cookies = json.loads(f.read().decode('utf-8'))
        for cookie in cookies:
            # .get(): do not fail on a cookie that has no 'httponly' entry
            if cookie.get('httponly') is True:
                cookie['rest'] = {'httpOnly': None}
            if 'expiry' in cookie:
                cookie['expire_time'] = cookie['expiry']
            self.extractor._set_cookie(**compat_kwargs(cookie))

    def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers=None, jscode='saveAndExit();'):
        """
        Downloads webpage (if needed) and executes JS

        Params:
            url: website url
            html: optional, html code of website
            video_id: video id
            note: optional, displayed when downloading webpage
            note2: optional, displayed when executing JS
            headers: optional, custom http headers (dict)
            jscode: code to be executed when page is loaded

        Returns tuple with:
            * downloaded website (after JS execution)
            * anything you print with `console.log` (but not inside `page.execute`!)

        Raises ExtractorError when `jscode` lacks `saveAndExit();` or when
        PhantomJS exits with a non-zero status.

        In most cases you don't need to add any `jscode`.
        It is executed in `page.onLoadFinished`.
        `saveAndExit();` is mandatory, use it instead of `phantom.exit()`
        It is possible to wait for some element on the webpage, for example:
            var check = function() {
              var elementFound = page.evaluate(function() {
                return document.querySelector('#b.done') !== null;
              });
              if(elementFound)
                saveAndExit();
              else
                window.setTimeout(check, 500);
            }

            page.evaluate(function(){
              document.querySelector('#a').click();
            });
            check();
        """
        if headers is None:
            headers = {}
        if 'saveAndExit();' not in jscode:
            raise ExtractorError('`saveAndExit();` not found in `jscode`')
        if not html:
            html = self.extractor._download_webpage(url, video_id, note=note, headers=headers)
        with open(self._TMP_FILES['html'].name, 'wb') as f:
            f.write(html.encode('utf-8'))

        self._save_cookies(url)

        # Work on a copy so that per-call values never leak into self.options
        replaces = dict(self.options)
        # url, ua and the file names all end up inside double-quoted JS
        # string literals in the template, hence the escaping
        replaces['url'] = self._escape_js_string(url)
        user_agent = headers.get('User-Agent') or std_headers['User-Agent']
        replaces['ua'] = self._escape_js_string(user_agent)
        replaces['jscode'] = jscode

        for x in self._TMP_FILE_NAMES:
            replaces[x] = self._escape_js_string(self._TMP_FILES[x].name)

        with open(self._TMP_FILES['script'].name, 'wb') as f:
            f.write(self._TEMPLATE.format(**replaces).encode('utf-8'))

        if video_id is None:
            self.extractor.to_screen('%s' % (note2,))
        else:
            self.extractor.to_screen('%s: %s' % (video_id, note2))

        p = subprocess.Popen([
            self.exe, '--ssl-protocol=any',
            self._TMP_FILES['script'].name
        ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = p.communicate()
        if p.returncode != 0:
            raise ExtractorError(
                'Executing JS failed\n:' + encodeArgument(err))
        # The driver script overwrote the html file with the rendered page
        with open(self._TMP_FILES['html'].name, 'rb') as f:
            html = f.read().decode('utf-8')

        self._load_cookies()

        return (html, encodeArgument(out))
243 | ||
244 | ||
class OpenloadIE(InfoExtractor):
    """Extractor for videos hosted on the openload / oload domains.

    The stream URL is only present in the page after its JavaScript has
    run, so the page is passed through PhantomJSwrapper before parsing.
    """

    # Host alternatives shared by _VALID_URL and _extract_urls so that the
    # two patterns cannot drift apart.
    _DOMAINS = r'(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz))'
    _VALID_URL = r'https?://(?:www\.)?%s/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)' % _DOMAINS

    _TESTS = [{
        'url': 'https://openload.co/f/kUEfGclsU9o',
        'md5': 'bf1c059b004ebc7a256f89408e65c36e',
        'info_dict': {
            'id': 'kUEfGclsU9o',
            'ext': 'mp4',
            'title': 'skyrim_no-audio_1080.mp4',
            'thumbnail': r're:^https?://.*\.jpg$',
        },
    }, {
        'url': 'https://openload.co/embed/rjC09fkPLYs',
        'info_dict': {
            'id': 'rjC09fkPLYs',
            'ext': 'mp4',
            'title': 'movie.mp4',
            'thumbnail': r're:^https?://.*\.jpg$',
            'subtitles': {
                'en': [{
                    'ext': 'vtt',
                }],
            },
        },
        'params': {
            'skip_download': True,  # test subtitles only
        },
    }, {
        'url': 'https://openload.co/embed/kUEfGclsU9o/skyrim_no-audio_1080.mp4',
        'only_matching': True,
    }, {
        'url': 'https://openload.io/f/ZAn6oz-VZGE/',
        'only_matching': True,
    }, {
        'url': 'https://openload.co/f/_-ztPaZtMhM/',
        'only_matching': True,
    }, {
        # unavailable via https://openload.co/f/Sxz5sADo82g/, different layout
        # for title and ext
        'url': 'https://openload.co/embed/Sxz5sADo82g/',
        'only_matching': True,
    }, {
        # unavailable via https://openload.co/embed/e-Ixz9ZR5L0/ but available
        # via https://openload.co/f/e-Ixz9ZR5L0/
        'url': 'https://openload.co/f/e-Ixz9ZR5L0/',
        'only_matching': True,
    }, {
        'url': 'https://oload.tv/embed/KnG-kKZdcfY/',
        'only_matching': True,
    }, {
        'url': 'http://www.openload.link/f/KnG-kKZdcfY',
        'only_matching': True,
    }, {
        'url': 'https://oload.stream/f/KnG-kKZdcfY',
        'only_matching': True,
    }, {
        'url': 'https://oload.xyz/f/WwRBpzW8Wtk',
        'only_matching': True,
    }]

    # Sent both when downloading the page and as 'http_headers' of the result
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'

    @staticmethod
    def _extract_urls(webpage):
        """Return the embed URLs found in ``<iframe src=...>`` of ``webpage``.

        Every host accepted by ``_VALID_URL`` is recognised (including an
        optional ``www.`` prefix); the scheme is optional.
        """
        return re.findall(
            r'<iframe[^>]+src=["\']((?:https?://)?(?:www\.)?%s/embed/[a-zA-Z0-9-_]+)' % OpenloadIE._DOMAINS,
            webpage)

    def _real_extract(self, url):
        """Build the info dict for the video referenced by ``url``.

        Raises ExtractorError (expected) when both the embed and the file
        page report the file as missing or deleted.
        """
        video_id = self._match_id(url)
        # '%%s' survives this formatting as '%s' and is filled with the
        # page kind ('embed' / 'f') in the loop below
        url_pattern = 'https://openload.co/%%s/%s/' % video_id
        headers = {
            'User-Agent': self._USER_AGENT,
        }

        # Try the embed page first and fall back to the file page; only a
        # failure on the last candidate is fatal.
        for path in ('embed', 'f'):
            page_url = url_pattern % path
            last = path == 'f'
            webpage = self._download_webpage(
                page_url, video_id, 'Downloading %s webpage' % path,
                headers=headers, fatal=last)
            if not webpage:
                continue
            if 'File not found' in webpage or 'deleted by the owner' in webpage:
                if not last:
                    continue
                raise ExtractorError('File not found', expected=True, video_id=video_id)
            break

        # Re-render the already downloaded page with its JS executed
        phantom = PhantomJSwrapper(self, required_version='2.0')
        webpage, _ = phantom.get(page_url, html=webpage, video_id=video_id, headers=headers)

        # The stream id lives in an element whose id has varied; the regexes
        # are the fallback when none of the known element ids is present.
        decoded_id = (get_element_by_id('streamurl', webpage) or
                      get_element_by_id('streamuri', webpage) or
                      get_element_by_id('streamurj', webpage) or
                      self._search_regex(
                          (r'>\s*([\w-]+~\d{10,}~\d+\.\d+\.0\.0~[\w-]+)\s*<',
                           r'>\s*([\w~-]+~\d+\.\d+\.\d+\.\d+~[\w~-]+)',
                           r'>\s*([\w-]+~\d{10,}~(?:[a-f\d]+:){2}:~[\w-]+)\s*<',
                           r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)\s*<',
                           r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)'), webpage,
                          'stream URL'))

        video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id

        # Title sources in order of preference; only the last one is fatal
        title = self._og_search_title(webpage, default=None) or self._search_regex(
            r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage,
            'title', default=None) or self._html_search_meta(
            'description', webpage, 'title', fatal=True)

        entries = self._parse_html5_media_entries(page_url, webpage, video_id)
        entry = entries[0] if entries else {}
        subtitles = entry.get('subtitles')

        info_dict = {
            'id': video_id,
            'title': title,
            'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None),
            'url': video_url,
            # Seems all videos have extensions in their titles
            'ext': determine_ext(title, 'mp4'),
            'subtitles': subtitles,
            'http_headers': headers,
        }
        return info_dict