youtube_dl/extractor/openload.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import json
   5 import os
   6 import random
   7 import re
   8 import subprocess
   9 import tempfile
  10
  11 from .common import InfoExtractor
  12 from ..compat import (
  13     compat_urlparse,
  14     compat_kwargs,
  15 )
  16 from ..utils import (
  17     check_executable,
  18     determine_ext,
  19     encodeArgument,
  20     ExtractorError,
  21     get_element_by_id,
  22     get_exe_version,
  23     is_outdated_version,
  24     std_headers,
  25 )
  26
  27
  28 def cookie_to_dict(cookie):
  29     cookie_dict = {
  30         'name': cookie.name,
  31         'value': cookie.value,
  32     }
  33     if cookie.port_specified:
  34         cookie_dict['port'] = cookie.port
  35     if cookie.domain_specified:
  36         cookie_dict['domain'] = cookie.domain
  37     if cookie.path_specified:
  38         cookie_dict['path'] = cookie.path
  39     if cookie.expires is not None:
  40         cookie_dict['expires'] = cookie.expires
  41     if cookie.secure is not None:
  42         cookie_dict['secure'] = cookie.secure
  43     if cookie.discard is not None:
  44         cookie_dict['discard'] = cookie.discard
  45     try:
  46         if (cookie.has_nonstandard_attr('httpOnly') or
  47                 cookie.has_nonstandard_attr('httponly') or
  48                 cookie.has_nonstandard_attr('HttpOnly')):
  49             cookie_dict['httponly'] = True
  50     except TypeError:
  51         pass
  52     return cookie_dict
  53
  54
  55 def cookie_jar_to_list(cookie_jar):
  56     return [cookie_to_dict(cookie) for cookie in cookie_jar]
  57
  58
  59 class PhantomJSwrapper(object):
  60     """PhantomJS wrapper class
  61
  62     This class is experimental.
  63     """
  64
  65     _TEMPLATE = r'''
  66         phantom.onError = function(msg, trace) {{
  67           var msgStack = ['PHANTOM ERROR: ' + msg];
  68           if(trace && trace.length) {{
  69             msgStack.push('TRACE:');
  70             trace.forEach(function(t) {{
  71               msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line
  72                 + (t.function ? ' (in function ' + t.function +')' : ''));
  73             }});
  74           }}
  75           console.error(msgStack.join('\n'));
  76           phantom.exit(1);
  77         }};
  78         var page = require('webpage').create();
  79         var fs = require('fs');
  80         var read = {{ mode: 'r', charset: 'utf-8' }};
  81         var write = {{ mode: 'w', charset: 'utf-8' }};
  82         JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{
  83           phantom.addCookie(x);
  84         }});
  85         page.settings.resourceTimeout = {timeout};
  86         page.settings.userAgent = "{ua}";
  87         page.onLoadStarted = function() {{
  88           page.evaluate(function() {{
  89             delete window._phantom;
  90             delete window.callPhantom;
  91           }});
  92         }};
  93         var saveAndExit = function() {{
  94           fs.write("{html}", page.content, write);
  95           fs.write("{cookies}", JSON.stringify(phantom.cookies), write);
  96           phantom.exit();
  97         }};
  98         page.onLoadFinished = function(status) {{
  99           if(page.url === "") {{
 100             page.setContent(fs.read("{html}", read), "{url}");
 101           }}
 102           else {{
 103             {jscode}
 104           }}
 105         }};
 106         page.open("");
 107     '''
 108
 109     _TMP_FILE_NAMES = ['script', 'html', 'cookies']
 110
 111     @staticmethod
 112     def _version():
 113         return get_exe_version('phantomjs', version_re=r'([0-9.]+)')
 114
 115     def __init__(self, extractor, required_version=None, timeout=10000):
 116         self._TMP_FILES = {}
 117
 118         self.exe = check_executable('phantomjs', ['-v'])
 119         if not self.exe:
 120             raise ExtractorError('PhantomJS executable not found in PATH, '
 121                                  'download it from http://phantomjs.org',
 122                                  expected=True)
 123
 124         self.extractor = extractor
 125
 126         if required_version:
 127             version = self._version()
 128             if is_outdated_version(version, required_version):
 129                 self.extractor._downloader.report_warning(
 130                     'Your copy of PhantomJS is outdated, update it to version '
 131                     '%s or newer if you encounter any errors.' % required_version)
 132
 133         self.options = {
 134             'timeout': timeout,
 135         }
 136         for name in self._TMP_FILE_NAMES:
 137             tmp = tempfile.NamedTemporaryFile(delete=False)
 138             tmp.close()
 139             self._TMP_FILES[name] = tmp
 140
 141     def __del__(self):
 142         for name in self._TMP_FILE_NAMES:
 143             try:
 144                 os.remove(self._TMP_FILES[name].name)
 145             except (IOError, OSError, KeyError):
 146                 pass
 147
 148     def _save_cookies(self, url):
 149         cookies = cookie_jar_to_list(self.extractor._downloader.cookiejar)
 150         for cookie in cookies:
 151             if 'path' not in cookie:
 152                 cookie['path'] = '/'
 153             if 'domain' not in cookie:
 154                 cookie['domain'] = compat_urlparse.urlparse(url).netloc
 155         with open(self._TMP_FILES['cookies'].name, 'wb') as f:
 156             f.write(json.dumps(cookies).encode('utf-8'))
 157
 158     def _load_cookies(self):
 159         with open(self._TMP_FILES['cookies'].name, 'rb') as f:
 160             cookies = json.loads(f.read().decode('utf-8'))
 161         for cookie in cookies:
 162             if cookie['httponly'] is True:
 163                 cookie['rest'] = {'httpOnly': None}
 164             if 'expiry' in cookie:
 165                 cookie['expire_time'] = cookie['expiry']
 166             self.extractor._set_cookie(**compat_kwargs(cookie))
 167
 168     def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'):
 169         """
 170         Downloads webpage (if needed) and executes JS
 171
 172         Params:
 173             url: website url
 174             html: optional, html code of website
 175             video_id: video id
 176             note: optional, displayed when downloading webpage
 177             note2: optional, displayed when executing JS
 178             headers: custom http headers
 179             jscode: code to be executed when page is loaded
 180
 181         Returns tuple with:
 182             * downloaded website (after JS execution)
 183             * anything you print with `console.log` (but not inside `page.execute`!)
 184
 185         In most cases you don't need to add any `jscode`.
 186         It is executed in `page.onLoadFinished`.
 187         `saveAndExit();` is mandatory, use it instead of `phantom.exit()`
 188         It is possible to wait for some element on the webpage, for example:
 189             var check = function() {
 190               var elementFound = page.evaluate(function() {
 191                 return document.querySelector('#b.done') !== null;
 192               });
 193               if(elementFound)
 194                 saveAndExit();
 195               else
 196                 window.setTimeout(check, 500);
 197             }
 198
 199             page.evaluate(function(){
 200               document.querySelector('#a').click();
 201             });
 202             check();
 203         """
 204         if 'saveAndExit();' not in jscode:
 205             raise ExtractorError('`saveAndExit();` not found in `jscode`')
 206         if not html:
 207             html = self.extractor._download_webpage(url, video_id, note=note, headers=headers)
 208         with open(self._TMP_FILES['html'].name, 'wb') as f:
 209             f.write(html.encode('utf-8'))
 210
 211         self._save_cookies(url)
 212
 213         replaces = self.options
 214         replaces['url'] = url
 215         user_agent = headers.get('User-Agent') or std_headers['User-Agent']
 216         replaces['ua'] = user_agent.replace('"', '\\"')
 217         replaces['jscode'] = jscode
 218
 219         for x in self._TMP_FILE_NAMES:
 220             replaces[x] = self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"')
 221
 222         with open(self._TMP_FILES['script'].name, 'wb') as f:
 223             f.write(self._TEMPLATE.format(**replaces).encode('utf-8'))
 224
 225         if video_id is None:
 226             self.extractor.to_screen('%s' % (note2,))
 227         else:
 228             self.extractor.to_screen('%s: %s' % (video_id, note2))
 229
 230         p = subprocess.Popen([
 231             self.exe, '--ssl-protocol=any',
 232             self._TMP_FILES['script'].name
 233         ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 234         out, err = p.communicate()
 235         if p.returncode != 0:
 236             raise ExtractorError(
 237                 'Executing JS failed\n:' + encodeArgument(err))
 238         with open(self._TMP_FILES['html'].name, 'rb') as f:
 239             html = f.read().decode('utf-8')
 240
 241         self._load_cookies()
 242
 243         return (html, encodeArgument(out))
 244
 245
 246 class OpenloadIE(InfoExtractor):
 247     _DOMAINS = r'(?:openload\.(?:co|io|link|pw)|oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|pw|live|space)|oladblock\.(?:services|xyz|me))'
 248     _VALID_URL = r'''(?x)
 249                     https?://
 250                         (?P<host>
 251                             (?:www\.)?
 252                             %s
 253                         )/
 254                         (?:f|embed)/
 255                         (?P<id>[a-zA-Z0-9-_]+)
 256                     ''' % _DOMAINS
 257
 258     _TESTS = [{
 259         'url': 'https://openload.co/f/kUEfGclsU9o',
 260         'md5': 'bf1c059b004ebc7a256f89408e65c36e',
 261         'info_dict': {
 262             'id': 'kUEfGclsU9o',
 263             'ext': 'mp4',
 264             'title': 'skyrim_no-audio_1080.mp4',
 265             'thumbnail': r're:^https?://.*\.jpg$',
 266         },
 267     }, {
 268         'url': 'https://openload.co/embed/rjC09fkPLYs',
 269         'info_dict': {
 270             'id': 'rjC09fkPLYs',
 271             'ext': 'mp4',
 272             'title': 'movie.mp4',
 273             'thumbnail': r're:^https?://.*\.jpg$',
 274             'subtitles': {
 275                 'en': [{
 276                     'ext': 'vtt',
 277                 }],
 278             },
 279         },
 280         'params': {
 281             'skip_download': True,  # test subtitles only
 282         },
 283     }, {
 284         'url': 'https://openload.co/embed/kUEfGclsU9o/skyrim_no-audio_1080.mp4',
 285         'only_matching': True,
 286     }, {
 287         'url': 'https://openload.io/f/ZAn6oz-VZGE/',
 288         'only_matching': True,
 289     }, {
 290         'url': 'https://openload.co/f/_-ztPaZtMhM/',
 291         'only_matching': True,
 292     }, {
 293         # unavailable via https://openload.co/f/Sxz5sADo82g/, different layout
 294         # for title and ext
 295         'url': 'https://openload.co/embed/Sxz5sADo82g/',
 296         'only_matching': True,
 297     }, {
 298         # unavailable via https://openload.co/embed/e-Ixz9ZR5L0/ but available
 299         # via https://openload.co/f/e-Ixz9ZR5L0/
 300         'url': 'https://openload.co/f/e-Ixz9ZR5L0/',
 301         'only_matching': True,
 302     }, {
 303         'url': 'https://oload.tv/embed/KnG-kKZdcfY/',
 304         'only_matching': True,
 305     }, {
 306         'url': 'http://www.openload.link/f/KnG-kKZdcfY',
 307         'only_matching': True,
 308     }, {
 309         'url': 'https://oload.stream/f/KnG-kKZdcfY',
 310         'only_matching': True,
 311     }, {
 312         'url': 'https://oload.xyz/f/WwRBpzW8Wtk',
 313         'only_matching': True,
 314     }, {
 315         'url': 'https://oload.win/f/kUEfGclsU9o',
 316         'only_matching': True,
 317     }, {
 318         'url': 'https://oload.download/f/kUEfGclsU9o',
 319         'only_matching': True,
 320     }, {
 321         'url': 'https://oload.cloud/f/4ZDnBXRWiB8',
 322         'only_matching': True,
 323     }, {
 324         # Its title has not got its extension but url has it
 325         'url': 'https://oload.download/f/N4Otkw39VCw/Tomb.Raider.2018.HDRip.XviD.AC3-EVO.avi.mp4',
 326         'only_matching': True,
 327     }, {
 328         'url': 'https://oload.cc/embed/5NEAbI2BDSk',
 329         'only_matching': True,
 330     }, {
 331         'url': 'https://oload.icu/f/-_i4y_F_Hs8',
 332         'only_matching': True,
 333     }, {
 334         'url': 'https://oload.fun/f/gb6G1H4sHXY',
 335         'only_matching': True,
 336     }, {
 337         'url': 'https://oload.club/f/Nr1L-aZ2dbQ',
 338         'only_matching': True,
 339     }, {
 340         'url': 'https://oload.info/f/5NEAbI2BDSk',
 341         'only_matching': True,
 342     }, {
 343         'url': 'https://openload.pw/f/WyKgK8s94N0',
 344         'only_matching': True,
 345     }, {
 346         'url': 'https://oload.pw/f/WyKgK8s94N0',
 347         'only_matching': True,
 348     }, {
 349         'url': 'https://oload.live/f/-Z58UZ-GR4M',
 350         'only_matching': True,
 351     }, {
 352         'url': 'https://oload.space/f/IY4eZSst3u8/',
 353         'only_matching': True,
 354     }, {
 355         'url': 'https://oladblock.services/f/b8NWEgkqNLI/',
 356         'only_matching': True,
 357     }, {
 358         'url': 'https://oladblock.xyz/f/b8NWEgkqNLI/',
 359         'only_matching': True,
 360     }, {
 361         'url': 'https://oladblock.me/f/b8NWEgkqNLI/',
 362         'only_matching': True,
 363     }]
 364
 365     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{major}.0.{build}.{patch} Safari/537.36'
 366
 367     @staticmethod
 368     def _extract_urls(webpage):
 369         return re.findall(
 370             r'<iframe[^>]+src=["\']((?:https?://)?%s/embed/[a-zA-Z0-9-_]+)'
 371             % OpenloadIE._DOMAINS, webpage)
 372
 373     def _real_extract(self, url):
 374         mobj = re.match(self._VALID_URL, url)
 375         host = mobj.group('host')
 376         video_id = mobj.group('id')
 377
 378         url_pattern = 'https://%s/%%s/%s/' % (host, video_id)
 379         headers = {
 380             'User-Agent': self._USER_AGENT_TPL % {
 381                 'major': random.randint(63, 73),
 382                 'build': random.randint(3239, 3683),
 383                 'patch': random.randint(0, 100),
 384             },
 385         }
 386
 387         for path in ('embed', 'f'):
 388             page_url = url_pattern % path
 389             last = path == 'f'
 390             webpage = self._download_webpage(
 391                 page_url, video_id, 'Downloading %s webpage' % path,
 392                 headers=headers, fatal=last)
 393             if not webpage:
 394                 continue
 395             if 'File not found' in webpage or 'deleted by the owner' in webpage:
 396                 if not last:
 397                     continue
 398                 raise ExtractorError('File not found', expected=True, video_id=video_id)
 399             break
 400
 401         phantom = PhantomJSwrapper(self, required_version='2.0')
 402         webpage, _ = phantom.get(page_url, html=webpage, video_id=video_id, headers=headers)
 403
 404         decoded_id = (get_element_by_id('streamurl', webpage) or
 405                       get_element_by_id('streamuri', webpage) or
 406                       get_element_by_id('streamurj', webpage) or
 407                       self._search_regex(
 408                           (r'>\s*([\w-]+~\d{10,}~\d+\.\d+\.0\.0~[\w-]+)\s*<',
 409                            r'>\s*([\w~-]+~\d+\.\d+\.\d+\.\d+~[\w~-]+)',
 410                            r'>\s*([\w-]+~\d{10,}~(?:[a-f\d]+:){2}:~[\w-]+)\s*<',
 411                            r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)\s*<',
 412                            r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)'), webpage,
 413                           'stream URL'))
 414
 415         video_url = 'https://%s/stream/%s?mime=true' % (host, decoded_id)
 416
 417         title = self._og_search_title(webpage, default=None) or self._search_regex(
 418             r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage,
 419             'title', default=None) or self._html_search_meta(
 420             'description', webpage, 'title', fatal=True)
 421
 422         entries = self._parse_html5_media_entries(page_url, webpage, video_id)
 423         entry = entries[0] if entries else {}
 424         subtitles = entry.get('subtitles')
 425
 426         return {
 427             'id': video_id,
 428             'title': title,
 429             'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None),
 430             'url': video_url,
 431             'ext': determine_ext(title, None) or determine_ext(url, 'mp4'),
 432             'subtitles': subtitles,
 433             'http_headers': headers,
 434         }