youtube_dl/YoutubeDL.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import io
   7 import os
   8 import re
   9 import shutil
  10 import socket
  11 import sys
  12 import time
  13 import traceback
  14
  15 from .utils import *
  16 from .extractor import get_info_extractor, gen_extractors
  17 from .FileDownloader import FileDownloader
  18
  19
  20 class YoutubeDL(object):
  21     """YoutubeDL class.
  22
  23     YoutubeDL objects are the ones responsible of downloading the
  24     actual video file and writing it to disk if the user has requested
  25     it, among some other tasks. In most cases there should be one per
  26     program. As, given a video URL, the downloader doesn't know how to
  27     extract all the needed information, task that InfoExtractors do, it
  28     has to pass the URL to one of them.
  29
  30     For this, YoutubeDL objects have a method that allows
  31     InfoExtractors to be registered in a given order. When it is passed
  32     a URL, the YoutubeDL object handles it to the first InfoExtractor it
  33     finds that reports being able to handle it. The InfoExtractor extracts
  34     all the information about the video or videos the URL refers to, and
  35     YoutubeDL process the extracted information, possibly using a File
  36     Downloader to download the video.
  37
  38     YoutubeDL objects accept a lot of parameters. In order not to saturate
  39     the object constructor with arguments, it receives a dictionary of
  40     options instead. These options are available through the params
  41     attribute for the InfoExtractors to use. The YoutubeDL also
  42     registers itself as the downloader in charge for the InfoExtractors
  43     that are added to it, so this is a "mutual registration".
  44
  45     Available options:
  46
  47     username:          Username for authentication purposes.
  48     password:          Password for authentication purposes.
  49     videopassword:     Password for acces a video.
  50     usenetrc:          Use netrc for authentication instead.
  51     verbose:           Print additional info to stdout.
  52     quiet:             Do not print messages to stdout.
  53     forceurl:          Force printing final URL.
  54     forcetitle:        Force printing title.
  55     forceid:           Force printing ID.
  56     forcethumbnail:    Force printing thumbnail URL.
  57     forcedescription:  Force printing description.
  58     forcefilename:     Force printing final filename.
  59     simulate:          Do not download the video files.
  60     format:            Video format code.
  61     format_limit:      Highest quality format to try.
  62     outtmpl:           Template for output names.
  63     restrictfilenames: Do not allow "&" and spaces in file names
  64     ignoreerrors:      Do not stop on download errors.
  65     nooverwrites:      Prevent overwriting files.
  66     playliststart:     Playlist item to start at.
  67     playlistend:       Playlist item to end at.
  68     matchtitle:        Download only matching titles.
  69     rejecttitle:       Reject downloads for matching titles.
  70     logtostderr:       Log messages to stderr instead of stdout.
  71     writedescription:  Write the video description to a .description file
  72     writeinfojson:     Write the video description to a .info.json file
  73     writethumbnail:    Write the thumbnail image to a file
  74     writesubtitles:    Write the video subtitles to a file
  75     writeautomaticsub: Write the automatic subtitles to a file
  76     allsubtitles:      Downloads all the subtitles of the video
  77                        (requires writesubtitles or writeautomaticsub)
  78     listsubtitles:     Lists all available subtitles for the video
  79     subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt)
  80     subtitleslangs:    List of languages of the subtitles to download
  81     keepvideo:         Keep the video file after post-processing
  82     daterange:         A DateRange object, download only if the upload_date is in the range.
  83     skip_download:     Skip the actual download of the video file
  84
  85     The following parameters are not used by YoutubeDL itself, they are used by
  86     the FileDownloader:
  87     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
  88     noresizebuffer, retries, continuedl, noprogress, consoletitle
  89     """
  90
  91     params = None
  92     _ies = []
  93     _pps = []
  94     _download_retcode = None
  95     _num_downloads = None
  96     _screen_file = None
  97
  98     def __init__(self, params):
  99         """Create a FileDownloader object with the given options."""
 100         self._ies = []
 101         self._ies_instances = {}
 102         self._pps = []
 103         self._progress_hooks = []
 104         self._download_retcode = 0
 105         self._num_downloads = 0
 106         self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 107         self.params = params
 108         self.fd = FileDownloader(self, self.params)
 109
 110         if '%(stitle)s' in self.params['outtmpl']:
 111             self.report_warning(u'%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
 112
 113     def add_info_extractor(self, ie):
 114         """Add an InfoExtractor object to the end of the list."""
 115         self._ies.append(ie)
 116         self._ies_instances[ie.ie_key()] = ie
 117         ie.set_downloader(self)
 118
 119     def get_info_extractor(self, ie_key):
 120         """
 121         Get an instance of an IE with name ie_key, it will try to get one from
 122         the _ies list, if there's no instance it will create a new one and add
 123         it to the extractor list.
 124         """
 125         ie = self._ies_instances.get(ie_key)
 126         if ie is None:
 127             ie = get_info_extractor(ie_key)()
 128             self.add_info_extractor(ie)
 129         return ie
 130
 131     def add_default_info_extractors(self):
 132         """
 133         Add the InfoExtractors returned by gen_extractors to the end of the list
 134         """
 135         for ie in gen_extractors():
 136             self.add_info_extractor(ie)
 137
 138     def add_post_processor(self, pp):
 139         """Add a PostProcessor object to the end of the chain."""
 140         self._pps.append(pp)
 141         pp.set_downloader(self)
 142
 143     def to_screen(self, message, skip_eol=False):
 144         """Print message to stdout if not in quiet mode."""
 145         if not self.params.get('quiet', False):
 146             terminator = [u'\n', u''][skip_eol]
 147             output = message + terminator
 148             write_string(output, self._screen_file)
 149
 150     def to_stderr(self, message):
 151         """Print message to stderr."""
 152         assert type(message) == type(u'')
 153         output = message + u'\n'
 154         if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
 155             output = output.encode(preferredencoding())
 156         sys.stderr.write(output)
 157
 158     def fixed_template(self):
 159         """Checks if the output template is fixed."""
 160         return (re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl']) is None)
 161
 162     def trouble(self, message=None, tb=None):
 163         """Determine action to take when a download problem appears.
 164
 165         Depending on if the downloader has been configured to ignore
 166         download errors or not, this method may throw an exception or
 167         not when errors are found, after printing the message.
 168
 169         tb, if given, is additional traceback information.
 170         """
 171         if message is not None:
 172             self.to_stderr(message)
 173         if self.params.get('verbose'):
 174             if tb is None:
 175                 if sys.exc_info()[0]:  # if .trouble has been called from an except block
 176                     tb = u''
 177                     if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 178                         tb += u''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
 179                     tb += compat_str(traceback.format_exc())
 180                 else:
 181                     tb_data = traceback.format_list(traceback.extract_stack())
 182                     tb = u''.join(tb_data)
 183             self.to_stderr(tb)
 184         if not self.params.get('ignoreerrors', False):
 185             if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 186                 exc_info = sys.exc_info()[1].exc_info
 187             else:
 188                 exc_info = sys.exc_info()
 189             raise DownloadError(message, exc_info)
 190         self._download_retcode = 1
 191
 192     def report_warning(self, message):
 193         '''
 194         Print the message to stderr, it will be prefixed with 'WARNING:'
 195         If stderr is a tty file the 'WARNING:' will be colored
 196         '''
 197         if sys.stderr.isatty() and os.name != 'nt':
 198             _msg_header=u'\033[0;33mWARNING:\033[0m'
 199         else:
 200             _msg_header=u'WARNING:'
 201         warning_message=u'%s %s' % (_msg_header,message)
 202         self.to_stderr(warning_message)
 203
 204     def report_error(self, message, tb=None):
 205         '''
 206         Do the same as trouble, but prefixes the message with 'ERROR:', colored
 207         in red if stderr is a tty file.
 208         '''
 209         if sys.stderr.isatty() and os.name != 'nt':
 210             _msg_header = u'\033[0;31mERROR:\033[0m'
 211         else:
 212             _msg_header = u'ERROR:'
 213         error_message = u'%s %s' % (_msg_header, message)
 214         self.trouble(error_message, tb)
 215
 216     def slow_down(self, start_time, byte_counter):
 217         """Sleep if the download speed is over the rate limit."""
 218         rate_limit = self.params.get('ratelimit', None)
 219         if rate_limit is None or byte_counter == 0:
 220             return
 221         now = time.time()
 222         elapsed = now - start_time
 223         if elapsed <= 0.0:
 224             return
 225         speed = float(byte_counter) / elapsed
 226         if speed > rate_limit:
 227             time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 228
 229     def report_writedescription(self, descfn):
 230         """ Report that the description file is being written """
 231         self.to_screen(u'[info] Writing video description to: ' + descfn)
 232
 233     def report_writesubtitles(self, sub_filename):
 234         """ Report that the subtitles file is being written """
 235         self.to_screen(u'[info] Writing video subtitles to: ' + sub_filename)
 236
 237     def report_writeinfojson(self, infofn):
 238         """ Report that the metadata file has been written """
 239         self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
 240
 241     def report_file_already_downloaded(self, file_name):
 242         """Report file has already been fully downloaded."""
 243         try:
 244             self.to_screen(u'[download] %s has already been downloaded' % file_name)
 245         except (UnicodeEncodeError) as err:
 246             self.to_screen(u'[download] The file has already been downloaded')
 247
 248     def increment_downloads(self):
 249         """Increment the ordinal that assigns a number to each file."""
 250         self._num_downloads += 1
 251
 252     def prepare_filename(self, info_dict):
 253         """Generate the output filename."""
 254         try:
 255             template_dict = dict(info_dict)
 256
 257             template_dict['epoch'] = int(time.time())
 258             autonumber_size = self.params.get('autonumber_size')
 259             if autonumber_size is None:
 260                 autonumber_size = 5
 261             autonumber_templ = u'%0' + str(autonumber_size) + u'd'
 262             template_dict['autonumber'] = autonumber_templ % self._num_downloads
 263             if template_dict['playlist_index'] is not None:
 264                 template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index']
 265
 266             sanitize = lambda k,v: sanitize_filename(
 267                 u'NA' if v is None else compat_str(v),
 268                 restricted=self.params.get('restrictfilenames'),
 269                 is_id=(k==u'id'))
 270             template_dict = dict((k, sanitize(k, v)) for k,v in template_dict.items())
 271
 272             filename = self.params['outtmpl'] % template_dict
 273             return filename
 274         except KeyError as err:
 275             self.report_error(u'Erroneous output template')
 276             return None
 277         except ValueError as err:
 278             self.report_error(u'Error in output template: ' + str(err) + u' (encoding: ' + repr(preferredencoding()) + ')')
 279             return None
 280
 281     def _match_entry(self, info_dict):
 282         """ Returns None iff the file should be downloaded """
 283
 284         title = info_dict['title']
 285         matchtitle = self.params.get('matchtitle', False)
 286         if matchtitle:
 287             if not re.search(matchtitle, title, re.IGNORECASE):
 288                 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
 289         rejecttitle = self.params.get('rejecttitle', False)
 290         if rejecttitle:
 291             if re.search(rejecttitle, title, re.IGNORECASE):
 292                 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
 293         date = info_dict.get('upload_date', None)
 294         if date is not None:
 295             dateRange = self.params.get('daterange', DateRange())
 296             if date not in dateRange:
 297                 return u'[download] %s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
 298         return None
 299
 300     def extract_info(self, url, download=True, ie_key=None, extra_info={}):
 301         '''
 302         Returns a list with a dictionary for each video we find.
 303         If 'download', also downloads the videos.
 304         extra_info is a dict containing the extra values to add to each result
 305          '''
 306
 307         if ie_key:
 308             ies = [self.get_info_extractor(ie_key)]
 309         else:
 310             ies = self._ies
 311
 312         for ie in ies:
 313             if not ie.suitable(url):
 314                 continue
 315
 316             if not ie.working():
 317                 self.report_warning(u'The program functionality for this site has been marked as broken, '
 318                                     u'and will probably not work.')
 319
 320             try:
 321                 ie_result = ie.extract(url)
 322                 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
 323                     break
 324                 if isinstance(ie_result, list):
 325                     # Backwards compatibility: old IE result format
 326                     for result in ie_result:
 327                         result.update(extra_info)
 328                     ie_result = {
 329                         '_type': 'compat_list',
 330                         'entries': ie_result,
 331                     }
 332                 else:
 333                     ie_result.update(extra_info)
 334                 if 'extractor' not in ie_result:
 335                     ie_result['extractor'] = ie.IE_NAME
 336                 return self.process_ie_result(ie_result, download=download)
 337             except ExtractorError as de: # An error we somewhat expected
 338                 self.report_error(compat_str(de), de.format_traceback())
 339                 break
 340             except Exception as e:
 341                 if self.params.get('ignoreerrors', False):
 342                     self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
 343                     break
 344                 else:
 345                     raise
 346         else:
 347             self.report_error(u'no suitable InfoExtractor: %s' % url)
 348
 349     def process_ie_result(self, ie_result, download=True, extra_info={}):
 350         """
 351         Take the result of the ie(may be modified) and resolve all unresolved
 352         references (URLs, playlist items).
 353
 354         It will also download the videos if 'download'.
 355         Returns the resolved ie_result.
 356         """
 357
 358         result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
 359         if result_type == 'video':
 360             ie_result.update(extra_info)
 361             if 'playlist' not in ie_result:
 362                 # It isn't part of a playlist
 363                 ie_result['playlist'] = None
 364                 ie_result['playlist_index'] = None
 365             if download:
 366                 self.process_info(ie_result)
 367             return ie_result
 368         elif result_type == 'url':
 369             # We have to add extra_info to the results because it may be
 370             # contained in a playlist
 371             return self.extract_info(ie_result['url'],
 372                                      download,
 373                                      ie_key=ie_result.get('ie_key'),
 374                                      extra_info=extra_info)
 375         elif result_type == 'playlist':
 376             # We process each entry in the playlist
 377             playlist = ie_result.get('title', None) or ie_result.get('id', None)
 378             self.to_screen(u'[download] Downloading playlist: %s'  % playlist)
 379
 380             playlist_results = []
 381
 382             n_all_entries = len(ie_result['entries'])
 383             playliststart = self.params.get('playliststart', 1) - 1
 384             playlistend = self.params.get('playlistend', -1)
 385
 386             if playlistend == -1:
 387                 entries = ie_result['entries'][playliststart:]
 388             else:
 389                 entries = ie_result['entries'][playliststart:playlistend]
 390
 391             n_entries = len(entries)
 392
 393             self.to_screen(u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
 394                 (ie_result['extractor'], playlist, n_all_entries, n_entries))
 395
 396             for i,entry in enumerate(entries,1):
 397                 self.to_screen(u'[download] Downloading video #%s of %s' %(i, n_entries))
 398                 extra = {
 399                          'playlist': playlist,
 400                          'playlist_index': i + playliststart,
 401                          }
 402                 if not 'extractor' in entry:
 403                     # We set the extractor, if it's an url it will be set then to
 404                     # the new extractor, but if it's already a video we must make
 405                     # sure it's present: see issue #877
 406                     entry['extractor'] = ie_result['extractor']
 407                 entry_result = self.process_ie_result(entry,
 408                                                       download=download,
 409                                                       extra_info=extra)
 410                 playlist_results.append(entry_result)
 411             ie_result['entries'] = playlist_results
 412             return ie_result
 413         elif result_type == 'compat_list':
 414             def _fixup(r):
 415                 r.setdefault('extractor', ie_result['extractor'])
 416                 return r
 417             ie_result['entries'] = [
 418                 self.process_ie_result(_fixup(r), download=download)
 419                 for r in ie_result['entries']
 420             ]
 421             return ie_result
 422         else:
 423             raise Exception('Invalid result type: %s' % result_type)
 424
 425     def process_info(self, info_dict):
 426         """Process a single resolved IE result."""
 427
 428         assert info_dict.get('_type', 'video') == 'video'
 429         #We increment the download the download count here to match the previous behaviour.
 430         self.increment_downloads()
 431
 432         info_dict['fulltitle'] = info_dict['title']
 433         if len(info_dict['title']) > 200:
 434             info_dict['title'] = info_dict['title'][:197] + u'...'
 435
 436         # Keep for backwards compatibility
 437         info_dict['stitle'] = info_dict['title']
 438
 439         if not 'format' in info_dict:
 440             info_dict['format'] = info_dict['ext']
 441
 442         reason = self._match_entry(info_dict)
 443         if reason is not None:
 444             self.to_screen(u'[download] ' + reason)
 445             return
 446
 447         max_downloads = self.params.get('max_downloads')
 448         if max_downloads is not None:
 449             if self._num_downloads > int(max_downloads):
 450                 raise MaxDownloadsReached()
 451
 452         filename = self.prepare_filename(info_dict)
 453
 454         # Forced printings
 455         if self.params.get('forcetitle', False):
 456             compat_print(info_dict['title'])
 457         if self.params.get('forceid', False):
 458             compat_print(info_dict['id'])
 459         if self.params.get('forceurl', False):
 460             # For RTMP URLs, also include the playpath
 461             compat_print(info_dict['url'] + info_dict.get('play_path', u''))
 462         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
 463             compat_print(info_dict['thumbnail'])
 464         if self.params.get('forcedescription', False) and 'description' in info_dict:
 465             compat_print(info_dict['description'])
 466         if self.params.get('forcefilename', False) and filename is not None:
 467             compat_print(filename)
 468         if self.params.get('forceformat', False):
 469             compat_print(info_dict['format'])
 470
 471         # Do nothing else if in simulate mode
 472         if self.params.get('simulate', False):
 473             return
 474
 475         if filename is None:
 476             return
 477
 478         try:
 479             dn = os.path.dirname(encodeFilename(filename))
 480             if dn != '' and not os.path.exists(dn):
 481                 os.makedirs(dn)
 482         except (OSError, IOError) as err:
 483             self.report_error(u'unable to create directory ' + compat_str(err))
 484             return
 485
 486         if self.params.get('writedescription', False):
 487             try:
 488                 descfn = filename + u'.description'
 489                 self.report_writedescription(descfn)
 490                 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
 491                     descfile.write(info_dict['description'])
 492             except (KeyError, TypeError):
 493                 self.report_warning(u'There\'s no description to write.')
 494             except (OSError, IOError):
 495                 self.report_error(u'Cannot write description file ' + descfn)
 496                 return
 497
 498         subtitles_are_requested = any([self.params.get('writesubtitles', False),
 499                                        self.params.get('writeautomaticsub')])
 500
 501         if  subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
 502             # subtitles download errors are already managed as troubles in relevant IE
 503             # that way it will silently go on when used with unsupporting IE
 504             subtitles = info_dict['subtitles']
 505             sub_format = self.params.get('subtitlesformat')
 506             for sub_lang in subtitles.keys():
 507                 sub = subtitles[sub_lang]
 508                 if sub is None:
 509                     continue
 510                 try:
 511                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
 512                     self.report_writesubtitles(sub_filename)
 513                     with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
 514                             subfile.write(sub)
 515                 except (OSError, IOError):
 516                     self.report_error(u'Cannot write subtitles file ' + descfn)
 517                     return
 518
 519         if self.params.get('writeinfojson', False):
 520             infofn = filename + u'.info.json'
 521             self.report_writeinfojson(infofn)
 522             try:
 523                 json_info_dict = dict((k, v) for k,v in info_dict.items() if not k in ['urlhandle'])
 524                 write_json_file(json_info_dict, encodeFilename(infofn))
 525             except (OSError, IOError):
 526                 self.report_error(u'Cannot write metadata to JSON file ' + infofn)
 527                 return
 528
 529         if self.params.get('writethumbnail', False):
 530             if info_dict.get('thumbnail') is not None:
 531                 thumb_format = determine_ext(info_dict['thumbnail'], u'jpg')
 532                 thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format
 533                 self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
 534                                (info_dict['extractor'], info_dict['id']))
 535                 uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
 536                 with open(thumb_filename, 'wb') as thumbf:
 537                     shutil.copyfileobj(uf, thumbf)
 538                 self.to_screen(u'[%s] %s: Writing thumbnail to: %s' %
 539                                (info_dict['extractor'], info_dict['id'], thumb_filename))
 540
 541         if not self.params.get('skip_download', False):
 542             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
 543                 success = True
 544             else:
 545                 try:
 546                     success = self.fd._do_download(filename, info_dict)
 547                 except (OSError, IOError) as err:
 548                     raise UnavailableVideoError(err)
 549                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 550                     self.report_error(u'unable to download video data: %s' % str(err))
 551                     return
 552                 except (ContentTooShortError, ) as err:
 553                     self.report_error(u'content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 554                     return
 555
 556             if success:
 557                 try:
 558                     self.post_process(filename, info_dict)
 559                 except (PostProcessingError) as err:
 560                     self.report_error(u'postprocessing: %s' % str(err))
 561                     return
 562
 563     def download(self, url_list):
 564         """Download a given list of URLs."""
 565         if len(url_list) > 1 and self.fixed_template():
 566             raise SameFileError(self.params['outtmpl'])
 567
 568         for url in url_list:
 569             try:
 570                 #It also downloads the videos
 571                 videos = self.extract_info(url)
 572             except UnavailableVideoError:
 573                 self.report_error(u'unable to download video')
 574             except MaxDownloadsReached:
 575                 self.to_screen(u'[info] Maximum number of downloaded files reached.')
 576                 raise
 577
 578         return self._download_retcode
 579
 580     def post_process(self, filename, ie_info):
 581         """Run all the postprocessors on the given file."""
 582         info = dict(ie_info)
 583         info['filepath'] = filename
 584         keep_video = None
 585         for pp in self._pps:
 586             try:
 587                 keep_video_wish,new_info = pp.run(info)
 588                 if keep_video_wish is not None:
 589                     if keep_video_wish:
 590                         keep_video = keep_video_wish
 591                     elif keep_video is None:
 592                         # No clear decision yet, let IE decide
 593                         keep_video = keep_video_wish
 594             except PostProcessingError as e:
 595                 self.report_error(e.msg)
 596         if keep_video is False and not self.params.get('keepvideo', False):
 597             try:
 598                 self.to_screen(u'Deleting original file %s (pass -k to keep)' % filename)
 599                 os.remove(encodeFilename(filename))
 600             except (IOError, OSError):
 601                 self.report_warning(u'Unable to remove downloaded video file')