youtube_dl/YoutubeDL.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import, unicode_literals
   5
   6 import collections
   7 import errno
   8 import io
   9 import json
  10 import os
  11 import platform
  12 import re
  13 import shutil
  14 import subprocess
  15 import socket
  16 import sys
  17 import time
  18 import traceback
  19
  20 if os.name == 'nt':
  21     import ctypes
  22
  23 from .utils import (
  24     compat_cookiejar,
  25     compat_http_client,
  26     compat_str,
  27     compat_urllib_error,
  28     compat_urllib_request,
  29     ContentTooShortError,
  30     date_from_str,
  31     DateRange,
  32     determine_ext,
  33     DownloadError,
  34     encodeFilename,
  35     ExtractorError,
  36     format_bytes,
  37     formatSeconds,
  38     get_term_width,
  39     locked_file,
  40     make_HTTPS_handler,
  41     MaxDownloadsReached,
  42     PostProcessingError,
  43     platform_name,
  44     preferredencoding,
  45     SameFileError,
  46     sanitize_filename,
  47     subtitles_filename,
  48     takewhile_inclusive,
  49     UnavailableVideoError,
  50     url_basename,
  51     write_json_file,
  52     write_string,
  53     YoutubeDLHandler,
  54     prepend_extension,
  55 )
  56 from .extractor import get_info_extractor, gen_extractors
  57 from .downloader import get_suitable_downloader
  58 from .postprocessor import FFmpegMergerPP
  59 from .version import __version__
  60
  61
  62 class YoutubeDL(object):
  63     """YoutubeDL class.
  64
  65     YoutubeDL objects are the ones responsible for downloading the
  66     actual video file and writing it to disk if the user has requested
  67     it, among some other tasks. In most cases there should be one per
  68     program. As, given a video URL, the downloader doesn't know how to
  69     extract all the needed information, task that InfoExtractors do, it
  70     has to pass the URL to one of them.
  71
  72     For this, YoutubeDL objects have a method that allows
  73     InfoExtractors to be registered in a given order. When it is passed
  74     a URL, the YoutubeDL object handles it to the first InfoExtractor it
  75     finds that reports being able to handle it. The InfoExtractor extracts
  76     all the information about the video or videos the URL refers to, and
  77     YoutubeDL process the extracted information, possibly using a File
  78     Downloader to download the video.
  79
  80     YoutubeDL objects accept a lot of parameters. In order not to saturate
  81     the object constructor with arguments, it receives a dictionary of
  82     options instead. These options are available through the params
  83     attribute for the InfoExtractors to use. The YoutubeDL also
  84     registers itself as the downloader in charge for the InfoExtractors
  85     that are added to it, so this is a "mutual registration".
  86
  87     Available options:
  88
  89     username:          Username for authentication purposes.
  90     password:          Password for authentication purposes.
  91     videopassword:     Password for accessing a video.
  92     usenetrc:          Use netrc for authentication instead.
  93     verbose:           Print additional info to stdout.
  94     quiet:             Do not print messages to stdout.
  95     forceurl:          Force printing final URL.
  96     forcetitle:        Force printing title.
  97     forceid:           Force printing ID.
  98     forcethumbnail:    Force printing thumbnail URL.
  99     forcedescription:  Force printing description.
 100     forcefilename:     Force printing final filename.
 101     forceduration:     Force printing duration.
 102     forcejson:         Force printing info_dict as JSON.
 103     simulate:          Do not download the video files.
 104     format:            Video format code.
 105     format_limit:      Highest quality format to try.
 106     outtmpl:           Template for output names.
 107     restrictfilenames: Do not allow "&" and spaces in file names
 108     ignoreerrors:      Do not stop on download errors.
 109     nooverwrites:      Prevent overwriting files.
 110     playliststart:     Playlist item to start at.
 111     playlistend:       Playlist item to end at.
 112     matchtitle:        Download only matching titles.
 113     rejecttitle:       Reject downloads for matching titles.
 114     logger:            Log messages to a logging.Logger instance.
 115     logtostderr:       Log messages to stderr instead of stdout.
 116     writedescription:  Write the video description to a .description file
 117     writeinfojson:     Write the video description to a .info.json file
 118     writeannotations:  Write the video annotations to a .annotations.xml file
 119     writethumbnail:    Write the thumbnail image to a file
 120     writesubtitles:    Write the video subtitles to a file
 121     writeautomaticsub: Write the automatic subtitles to a file
 122     allsubtitles:      Downloads all the subtitles of the video
 123                        (requires writesubtitles or writeautomaticsub)
 124     listsubtitles:     Lists all available subtitles for the video
 125     subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt)
 126     subtitleslangs:    List of languages of the subtitles to download
 127     keepvideo:         Keep the video file after post-processing
 128     daterange:         A DateRange object, download only if the upload_date is in the range.
 129     skip_download:     Skip the actual download of the video file
 130     cachedir:          Location of the cache files in the filesystem.
 131                        None to disable filesystem cache.
 132     noplaylist:        Download single video instead of a playlist if in doubt.
 133     age_limit:         An integer representing the user's age in years.
 134                        Unsuitable videos for the given age are skipped.
 135     min_views:         An integer representing the minimum view count the video
 136                        must have in order to not be skipped.
 137                        Videos without view count information are always
 138                        downloaded. None for no limit.
 139     max_views:         An integer representing the maximum view count.
 140                        Videos that are more popular than that are not
 141                        downloaded.
 142                        Videos without view count information are always
 143                        downloaded. None for no limit.
 144     download_archive:  File name of a file where all downloads are recorded.
 145                        Videos already present in the file are not downloaded
 146                        again.
 147     cookiefile:        File name where cookies should be read from and dumped to.
 148     nocheckcertificate:Do not verify SSL certificates
 149     proxy:             URL of the proxy server to use
 150     socket_timeout:    Time to wait for unresponsive hosts, in seconds
 151     bidi_workaround:   Work around buggy terminals without bidirectional text
 152                        support, using fribidi
 153     debug_printtraffic:Print out sent and received HTTP traffic
 154
 155     The following parameters are not used by YoutubeDL itself, they are used by
 156     the FileDownloader:
 157     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
 158     noresizebuffer, retries, continuedl, noprogress, consoletitle
 159
 160     The following options are used by the post processors:
 161     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
 162                        otherwise prefer avconv.
 163     """
 164
 165     params = None
 166     _ies = []
 167     _pps = []
 168     _download_retcode = None
 169     _num_downloads = None
 170     _screen_file = None
 171
 172     def __init__(self, params=None):
 173         """Create a FileDownloader object with the given options."""
 174         if params is None:
 175             params = {}
 176         self._ies = []
 177         self._ies_instances = {}
 178         self._pps = []
 179         self._progress_hooks = []
 180         self._download_retcode = 0
 181         self._num_downloads = 0
 182         self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 183         self._err_file = sys.stderr
 184         self.params = params
 185
 186         if params.get('bidi_workaround', False):
 187             try:
 188                 import pty
 189                 master, slave = pty.openpty()
 190                 width = get_term_width()
 191                 if width is None:
 192                     width_args = []
 193                 else:
 194                     width_args = ['-w', str(width)]
 195                 sp_kwargs = dict(
 196                     stdin=subprocess.PIPE,
 197                     stdout=slave,
 198                     stderr=self._err_file)
 199                 try:
 200                     self._output_process = subprocess.Popen(
 201                         ['bidiv'] + width_args, **sp_kwargs
 202                     )
 203                 except OSError:
 204                     self._output_process = subprocess.Popen(
 205                         ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
 206                 self._output_channel = os.fdopen(master, 'rb')
 207             except OSError as ose:
 208                 if ose.errno == 2:
 209                     self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
 210                 else:
 211                     raise
 212
 213         if (sys.version_info >= (3,) and sys.platform != 'win32' and
 214                 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
 215                 and not params['restrictfilenames']):
 216             # On Python 3, the Unicode filesystem API will throw errors (#1474)
 217             self.report_warning(
 218                 'Assuming --restrict-filenames since file system encoding '
 219                 'cannot encode all charactes. '
 220                 'Set the LC_ALL environment variable to fix this.')
 221             self.params['restrictfilenames'] = True
 222
 223         if '%(stitle)s' in self.params.get('outtmpl', ''):
 224             self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
 225
 226         self._setup_opener()
 227
 228     def add_info_extractor(self, ie):
 229         """Add an InfoExtractor object to the end of the list."""
 230         self._ies.append(ie)
 231         self._ies_instances[ie.ie_key()] = ie
 232         ie.set_downloader(self)
 233
 234     def get_info_extractor(self, ie_key):
 235         """
 236         Get an instance of an IE with name ie_key, it will try to get one from
 237         the _ies list, if there's no instance it will create a new one and add
 238         it to the extractor list.
 239         """
 240         ie = self._ies_instances.get(ie_key)
 241         if ie is None:
 242             ie = get_info_extractor(ie_key)()
 243             self.add_info_extractor(ie)
 244         return ie
 245
 246     def add_default_info_extractors(self):
 247         """
 248         Add the InfoExtractors returned by gen_extractors to the end of the list
 249         """
 250         for ie in gen_extractors():
 251             self.add_info_extractor(ie)
 252
 253     def add_post_processor(self, pp):
 254         """Add a PostProcessor object to the end of the chain."""
 255         self._pps.append(pp)
 256         pp.set_downloader(self)
 257
 258     def add_progress_hook(self, ph):
 259         """Add the progress hook (currently only for the file downloader)"""
 260         self._progress_hooks.append(ph)
 261
 262     def _bidi_workaround(self, message):
 263         if not hasattr(self, '_output_channel'):
 264             return message
 265
 266         assert hasattr(self, '_output_process')
 267         assert type(message) == type('')
 268         line_count = message.count('\n') + 1
 269         self._output_process.stdin.write((message + '\n').encode('utf-8'))
 270         self._output_process.stdin.flush()
 271         res = ''.join(self._output_channel.readline().decode('utf-8')
 272                        for _ in range(line_count))
 273         return res[:-len('\n')]
 274
 275     def to_screen(self, message, skip_eol=False):
 276         """Print message to stdout if not in quiet mode."""
 277         return self.to_stdout(message, skip_eol, check_quiet=True)
 278
 279     def to_stdout(self, message, skip_eol=False, check_quiet=False):
 280         """Print message to stdout if not in quiet mode."""
 281         if self.params.get('logger'):
 282             self.params['logger'].debug(message)
 283         elif not check_quiet or not self.params.get('quiet', False):
 284             message = self._bidi_workaround(message)
 285             terminator = ['\n', ''][skip_eol]
 286             output = message + terminator
 287
 288             write_string(output, self._screen_file)
 289
 290     def to_stderr(self, message):
 291         """Print message to stderr."""
 292         assert type(message) == type('')
 293         if self.params.get('logger'):
 294             self.params['logger'].error(message)
 295         else:
 296             message = self._bidi_workaround(message)
 297             output = message + '\n'
 298             write_string(output, self._err_file)
 299
 300     def to_console_title(self, message):
 301         if not self.params.get('consoletitle', False):
 302             return
 303         if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 304             # c_wchar_p() might not be necessary if `message` is
 305             # already of type unicode()
 306             ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 307         elif 'TERM' in os.environ:
 308             write_string('\033]0;%s\007' % message, self._screen_file)
 309
 310     def save_console_title(self):
 311         if not self.params.get('consoletitle', False):
 312             return
 313         if 'TERM' in os.environ:
 314             # Save the title on stack
 315             write_string('\033[22;0t', self._screen_file)
 316
 317     def restore_console_title(self):
 318         if not self.params.get('consoletitle', False):
 319             return
 320         if 'TERM' in os.environ:
 321             # Restore the title from stack
 322             write_string('\033[23;0t', self._screen_file)
 323
 324     def __enter__(self):
 325         self.save_console_title()
 326         return self
 327
 328     def __exit__(self, *args):
 329         self.restore_console_title()
 330
 331         if self.params.get('cookiefile') is not None:
 332             self.cookiejar.save()
 333
 334     def trouble(self, message=None, tb=None):
 335         """Determine action to take when a download problem appears.
 336
 337         Depending on if the downloader has been configured to ignore
 338         download errors or not, this method may throw an exception or
 339         not when errors are found, after printing the message.
 340
 341         tb, if given, is additional traceback information.
 342         """
 343         if message is not None:
 344             self.to_stderr(message)
 345         if self.params.get('verbose'):
 346             if tb is None:
 347                 if sys.exc_info()[0]:  # if .trouble has been called from an except block
 348                     tb = ''
 349                     if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 350                         tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
 351                     tb += compat_str(traceback.format_exc())
 352                 else:
 353                     tb_data = traceback.format_list(traceback.extract_stack())
 354                     tb = ''.join(tb_data)
 355             self.to_stderr(tb)
 356         if not self.params.get('ignoreerrors', False):
 357             if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 358                 exc_info = sys.exc_info()[1].exc_info
 359             else:
 360                 exc_info = sys.exc_info()
 361             raise DownloadError(message, exc_info)
 362         self._download_retcode = 1
 363
 364     def report_warning(self, message):
 365         '''
 366         Print the message to stderr, it will be prefixed with 'WARNING:'
 367         If stderr is a tty file the 'WARNING:' will be colored
 368         '''
 369         if self._err_file.isatty() and os.name != 'nt':
 370             _msg_header = '\033[0;33mWARNING:\033[0m'
 371         else:
 372             _msg_header = 'WARNING:'
 373         warning_message = '%s %s' % (_msg_header, message)
 374         self.to_stderr(warning_message)
 375
 376     def report_error(self, message, tb=None):
 377         '''
 378         Do the same as trouble, but prefixes the message with 'ERROR:', colored
 379         in red if stderr is a tty file.
 380         '''
 381         if self._err_file.isatty() and os.name != 'nt':
 382             _msg_header = '\033[0;31mERROR:\033[0m'
 383         else:
 384             _msg_header = 'ERROR:'
 385         error_message = '%s %s' % (_msg_header, message)
 386         self.trouble(error_message, tb)
 387
 388     def report_file_already_downloaded(self, file_name):
 389         """Report file has already been fully downloaded."""
 390         try:
 391             self.to_screen('[download] %s has already been downloaded' % file_name)
 392         except UnicodeEncodeError:
 393             self.to_screen('[download] The file has already been downloaded')
 394
 395     def increment_downloads(self):
 396         """Increment the ordinal that assigns a number to each file."""
 397         self._num_downloads += 1
 398
 399     def prepare_filename(self, info_dict):
 400         """Generate the output filename."""
 401         try:
 402             template_dict = dict(info_dict)
 403
 404             template_dict['epoch'] = int(time.time())
 405             autonumber_size = self.params.get('autonumber_size')
 406             if autonumber_size is None:
 407                 autonumber_size = 5
 408             autonumber_templ = '%0' + str(autonumber_size) + 'd'
 409             template_dict['autonumber'] = autonumber_templ % self._num_downloads
 410             if template_dict.get('playlist_index') is not None:
 411                 template_dict['playlist_index'] = '%05d' % template_dict['playlist_index']
 412
 413             sanitize = lambda k, v: sanitize_filename(
 414                 compat_str(v),
 415                 restricted=self.params.get('restrictfilenames'),
 416                 is_id=(k == 'id'))
 417             template_dict = dict((k, sanitize(k, v))
 418                                  for k, v in template_dict.items()
 419                                  if v is not None)
 420             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
 421
 422             tmpl = os.path.expanduser(self.params['outtmpl'])
 423             filename = tmpl % template_dict
 424             return filename
 425         except ValueError as err:
 426             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
 427             return None
 428
 429     def _match_entry(self, info_dict):
 430         """ Returns None iff the file should be downloaded """
 431
 432         video_title = info_dict.get('title', info_dict.get('id', 'video'))
 433         if 'title' in info_dict:
 434             # This can happen when we're just evaluating the playlist
 435             title = info_dict['title']
 436             matchtitle = self.params.get('matchtitle', False)
 437             if matchtitle:
 438                 if not re.search(matchtitle, title, re.IGNORECASE):
 439                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
 440             rejecttitle = self.params.get('rejecttitle', False)
 441             if rejecttitle:
 442                 if re.search(rejecttitle, title, re.IGNORECASE):
 443                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
 444         date = info_dict.get('upload_date', None)
 445         if date is not None:
 446             dateRange = self.params.get('daterange', DateRange())
 447             if date not in dateRange:
 448                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
 449         view_count = info_dict.get('view_count', None)
 450         if view_count is not None:
 451             min_views = self.params.get('min_views')
 452             if min_views is not None and view_count < min_views:
 453                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
 454             max_views = self.params.get('max_views')
 455             if max_views is not None and view_count > max_views:
 456                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
 457         age_limit = self.params.get('age_limit')
 458         if age_limit is not None:
 459             if age_limit < info_dict.get('age_limit', 0):
 460                 return 'Skipping "' + title + '" because it is age restricted'
 461         if self.in_download_archive(info_dict):
 462             return '%s has already been recorded in archive' % video_title
 463         return None
 464
 465     @staticmethod
 466     def add_extra_info(info_dict, extra_info):
 467         '''Set the keys from extra_info in info dict if they are missing'''
 468         for key, value in extra_info.items():
 469             info_dict.setdefault(key, value)
 470
 471     def extract_info(self, url, download=True, ie_key=None, extra_info={},
 472                      process=True):
 473         '''
 474         Returns a list with a dictionary for each video we find.
 475         If 'download', also downloads the videos.
 476         extra_info is a dict containing the extra values to add to each result
 477          '''
 478
 479         if ie_key:
 480             ies = [self.get_info_extractor(ie_key)]
 481         else:
 482             ies = self._ies
 483
 484         for ie in ies:
 485             if not ie.suitable(url):
 486                 continue
 487
 488             if not ie.working():
 489                 self.report_warning('The program functionality for this site has been marked as broken, '
 490                                     'and will probably not work.')
 491
 492             try:
 493                 ie_result = ie.extract(url)
 494                 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
 495                     break
 496                 if isinstance(ie_result, list):
 497                     # Backwards compatibility: old IE result format
 498                     ie_result = {
 499                         '_type': 'compat_list',
 500                         'entries': ie_result,
 501                     }
 502                 self.add_extra_info(ie_result,
 503                     {
 504                         'extractor': ie.IE_NAME,
 505                         'webpage_url': url,
 506                         'webpage_url_basename': url_basename(url),
 507                         'extractor_key': ie.ie_key(),
 508                     })
 509                 if process:
 510                     return self.process_ie_result(ie_result, download, extra_info)
 511                 else:
 512                     return ie_result
 513             except ExtractorError as de: # An error we somewhat expected
 514                 self.report_error(compat_str(de), de.format_traceback())
 515                 break
 516             except Exception as e:
 517                 if self.params.get('ignoreerrors', False):
 518                     self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
 519                     break
 520                 else:
 521                     raise
 522         else:
 523             self.report_error('no suitable InfoExtractor: %s' % url)
 524
 525     def process_ie_result(self, ie_result, download=True, extra_info={}):
 526         """
 527         Take the result of the ie(may be modified) and resolve all unresolved
 528         references (URLs, playlist items).
 529
 530         It will also download the videos if 'download'.
 531         Returns the resolved ie_result.
 532         """
 533
 534         result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
 535         if result_type == 'video':
 536             self.add_extra_info(ie_result, extra_info)
 537             return self.process_video_result(ie_result, download=download)
 538         elif result_type == 'url':
 539             # We have to add extra_info to the results because it may be
 540             # contained in a playlist
 541             return self.extract_info(ie_result['url'],
 542                                      download,
 543                                      ie_key=ie_result.get('ie_key'),
 544                                      extra_info=extra_info)
 545         elif result_type == 'url_transparent':
 546             # Use the information from the embedding page
 547             info = self.extract_info(
 548                 ie_result['url'], ie_key=ie_result.get('ie_key'),
 549                 extra_info=extra_info, download=False, process=False)
 550
 551             def make_result(embedded_info):
 552                 new_result = ie_result.copy()
 553                 for f in ('_type', 'url', 'ext', 'player_url', 'formats',
 554                           'entries', 'ie_key', 'duration',
 555                           'subtitles', 'annotations', 'format',
 556                           'thumbnail', 'thumbnails'):
 557                     if f in new_result:
 558                         del new_result[f]
 559                     if f in embedded_info:
 560                         new_result[f] = embedded_info[f]
 561                 return new_result
 562             new_result = make_result(info)
 563
 564             assert new_result.get('_type') != 'url_transparent'
 565             if new_result.get('_type') == 'compat_list':
 566                 new_result['entries'] = [
 567                     make_result(e) for e in new_result['entries']]
 568
 569             return self.process_ie_result(
 570                 new_result, download=download, extra_info=extra_info)
 571         elif result_type == 'playlist':
 572             # We process each entry in the playlist
 573             playlist = ie_result.get('title', None) or ie_result.get('id', None)
 574             self.to_screen('[download] Downloading playlist: %s' % playlist)
 575
 576             playlist_results = []
 577
 578             n_all_entries = len(ie_result['entries'])
 579             playliststart = self.params.get('playliststart', 1) - 1
 580             playlistend = self.params.get('playlistend', None)
 581             # For backwards compatibility, interpret -1 as whole list
 582             if playlistend == -1:
 583                 playlistend = None
 584
 585             entries = ie_result['entries'][playliststart:playlistend]
 586             n_entries = len(entries)
 587
 588             self.to_screen(
 589                 "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
 590                 (ie_result['extractor'], playlist, n_all_entries, n_entries))
 591
 592             for i, entry in enumerate(entries, 1):
 593                 self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
 594                 extra = {
 595                     'playlist': playlist,
 596                     'playlist_index': i + playliststart,
 597                     'extractor': ie_result['extractor'],
 598                     'webpage_url': ie_result['webpage_url'],
 599                     'webpage_url_basename': url_basename(ie_result['webpage_url']),
 600                     'extractor_key': ie_result['extractor_key'],
 601                 }
 602
 603                 reason = self._match_entry(entry)
 604                 if reason is not None:
 605                     self.to_screen('[download] ' + reason)
 606                     continue
 607
 608                 entry_result = self.process_ie_result(entry,
 609                                                       download=download,
 610                                                       extra_info=extra)
 611                 playlist_results.append(entry_result)
 612             ie_result['entries'] = playlist_results
 613             return ie_result
 614         elif result_type == 'compat_list':
 615             def _fixup(r):
 616                 self.add_extra_info(r,
 617                     {
 618                         'extractor': ie_result['extractor'],
 619                         'webpage_url': ie_result['webpage_url'],
 620                         'webpage_url_basename': url_basename(ie_result['webpage_url']),
 621                         'extractor_key': ie_result['extractor_key'],
 622                     })
 623                 return r
 624             ie_result['entries'] = [
 625                 self.process_ie_result(_fixup(r), download, extra_info)
 626                 for r in ie_result['entries']
 627             ]
 628             return ie_result
 629         else:
 630             raise Exception('Invalid result type: %s' % result_type)
 631
 632     def select_format(self, format_spec, available_formats):
 633         if format_spec == 'best' or format_spec is None:
 634             return available_formats[-1]
 635         elif format_spec == 'worst':
 636             return available_formats[0]
 637         else:
 638             extensions = ['mp4', 'flv', 'webm', '3gp']
 639             if format_spec in extensions:
 640                 filter_f = lambda f: f['ext'] == format_spec
 641             else:
 642                 filter_f = lambda f: f['format_id'] == format_spec
 643             matches = list(filter(filter_f, available_formats))
 644             if matches:
 645                 return matches[-1]
 646         return None
 647
 648     def process_video_result(self, info_dict, download=True):
 649         assert info_dict.get('_type', 'video') == 'video'
 650
 651         if 'playlist' not in info_dict:
 652             # It isn't part of a playlist
 653             info_dict['playlist'] = None
 654             info_dict['playlist_index'] = None
 655
 656         # This extractors handle format selection themselves
 657         if info_dict['extractor'] in ['Youku']:
 658             if download:
 659                 self.process_info(info_dict)
 660             return info_dict
 661
 662         # We now pick which formats have to be downloaded
 663         if info_dict.get('formats') is None:
 664             # There's only one format available
 665             formats = [info_dict]
 666         else:
 667             formats = info_dict['formats']
 668
 669         # We check that all the formats have the format and format_id fields
 670         for (i, format) in enumerate(formats):
 671             if format.get('format_id') is None:
 672                 format['format_id'] = compat_str(i)
 673             if format.get('format') is None:
 674                 format['format'] = '{id} - {res}{note}'.format(
 675                     id=format['format_id'],
 676                     res=self.format_resolution(format),
 677                     note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
 678                 )
 679             # Automatically determine file extension if missing
 680             if 'ext' not in format:
 681                 format['ext'] = determine_ext(format['url'])
 682
 683         format_limit = self.params.get('format_limit', None)
 684         if format_limit:
 685             formats = list(takewhile_inclusive(
 686                 lambda f: f['format_id'] != format_limit, formats
 687             ))
 688
 689         # TODO Central sorting goes here
 690
 691         if formats[0] is not info_dict:
 692             # only set the 'formats' fields if the original info_dict list them
 693             # otherwise we end up with a circular reference, the first (and unique)
 694             # element in the 'formats' field in info_dict is info_dict itself,
 695             # wich can't be exported to json
 696             info_dict['formats'] = formats
 697         if self.params.get('listformats', None):
 698             self.list_formats(info_dict)
 699             return
 700
 701         req_format = self.params.get('format', 'best')
 702         if req_format is None:
 703             req_format = 'best'
 704         formats_to_download = []
 705         # The -1 is for supporting YoutubeIE
 706         if req_format in ('-1', 'all'):
 707             formats_to_download = formats
 708         else:
 709             # We can accept formats requested in the format: 34/5/best, we pick
 710             # the first that is available, starting from left
 711             req_formats = req_format.split('/')
 712             for rf in req_formats:
 713                 if re.match(r'.+?\+.+?', rf) is not None:
 714                     # Two formats have been requested like '137+139'
 715                     format_1, format_2 = rf.split('+')
 716                     formats_info = (self.select_format(format_1, formats),
 717                         self.select_format(format_2, formats))
 718                     if all(formats_info):
 719                         selected_format = {
 720                             'requested_formats': formats_info,
 721                             'format': rf,
 722                             'ext': formats_info[0]['ext'],
 723                         }
 724                     else:
 725                         selected_format = None
 726                 else:
 727                     selected_format = self.select_format(rf, formats)
 728                 if selected_format is not None:
 729                     formats_to_download = [selected_format]
 730                     break
 731         if not formats_to_download:
 732             raise ExtractorError('requested format not available',
 733                                  expected=True)
 734
 735         if download:
 736             if len(formats_to_download) > 1:
 737                 self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
 738             for format in formats_to_download:
 739                 new_info = dict(info_dict)
 740                 new_info.update(format)
 741                 self.process_info(new_info)
 742         # We update the info dict with the best quality format (backwards compatibility)
 743         info_dict.update(formats_to_download[-1])
 744         return info_dict
 745
 746     def process_info(self, info_dict):
 747         """Process a single resolved IE result."""
 748
 749         assert info_dict.get('_type', 'video') == 'video'
 750         #We increment the download the download count here to match the previous behaviour.
 751         self.increment_downloads()
 752
 753         info_dict['fulltitle'] = info_dict['title']
 754         if len(info_dict['title']) > 200:
 755             info_dict['title'] = info_dict['title'][:197] + '...'
 756
 757         # Keep for backwards compatibility
 758         info_dict['stitle'] = info_dict['title']
 759
 760         if not 'format' in info_dict:
 761             info_dict['format'] = info_dict['ext']
 762
 763         reason = self._match_entry(info_dict)
 764         if reason is not None:
 765             self.to_screen('[download] ' + reason)
 766             return
 767
 768         max_downloads = self.params.get('max_downloads')
 769         if max_downloads is not None:
 770             if self._num_downloads > int(max_downloads):
 771                 raise MaxDownloadsReached()
 772
 773         filename = self.prepare_filename(info_dict)
 774
 775         # Forced printings
 776         if self.params.get('forcetitle', False):
 777             self.to_stdout(info_dict['fulltitle'])
 778         if self.params.get('forceid', False):
 779             self.to_stdout(info_dict['id'])
 780         if self.params.get('forceurl', False):
 781             # For RTMP URLs, also include the playpath
 782             self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
 783         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
 784             self.to_stdout(info_dict['thumbnail'])
 785         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
 786             self.to_stdout(info_dict['description'])
 787         if self.params.get('forcefilename', False) and filename is not None:
 788             self.to_stdout(filename)
 789         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
 790             self.to_stdout(formatSeconds(info_dict['duration']))
 791         if self.params.get('forceformat', False):
 792             self.to_stdout(info_dict['format'])
 793         if self.params.get('forcejson', False):
 794             info_dict['_filename'] = filename
 795             self.to_stdout(json.dumps(info_dict))
 796
 797         # Do nothing else if in simulate mode
 798         if self.params.get('simulate', False):
 799             return
 800
 801         if filename is None:
 802             return
 803
 804         try:
 805             dn = os.path.dirname(encodeFilename(filename))
 806             if dn != '' and not os.path.exists(dn):
 807                 os.makedirs(dn)
 808         except (OSError, IOError) as err:
 809             self.report_error('unable to create directory ' + compat_str(err))
 810             return
 811
 812         if self.params.get('writedescription', False):
 813             descfn = filename + '.description'
 814             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
 815                 self.to_screen('[info] Video description is already present')
 816             else:
 817                 try:
 818                     self.to_screen('[info] Writing video description to: ' + descfn)
 819                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
 820                         descfile.write(info_dict['description'])
 821                 except (KeyError, TypeError):
 822                     self.report_warning('There\'s no description to write.')
 823                 except (OSError, IOError):
 824                     self.report_error('Cannot write description file ' + descfn)
 825                     return
 826
 827         if self.params.get('writeannotations', False):
 828             annofn = filename + '.annotations.xml'
 829             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
 830                 self.to_screen('[info] Video annotations are already present')
 831             else:
 832                 try:
 833                     self.to_screen('[info] Writing video annotations to: ' + annofn)
 834                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
 835                         annofile.write(info_dict['annotations'])
 836                 except (KeyError, TypeError):
 837                     self.report_warning('There are no annotations to write.')
 838                 except (OSError, IOError):
 839                     self.report_error('Cannot write annotations file: ' + annofn)
 840                     return
 841
 842         subtitles_are_requested = any([self.params.get('writesubtitles', False),
 843                                        self.params.get('writeautomaticsub')])
 844
 845         if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
 846             # subtitles download errors are already managed as troubles in relevant IE
 847             # that way it will silently go on when used with unsupporting IE
 848             subtitles = info_dict['subtitles']
 849             sub_format = self.params.get('subtitlesformat', 'srt')
 850             for sub_lang in subtitles.keys():
 851                 sub = subtitles[sub_lang]
 852                 if sub is None:
 853                     continue
 854                 try:
 855                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
 856                     if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
 857                         self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
 858                     else:
 859                         self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
 860                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
 861                                 subfile.write(sub)
 862                 except (OSError, IOError):
 863                     self.report_error('Cannot write subtitles file ' + descfn)
 864                     return
 865
 866         if self.params.get('writeinfojson', False):
 867             infofn = os.path.splitext(filename)[0] + '.info.json'
 868             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
 869                 self.to_screen('[info] Video description metadata is already present')
 870             else:
 871                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
 872                 try:
 873                     write_json_file(info_dict, encodeFilename(infofn))
 874                 except (OSError, IOError):
 875                     self.report_error('Cannot write metadata to JSON file ' + infofn)
 876                     return
 877
 878         if self.params.get('writethumbnail', False):
 879             if info_dict.get('thumbnail') is not None:
 880                 thumb_format = determine_ext(info_dict['thumbnail'], 'jpg')
 881                 thumb_filename = os.path.splitext(filename)[0] + '.' + thumb_format
 882                 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
 883                     self.to_screen('[%s] %s: Thumbnail is already present' %
 884                                    (info_dict['extractor'], info_dict['id']))
 885                 else:
 886                     self.to_screen('[%s] %s: Downloading thumbnail ...' %
 887                                    (info_dict['extractor'], info_dict['id']))
 888                     try:
 889                         uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
 890                         with open(thumb_filename, 'wb') as thumbf:
 891                             shutil.copyfileobj(uf, thumbf)
 892                         self.to_screen('[%s] %s: Writing thumbnail to: %s' %
 893                             (info_dict['extractor'], info_dict['id'], thumb_filename))
 894                     except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 895                         self.report_warning('Unable to download thumbnail "%s": %s' %
 896                             (info_dict['thumbnail'], compat_str(err)))
 897
 898         if not self.params.get('skip_download', False):
 899             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
 900                 success = True
 901             else:
 902                 try:
 903                     def dl(name, info):
 904                         fd = get_suitable_downloader(info)(self, self.params)
 905                         for ph in self._progress_hooks:
 906                             fd.add_progress_hook(ph)
 907                         return fd.download(name, info)
 908                     if info_dict.get('requested_formats') is not None:
 909                         downloaded = []
 910                         success = True
 911                         merger = FFmpegMergerPP(self)
 912                         if not merger._get_executable():
 913                             postprocessors = []
 914                             self.report_warning('You have requested multiple '
 915                                 'formats but ffmpeg or avconv are not installed.'
 916                                 ' The formats won\'t be merged')
 917                         else:
 918                             postprocessors = [merger]
 919                         for f in info_dict['requested_formats']:
 920                             new_info = dict(info_dict)
 921                             new_info.update(f)
 922                             fname = self.prepare_filename(new_info)
 923                             fname = prepend_extension(fname, 'f%s' % f['format_id'])
 924                             downloaded.append(fname)
 925                             partial_success = dl(fname, new_info)
 926                             success = success and partial_success
 927                         info_dict['__postprocessors'] = postprocessors
 928                         info_dict['__files_to_merge'] = downloaded
 929                     else:
 930                         # Just a single file
 931                         success = dl(filename, info_dict)
 932                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 933                     self.report_error('unable to download video data: %s' % str(err))
 934                     return
 935                 except (OSError, IOError) as err:
 936                     raise UnavailableVideoError(err)
 937                 except (ContentTooShortError, ) as err:
 938                     self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 939                     return
 940
 941             if success:
 942                 try:
 943                     self.post_process(filename, info_dict)
 944                 except (PostProcessingError) as err:
 945                     self.report_error('postprocessing: %s' % str(err))
 946                     return
 947
 948         self.record_download_archive(info_dict)
 949
 950     def download(self, url_list):
 951         """Download a given list of URLs."""
 952         if (len(url_list) > 1 and
 953                 '%' not in self.params['outtmpl']
 954                 and self.params.get('max_downloads') != 1):
 955             raise SameFileError(self.params['outtmpl'])
 956
 957         for url in url_list:
 958             try:
 959                 #It also downloads the videos
 960                 self.extract_info(url)
 961             except UnavailableVideoError:
 962                 self.report_error('unable to download video')
 963             except MaxDownloadsReached:
 964                 self.to_screen('[info] Maximum number of downloaded files reached.')
 965                 raise
 966
 967         return self._download_retcode
 968
 969     def download_with_info_file(self, info_filename):
 970         with io.open(info_filename, 'r', encoding='utf-8') as f:
 971             info = json.load(f)
 972         try:
 973             self.process_ie_result(info, download=True)
 974         except DownloadError:
 975             webpage_url = info.get('webpage_url')
 976             if webpage_url is not None:
 977                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
 978                 return self.download([webpage_url])
 979             else:
 980                 raise
 981         return self._download_retcode
 982
 983     def post_process(self, filename, ie_info):
 984         """Run all the postprocessors on the given file."""
 985         info = dict(ie_info)
 986         info['filepath'] = filename
 987         keep_video = None
 988         pps_chain = []
 989         if ie_info.get('__postprocessors') is not None:
 990             pps_chain.extend(ie_info['__postprocessors'])
 991         pps_chain.extend(self._pps)
 992         for pp in pps_chain:
 993             try:
 994                 keep_video_wish, new_info = pp.run(info)
 995                 if keep_video_wish is not None:
 996                     if keep_video_wish:
 997                         keep_video = keep_video_wish
 998                     elif keep_video is None:
 999                         # No clear decision yet, let IE decide
1000                         keep_video = keep_video_wish
1001             except PostProcessingError as e:
1002                 self.report_error(e.msg)
1003         if keep_video is False and not self.params.get('keepvideo', False):
1004             try:
1005                 self.to_screen('Deleting original file %s (pass -k to keep)' % filename)
1006                 os.remove(encodeFilename(filename))
1007             except (IOError, OSError):
1008                 self.report_warning('Unable to remove downloaded video file')
1009
1010     def _make_archive_id(self, info_dict):
1011         # Future-proof against any change in case
1012         # and backwards compatibility with prior versions
1013         extractor = info_dict.get('extractor_key')
1014         if extractor is None:
1015             if 'id' in info_dict:
1016                 extractor = info_dict.get('ie_key')  # key in a playlist
1017         if extractor is None:
1018             return None  # Incomplete video information
1019         return extractor.lower() + ' ' + info_dict['id']
1020
1021     def in_download_archive(self, info_dict):
1022         fn = self.params.get('download_archive')
1023         if fn is None:
1024             return False
1025
1026         vid_id = self._make_archive_id(info_dict)
1027         if vid_id is None:
1028             return False  # Incomplete video information
1029
1030         try:
1031             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1032                 for line in archive_file:
1033                     if line.strip() == vid_id:
1034                         return True
1035         except IOError as ioe:
1036             if ioe.errno != errno.ENOENT:
1037                 raise
1038         return False
1039
1040     def record_download_archive(self, info_dict):
1041         fn = self.params.get('download_archive')
1042         if fn is None:
1043             return
1044         vid_id = self._make_archive_id(info_dict)
1045         assert vid_id
1046         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1047             archive_file.write(vid_id + '\n')
1048
1049     @staticmethod
1050     def format_resolution(format, default='unknown'):
1051         if format.get('vcodec') == 'none':
1052             return 'audio only'
1053         if format.get('resolution') is not None:
1054             return format['resolution']
1055         if format.get('height') is not None:
1056             if format.get('width') is not None:
1057                 res = '%sx%s' % (format['width'], format['height'])
1058             else:
1059                 res = '%sp' % format['height']
1060         elif format.get('width') is not None:
1061             res = '?x%d' % format['width']
1062         else:
1063             res = default
1064         return res
1065
1066     def list_formats(self, info_dict):
1067         def format_note(fdict):
1068             res = ''
1069             if fdict.get('ext') in ['f4f', 'f4m']:
1070                 res += '(unsupported) '
1071             if fdict.get('format_note') is not None:
1072                 res += fdict['format_note'] + ' '
1073             if fdict.get('tbr') is not None:
1074                 res += '%4dk ' % fdict['tbr']
1075             if (fdict.get('vcodec') is not None and
1076                     fdict.get('vcodec') != 'none'):
1077                 res += '%-5s' % fdict['vcodec']
1078                 if fdict.get('vbr') is not None:
1079                     res += '@'
1080             elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
1081                 res += 'video@'
1082             if fdict.get('vbr') is not None:
1083                 res += '%4dk' % fdict['vbr']
1084             if fdict.get('acodec') is not None:
1085                 if res:
1086                     res += ', '
1087                 res += '%-5s' % fdict['acodec']
1088             elif fdict.get('abr') is not None:
1089                 if res:
1090                     res += ', '
1091                 res += 'audio'
1092             if fdict.get('abr') is not None:
1093                 res += '@%3dk' % fdict['abr']
1094             if fdict.get('asr') is not None:
1095                 res += ' (%5dHz)' % fdict['asr']
1096             if fdict.get('filesize') is not None:
1097                 if res:
1098                     res += ', '
1099                 res += format_bytes(fdict['filesize'])
1100             return res
1101
1102         def line(format, idlen=20):
1103             return (('%-' + compat_str(idlen + 1) + 's%-10s%-12s%s') % (
1104                 format['format_id'],
1105                 format['ext'],
1106                 self.format_resolution(format),
1107                 format_note(format),
1108             ))
1109
1110         formats = info_dict.get('formats', [info_dict])
1111         idlen = max(len('format code'),
1112                     max(len(f['format_id']) for f in formats))
1113         formats_s = [line(f, idlen) for f in formats]
1114         if len(formats) > 1:
1115             formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)'
1116             formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'
1117
1118         header_line = line({
1119             'format_id': 'format code', 'ext': 'extension',
1120             'resolution': 'resolution', 'format_note': 'note'}, idlen=idlen)
1121         self.to_screen('[info] Available formats for %s:\n%s\n%s' %
1122                        (info_dict['id'], header_line, '\n'.join(formats_s)))
1123
1124     def urlopen(self, req):
1125         """ Start an HTTP download """
1126         return self._opener.open(req)
1127
1128     def print_debug_header(self):
1129         if not self.params.get('verbose'):
1130             return
1131         write_string('[debug] youtube-dl version ' + __version__ + '\n')
1132         try:
1133             sp = subprocess.Popen(
1134                 ['git', 'rev-parse', '--short', 'HEAD'],
1135                 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
1136                 cwd=os.path.dirname(os.path.abspath(__file__)))
1137             out, err = sp.communicate()
1138             out = out.decode().strip()
1139             if re.match('[0-9a-f]+', out):
1140                 write_string('[debug] Git HEAD: ' + out + '\n')
1141         except:
1142             try:
1143                 sys.exc_clear()
1144             except:
1145                 pass
1146         write_string('[debug] Python version %s - %s' %
1147                      (platform.python_version(), platform_name()) + '\n')
1148
1149         proxy_map = {}
1150         for handler in self._opener.handlers:
1151             if hasattr(handler, 'proxies'):
1152                 proxy_map.update(handler.proxies)
1153         write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
1154
    def _setup_opener(self):
        """Build the URL opener from the user parameters and install it.

        Reads the 'socket_timeout', 'cookiefile', 'proxy',
        'debug_printtraffic' and 'nocheckcertificate' parameters, stores the
        cookie jar in self.cookiejar and the opener in self._opener.

        Note that this also installs the opener globally and changes the
        default socket timeout of the whole process.
        """
        timeout_val = self.params.get('socket_timeout')
        # Fall back to 600 when no timeout was configured
        timeout = 600 if timeout_val is None else float(timeout_val)

        opts_cookiefile = self.params.get('cookiefile')
        opts_proxy = self.params.get('proxy')

        if opts_cookiefile is None:
            self.cookiejar = compat_cookiejar.CookieJar()
        else:
            self.cookiejar = compat_cookiejar.MozillaCookieJar(
                opts_cookiefile)
            # Only load the cookie file when it is readable
            if os.access(opts_cookiefile, os.R_OK):
                self.cookiejar.load()

        cookie_processor = compat_urllib_request.HTTPCookieProcessor(
            self.cookiejar)
        if opts_proxy is not None:
            if opts_proxy == '':
                # An explicitly empty proxy option results in no proxies
                proxies = {}
            else:
                # Use the given proxy for both schemes
                proxies = {'http': opts_proxy, 'https': opts_proxy}
        else:
            proxies = compat_urllib_request.getproxies()
            # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
            if 'http' in proxies and 'https' not in proxies:
                proxies['https'] = proxies['http']
        proxy_handler = compat_urllib_request.ProxyHandler(proxies)

        # The same debug level is passed to both HTTP(S) handlers
        debuglevel = 1 if self.params.get('debug_printtraffic') else 0
        https_handler = make_HTTPS_handler(
            self.params.get('nocheckcertificate', False), debuglevel=debuglevel)
        ydlh = YoutubeDLHandler(debuglevel=debuglevel)
        opener = compat_urllib_request.build_opener(
            https_handler, proxy_handler, cookie_processor, ydlh)
        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        self._opener = opener

        # TODO remove this global modification
        compat_urllib_request.install_opener(opener)
        socket.setdefaulttimeout(timeout)