]> jfr.im git - yt-dlp.git/blob - youtube_dl/YoutubeDL.py
[servingsys] Add support
[yt-dlp.git] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import errno
8 import io
9 import json
10 import os
11 import platform
12 import re
13 import shutil
14 import subprocess
15 import socket
16 import sys
17 import time
18 import traceback
19
20 if os.name == 'nt':
21 import ctypes
22
23 from .utils import (
24 compat_cookiejar,
25 compat_http_client,
26 compat_str,
27 compat_urllib_error,
28 compat_urllib_request,
29 ContentTooShortError,
30 date_from_str,
31 DateRange,
32 determine_ext,
33 DownloadError,
34 encodeFilename,
35 ExtractorError,
36 format_bytes,
37 formatSeconds,
38 get_term_width,
39 locked_file,
40 make_HTTPS_handler,
41 MaxDownloadsReached,
42 PostProcessingError,
43 platform_name,
44 preferredencoding,
45 SameFileError,
46 sanitize_filename,
47 subtitles_filename,
48 takewhile_inclusive,
49 UnavailableVideoError,
50 url_basename,
51 write_json_file,
52 write_string,
53 YoutubeDLHandler,
54 prepend_extension,
55 )
56 from .extractor import get_info_extractor, gen_extractors
57 from .downloader import get_suitable_downloader
58 from .postprocessor import FFmpegMergerPP
59 from .version import __version__
60
61
62 class YoutubeDL(object):
63 """YoutubeDL class.
64
65 YoutubeDL objects are the ones responsible of downloading the
66 actual video file and writing it to disk if the user has requested
67 it, among some other tasks. In most cases there should be one per
68 program. As, given a video URL, the downloader doesn't know how to
69 extract all the needed information, task that InfoExtractors do, it
70 has to pass the URL to one of them.
71
72 For this, YoutubeDL objects have a method that allows
73 InfoExtractors to be registered in a given order. When it is passed
74 a URL, the YoutubeDL object handles it to the first InfoExtractor it
75 finds that reports being able to handle it. The InfoExtractor extracts
76 all the information about the video or videos the URL refers to, and
77 YoutubeDL process the extracted information, possibly using a File
78 Downloader to download the video.
79
80 YoutubeDL objects accept a lot of parameters. In order not to saturate
81 the object constructor with arguments, it receives a dictionary of
82 options instead. These options are available through the params
83 attribute for the InfoExtractors to use. The YoutubeDL also
84 registers itself as the downloader in charge for the InfoExtractors
85 that are added to it, so this is a "mutual registration".
86
87 Available options:
88
89 username: Username for authentication purposes.
90 password: Password for authentication purposes.
91     videopassword:     Password for accessing a video.
92 usenetrc: Use netrc for authentication instead.
93 verbose: Print additional info to stdout.
94 quiet: Do not print messages to stdout.
95 forceurl: Force printing final URL.
96 forcetitle: Force printing title.
97 forceid: Force printing ID.
98 forcethumbnail: Force printing thumbnail URL.
99 forcedescription: Force printing description.
100 forcefilename: Force printing final filename.
101 forceduration: Force printing duration.
102 forcejson: Force printing info_dict as JSON.
103 simulate: Do not download the video files.
104 format: Video format code.
105 format_limit: Highest quality format to try.
106 outtmpl: Template for output names.
107 restrictfilenames: Do not allow "&" and spaces in file names
108 ignoreerrors: Do not stop on download errors.
109 nooverwrites: Prevent overwriting files.
110 playliststart: Playlist item to start at.
111 playlistend: Playlist item to end at.
112 matchtitle: Download only matching titles.
113 rejecttitle: Reject downloads for matching titles.
114 logger: Log messages to a logging.Logger instance.
115 logtostderr: Log messages to stderr instead of stdout.
116 writedescription: Write the video description to a .description file
117 writeinfojson: Write the video description to a .info.json file
118 writeannotations: Write the video annotations to a .annotations.xml file
119 writethumbnail: Write the thumbnail image to a file
120 writesubtitles: Write the video subtitles to a file
121 writeautomaticsub: Write the automatic subtitles to a file
122 allsubtitles: Downloads all the subtitles of the video
123 (requires writesubtitles or writeautomaticsub)
124 listsubtitles: Lists all available subtitles for the video
125 subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt)
126 subtitleslangs: List of languages of the subtitles to download
127 keepvideo: Keep the video file after post-processing
128 daterange: A DateRange object, download only if the upload_date is in the range.
129 skip_download: Skip the actual download of the video file
130 cachedir: Location of the cache files in the filesystem.
131 None to disable filesystem cache.
132 noplaylist: Download single video instead of a playlist if in doubt.
133 age_limit: An integer representing the user's age in years.
134 Unsuitable videos for the given age are skipped.
135 min_views: An integer representing the minimum view count the video
136 must have in order to not be skipped.
137 Videos without view count information are always
138 downloaded. None for no limit.
139 max_views: An integer representing the maximum view count.
140 Videos that are more popular than that are not
141 downloaded.
142 Videos without view count information are always
143 downloaded. None for no limit.
144 download_archive: File name of a file where all downloads are recorded.
145 Videos already present in the file are not downloaded
146 again.
147 cookiefile: File name where cookies should be read from and dumped to.
148 nocheckcertificate:Do not verify SSL certificates
149 proxy: URL of the proxy server to use
150 socket_timeout: Time to wait for unresponsive hosts, in seconds
151 bidi_workaround: Work around buggy terminals without bidirectional text
152                        support, using fribidi
153 debug_printtraffic:Print out sent and received HTTP traffic
154 include_ads: Download ads as well
155
156 The following parameters are not used by YoutubeDL itself, they are used by
157 the FileDownloader:
158 nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
159 noresizebuffer, retries, continuedl, noprogress, consoletitle
160
161 The following options are used by the post processors:
162 prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available,
163 otherwise prefer avconv.
164 """
165
    # Class-level defaults for per-instance state; the real values are
    # assigned in __init__().
    params = None            # dict of user options (see class docstring)
    _ies = []                # ordered list of registered InfoExtractors
    _pps = []                # chain of PostProcessor objects
    _download_retcode = None # exit code accumulated across downloads
    _num_downloads = None    # ordinal used for the %(autonumber)s template
    _screen_file = None      # file object used for screen output
172
173 def __init__(self, params=None):
174 """Create a FileDownloader object with the given options."""
175 if params is None:
176 params = {}
177 self._ies = []
178 self._ies_instances = {}
179 self._pps = []
180 self._progress_hooks = []
181 self._download_retcode = 0
182 self._num_downloads = 0
183 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
184 self._err_file = sys.stderr
185 self.params = params
186
187 if params.get('bidi_workaround', False):
188 try:
189 import pty
190 master, slave = pty.openpty()
191 width = get_term_width()
192 if width is None:
193 width_args = []
194 else:
195 width_args = ['-w', str(width)]
196 sp_kwargs = dict(
197 stdin=subprocess.PIPE,
198 stdout=slave,
199 stderr=self._err_file)
200 try:
201 self._output_process = subprocess.Popen(
202 ['bidiv'] + width_args, **sp_kwargs
203 )
204 except OSError:
205 self._output_process = subprocess.Popen(
206 ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
207 self._output_channel = os.fdopen(master, 'rb')
208 except OSError as ose:
209 if ose.errno == 2:
210 self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
211 else:
212 raise
213
214 if (sys.version_info >= (3,) and sys.platform != 'win32' and
215 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
216 and not params['restrictfilenames']):
217 # On Python 3, the Unicode filesystem API will throw errors (#1474)
218 self.report_warning(
219 'Assuming --restrict-filenames since file system encoding '
220 'cannot encode all charactes. '
221 'Set the LC_ALL environment variable to fix this.')
222 self.params['restrictfilenames'] = True
223
224 if '%(stitle)s' in self.params.get('outtmpl', ''):
225 self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
226
227 self._setup_opener()
228
229 def add_info_extractor(self, ie):
230 """Add an InfoExtractor object to the end of the list."""
231 self._ies.append(ie)
232 self._ies_instances[ie.ie_key()] = ie
233 ie.set_downloader(self)
234
235 def get_info_extractor(self, ie_key):
236 """
237 Get an instance of an IE with name ie_key, it will try to get one from
238 the _ies list, if there's no instance it will create a new one and add
239 it to the extractor list.
240 """
241 ie = self._ies_instances.get(ie_key)
242 if ie is None:
243 ie = get_info_extractor(ie_key)()
244 self.add_info_extractor(ie)
245 return ie
246
247 def add_default_info_extractors(self):
248 """
249 Add the InfoExtractors returned by gen_extractors to the end of the list
250 """
251 for ie in gen_extractors():
252 self.add_info_extractor(ie)
253
254 def add_post_processor(self, pp):
255 """Add a PostProcessor object to the end of the chain."""
256 self._pps.append(pp)
257 pp.set_downloader(self)
258
259 def add_progress_hook(self, ph):
260 """Add the progress hook (currently only for the file downloader)"""
261 self._progress_hooks.append(ph)
262
263 def _bidi_workaround(self, message):
264 if not hasattr(self, '_output_channel'):
265 return message
266
267 assert hasattr(self, '_output_process')
268 assert type(message) == type('')
269 line_count = message.count('\n') + 1
270 self._output_process.stdin.write((message + '\n').encode('utf-8'))
271 self._output_process.stdin.flush()
272 res = ''.join(self._output_channel.readline().decode('utf-8')
273 for _ in range(line_count))
274 return res[:-len('\n')]
275
276 def to_screen(self, message, skip_eol=False):
277 """Print message to stdout if not in quiet mode."""
278 return self.to_stdout(message, skip_eol, check_quiet=True)
279
280 def to_stdout(self, message, skip_eol=False, check_quiet=False):
281 """Print message to stdout if not in quiet mode."""
282 if self.params.get('logger'):
283 self.params['logger'].debug(message)
284 elif not check_quiet or not self.params.get('quiet', False):
285 message = self._bidi_workaround(message)
286 terminator = ['\n', ''][skip_eol]
287 output = message + terminator
288
289 write_string(output, self._screen_file)
290
291 def to_stderr(self, message):
292 """Print message to stderr."""
293 assert type(message) == type('')
294 if self.params.get('logger'):
295 self.params['logger'].error(message)
296 else:
297 message = self._bidi_workaround(message)
298 output = message + '\n'
299 write_string(output, self._err_file)
300
    def to_console_title(self, message):
        """Set the terminal/console window title to *message*, if enabled."""
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm OSC 0 escape sequence: set icon name and window title
            write_string('\033]0;%s\007' % message, self._screen_file)
310
311 def save_console_title(self):
312 if not self.params.get('consoletitle', False):
313 return
314 if 'TERM' in os.environ:
315 # Save the title on stack
316 write_string('\033[22;0t', self._screen_file)
317
318 def restore_console_title(self):
319 if not self.params.get('consoletitle', False):
320 return
321 if 'TERM' in os.environ:
322 # Restore the title from stack
323 write_string('\033[23;0t', self._screen_file)
324
    def __enter__(self):
        """Context-manager entry: save the console title and return self."""
        self.save_console_title()
        return self
328
329 def __exit__(self, *args):
330 self.restore_console_title()
331
332 if self.params.get('cookiefile') is not None:
333 self.cookiejar.save()
334
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        # The active exception carries a wrapped exc_info
                        # (e.g. an ExtractorError with its cause); show it first
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += compat_str(traceback.format_exc())
                else:
                    # Not inside an exception handler: show the current stack
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Prefer the wrapped exception's exc_info, so the DownloadError
            # raised here points at the original cause
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        # ignoreerrors: swallow the problem but remember a failing exit code
        self._download_retcode = 1
364
365 def report_warning(self, message):
366 '''
367 Print the message to stderr, it will be prefixed with 'WARNING:'
368 If stderr is a tty file the 'WARNING:' will be colored
369 '''
370 if self._err_file.isatty() and os.name != 'nt':
371 _msg_header = '\033[0;33mWARNING:\033[0m'
372 else:
373 _msg_header = 'WARNING:'
374 warning_message = '%s %s' % (_msg_header, message)
375 self.to_stderr(warning_message)
376
377 def report_error(self, message, tb=None):
378 '''
379 Do the same as trouble, but prefixes the message with 'ERROR:', colored
380 in red if stderr is a tty file.
381 '''
382 if self._err_file.isatty() and os.name != 'nt':
383 _msg_header = '\033[0;31mERROR:\033[0m'
384 else:
385 _msg_header = 'ERROR:'
386 error_message = '%s %s' % (_msg_header, message)
387 self.trouble(error_message, tb)
388
389 def report_file_already_downloaded(self, file_name):
390 """Report file has already been fully downloaded."""
391 try:
392 self.to_screen('[download] %s has already been downloaded' % file_name)
393 except UnicodeEncodeError:
394 self.to_screen('[download] The file has already been downloaded')
395
396 def increment_downloads(self):
397 """Increment the ordinal that assigns a number to each file."""
398 self._num_downloads += 1
399
    def prepare_filename(self, info_dict):
        """Generate the output filename.

        Expands self.params['outtmpl'] with a sanitized copy of *info_dict*;
        missing template fields expand to 'NA'.  Returns None (after
        reporting the error) when the template cannot be expanded.
        """
        try:
            template_dict = dict(info_dict)

            template_dict['epoch'] = int(time.time())
            autonumber_size = self.params.get('autonumber_size')
            if autonumber_size is None:
                autonumber_size = 5
            # e.g. '%05d': zero-padded ordinal of this download
            autonumber_templ = '%0' + str(autonumber_size) + 'd'
            template_dict['autonumber'] = autonumber_templ % self._num_downloads
            if template_dict.get('playlist_index') is not None:
                template_dict['playlist_index'] = '%05d' % template_dict['playlist_index']

            # Make every value safe for use in a filename; 'id' gets the
            # more permissive is_id treatment
            sanitize = lambda k, v: sanitize_filename(
                compat_str(v),
                restricted=self.params.get('restrictfilenames'),
                is_id=(k == 'id'))
            # Drop None values so they fall through to the 'NA' default below
            template_dict = dict((k, sanitize(k, v))
                                 for k, v in template_dict.items()
                                 if v is not None)
            template_dict = collections.defaultdict(lambda: 'NA', template_dict)

            tmpl = os.path.expanduser(self.params['outtmpl'])
            filename = tmpl % template_dict
            return filename
        except ValueError as err:
            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
            return None
429
430 def _match_entry(self, info_dict):
431 """ Returns None iff the file should be downloaded """
432
433 video_title = info_dict.get('title', info_dict.get('id', 'video'))
434 if 'title' in info_dict:
435 # This can happen when we're just evaluating the playlist
436 title = info_dict['title']
437 matchtitle = self.params.get('matchtitle', False)
438 if matchtitle:
439 if not re.search(matchtitle, title, re.IGNORECASE):
440 return '"' + title + '" title did not match pattern "' + matchtitle + '"'
441 rejecttitle = self.params.get('rejecttitle', False)
442 if rejecttitle:
443 if re.search(rejecttitle, title, re.IGNORECASE):
444 return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
445 date = info_dict.get('upload_date', None)
446 if date is not None:
447 dateRange = self.params.get('daterange', DateRange())
448 if date not in dateRange:
449 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
450 view_count = info_dict.get('view_count', None)
451 if view_count is not None:
452 min_views = self.params.get('min_views')
453 if min_views is not None and view_count < min_views:
454 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
455 max_views = self.params.get('max_views')
456 if max_views is not None and view_count > max_views:
457 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
458 age_limit = self.params.get('age_limit')
459 if age_limit is not None:
460 if age_limit < info_dict.get('age_limit', 0):
461 return 'Skipping "' + title + '" because it is age restricted'
462 if self.in_download_archive(info_dict):
463 return '%s has already been recorded in archive' % video_title
464 return None
465
466 @staticmethod
467 def add_extra_info(info_dict, extra_info):
468 '''Set the keys from extra_info in info dict if they are missing'''
469 for key, value in extra_info.items():
470 info_dict.setdefault(key, value)
471
    def extract_info(self, url, download=True, ie_key=None, extra_info={},
                     process=True):
        '''
        Returns a list with a dictionary for each video we find.
        If 'download', also downloads the videos.
        extra_info is a dict containing the extra values to add to each result
        '''

        # When an extractor is forced via ie_key only that one is tried;
        # otherwise every registered IE is probed in registration order.
        if ie_key:
            ies = [self.get_info_extractor(ie_key)]
        else:
            ies = self._ies

        for ie in ies:
            if not ie.suitable(url):
                continue

            if not ie.working():
                self.report_warning('The program functionality for this site has been marked as broken, '
                                    'and will probably not work.')

            try:
                ie_result = ie.extract(url)
                if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
                    break
                if isinstance(ie_result, list):
                    # Backwards compatibility: old IE result format
                    ie_result = {
                        '_type': 'compat_list',
                        'entries': ie_result,
                    }
                # Record provenance on the result without clobbering
                # anything the extractor itself already set
                self.add_extra_info(ie_result,
                    {
                        'extractor': ie.IE_NAME,
                        'webpage_url': url,
                        'webpage_url_basename': url_basename(url),
                        'extractor_key': ie.ie_key(),
                    })
                if process:
                    return self.process_ie_result(ie_result, download, extra_info)
                else:
                    return ie_result
            except ExtractorError as de: # An error we somewhat expected
                self.report_error(compat_str(de), de.format_traceback())
                break
            except Exception as e:
                if self.params.get('ignoreerrors', False):
                    self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
                    break
                else:
                    raise
        else:
            # for-else: no extractor accepted the URL
            self.report_error('no suitable InfoExtractor: %s' % url)
525
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.

        Dispatches on ie_result['_type']: 'video', 'url', 'url_transparent',
        'playlist' or the legacy 'compat_list'.
        """

        result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            def make_result(embedded_info):
                # Start from the embedding page's data, then let the
                # embedded (resolved) result override the listed fields
                new_result = ie_result.copy()
                for f in ('_type', 'url', 'ext', 'player_url', 'formats',
                          'entries', 'ie_key', 'duration',
                          'subtitles', 'annotations', 'format',
                          'thumbnail', 'thumbnails'):
                    if f in new_result:
                        del new_result[f]
                    if f in embedded_info:
                        new_result[f] = embedded_info[f]
                return new_result
            new_result = make_result(info)

            assert new_result.get('_type') != 'url_transparent'
            if new_result.get('_type') == 'compat_list':
                new_result['entries'] = [
                    make_result(e) for e in new_result['entries']]

            # Recurse: the merged result may itself be a playlist etc.
            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type == 'playlist':
            # We process each entry in the playlist
            playlist = ie_result.get('title', None) or ie_result.get('id', None)
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            n_all_entries = len(ie_result['entries'])
            # --playlist-start is 1-based on the command line
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend', None)
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            entries = ie_result['entries'][playliststart:playlistend]
            n_entries = len(entries)

            self.to_screen(
                "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
                (ie_result['extractor'], playlist, n_all_entries, n_entries))

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
                extra = {
                    'playlist': playlist,
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # Apply title/date/view-count/age/archive filters per entry
                reason = self._match_entry(entry)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            return ie_result
        elif result_type == 'compat_list':
            def _fixup(r):
                # Propagate provenance to each legacy-format entry
                self.add_extra_info(r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    })
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
632
633 def select_format(self, format_spec, available_formats):
634 if format_spec == 'best' or format_spec is None:
635 return available_formats[-1]
636 elif format_spec == 'worst':
637 return available_formats[0]
638 else:
639 extensions = ['mp4', 'flv', 'webm', '3gp']
640 if format_spec in extensions:
641 filter_f = lambda f: f['ext'] == format_spec
642 else:
643 filter_f = lambda f: f['format_id'] == format_spec
644 matches = list(filter(filter_f, available_formats))
645 if matches:
646 return matches[-1]
647 return None
648
    def process_video_result(self, info_dict, download=True):
        """Resolve the requested format(s) for a single video result and,
        when *download* is true, hand each selected format to process_info().

        Returns info_dict, updated in place with the chosen format fields.
        """
        assert info_dict.get('_type', 'video') == 'video'

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        # These extractors handle format selection themselves
        if info_dict['extractor'] in ['Youku']:
            if download:
                self.process_info(info_dict)
            return info_dict

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        # We check that all the formats have the format and format_id fields
        for (i, format) in enumerate(formats):
            if format.get('format_id') is None:
                format['format_id'] = compat_str(i)
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                )
            # Automatically determine file extension if missing
            if 'ext' not in format:
                format['ext'] = determine_ext(format['url'])

        # --format-limit: cut the list right after the limiting format_id
        format_limit = self.params.get('format_limit', None)
        if format_limit:
            formats = list(takewhile_inclusive(
                lambda f: f['format_id'] != format_limit, formats
            ))

        # TODO Central sorting goes here

        if formats[0] is not info_dict:
            # only set the 'formats' field if the original info_dict lists them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats
        if self.params.get('listformats', None):
            self.list_formats(info_dict)
            return

        req_format = self.params.get('format', 'best')
        if req_format is None:
            req_format = 'best'
        formats_to_download = []
        # The -1 is for supporting YoutubeIE
        if req_format in ('-1', 'all'):
            formats_to_download = formats
        else:
            # We can accept formats requested in the format: 34/5/best, we pick
            # the first that is available, starting from left
            req_formats = req_format.split('/')
            for rf in req_formats:
                if re.match(r'.+?\+.+?', rf) is not None:
                    # Two formats have been requested like '137+139'
                    format_1, format_2 = rf.split('+')
                    formats_info = (self.select_format(format_1, formats),
                                    self.select_format(format_2, formats))
                    # Only valid when both halves resolved to a format
                    if all(formats_info):
                        selected_format = {
                            'requested_formats': formats_info,
                            'format': rf,
                            'ext': formats_info[0]['ext'],
                        }
                    else:
                        selected_format = None
                else:
                    selected_format = self.select_format(rf, formats)
                if selected_format is not None:
                    formats_to_download = [selected_format]
                    break
        if not formats_to_download:
            raise ExtractorError('requested format not available',
                                 expected=True)

        if download:
            if len(formats_to_download) > 1:
                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
            for format in formats_to_download:
                new_info = dict(info_dict)
                new_info.update(format)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        info_dict.update(formats_to_download[-1])
        return info_dict
746
747 def process_info(self, info_dict):
748 """Process a single resolved IE result."""
749
750 assert info_dict.get('_type', 'video') == 'video'
751 #We increment the download the download count here to match the previous behaviour.
752 self.increment_downloads()
753
754 info_dict['fulltitle'] = info_dict['title']
755 if len(info_dict['title']) > 200:
756 info_dict['title'] = info_dict['title'][:197] + '...'
757
758 # Keep for backwards compatibility
759 info_dict['stitle'] = info_dict['title']
760
761 if not 'format' in info_dict:
762 info_dict['format'] = info_dict['ext']
763
764 reason = self._match_entry(info_dict)
765 if reason is not None:
766 self.to_screen('[download] ' + reason)
767 return
768
769 max_downloads = self.params.get('max_downloads')
770 if max_downloads is not None:
771 if self._num_downloads > int(max_downloads):
772 raise MaxDownloadsReached()
773
774 filename = self.prepare_filename(info_dict)
775
776 # Forced printings
777 if self.params.get('forcetitle', False):
778 self.to_stdout(info_dict['fulltitle'])
779 if self.params.get('forceid', False):
780 self.to_stdout(info_dict['id'])
781 if self.params.get('forceurl', False):
782 # For RTMP URLs, also include the playpath
783 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
784 if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
785 self.to_stdout(info_dict['thumbnail'])
786 if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
787 self.to_stdout(info_dict['description'])
788 if self.params.get('forcefilename', False) and filename is not None:
789 self.to_stdout(filename)
790 if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
791 self.to_stdout(formatSeconds(info_dict['duration']))
792 if self.params.get('forceformat', False):
793 self.to_stdout(info_dict['format'])
794 if self.params.get('forcejson', False):
795 info_dict['_filename'] = filename
796 self.to_stdout(json.dumps(info_dict))
797
798 # Do nothing else if in simulate mode
799 if self.params.get('simulate', False):
800 return
801
802 if filename is None:
803 return
804
805 try:
806 dn = os.path.dirname(encodeFilename(filename))
807 if dn != '' and not os.path.exists(dn):
808 os.makedirs(dn)
809 except (OSError, IOError) as err:
810 self.report_error('unable to create directory ' + compat_str(err))
811 return
812
813 if self.params.get('writedescription', False):
814 descfn = filename + '.description'
815 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
816 self.to_screen('[info] Video description is already present')
817 else:
818 try:
819 self.to_screen('[info] Writing video description to: ' + descfn)
820 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
821 descfile.write(info_dict['description'])
822 except (KeyError, TypeError):
823 self.report_warning('There\'s no description to write.')
824 except (OSError, IOError):
825 self.report_error('Cannot write description file ' + descfn)
826 return
827
828 if self.params.get('writeannotations', False):
829 annofn = filename + '.annotations.xml'
830 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
831 self.to_screen('[info] Video annotations are already present')
832 else:
833 try:
834 self.to_screen('[info] Writing video annotations to: ' + annofn)
835 with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
836 annofile.write(info_dict['annotations'])
837 except (KeyError, TypeError):
838 self.report_warning('There are no annotations to write.')
839 except (OSError, IOError):
840 self.report_error('Cannot write annotations file: ' + annofn)
841 return
842
843 subtitles_are_requested = any([self.params.get('writesubtitles', False),
844 self.params.get('writeautomaticsub')])
845
846 if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
847 # subtitles download errors are already managed as troubles in relevant IE
848 # that way it will silently go on when used with unsupporting IE
849 subtitles = info_dict['subtitles']
850 sub_format = self.params.get('subtitlesformat', 'srt')
851 for sub_lang in subtitles.keys():
852 sub = subtitles[sub_lang]
853 if sub is None:
854 continue
855 try:
856 sub_filename = subtitles_filename(filename, sub_lang, sub_format)
857 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
858 self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
859 else:
860 self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
861 with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
862 subfile.write(sub)
863 except (OSError, IOError):
864 self.report_error('Cannot write subtitles file ' + descfn)
865 return
866
867 if self.params.get('writeinfojson', False):
868 infofn = os.path.splitext(filename)[0] + '.info.json'
869 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
870 self.to_screen('[info] Video description metadata is already present')
871 else:
872 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
873 try:
874 write_json_file(info_dict, encodeFilename(infofn))
875 except (OSError, IOError):
876 self.report_error('Cannot write metadata to JSON file ' + infofn)
877 return
878
879 if self.params.get('writethumbnail', False):
880 if info_dict.get('thumbnail') is not None:
881 thumb_format = determine_ext(info_dict['thumbnail'], 'jpg')
882 thumb_filename = os.path.splitext(filename)[0] + '.' + thumb_format
883 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
884 self.to_screen('[%s] %s: Thumbnail is already present' %
885 (info_dict['extractor'], info_dict['id']))
886 else:
887 self.to_screen('[%s] %s: Downloading thumbnail ...' %
888 (info_dict['extractor'], info_dict['id']))
889 try:
890 uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
891 with open(thumb_filename, 'wb') as thumbf:
892 shutil.copyfileobj(uf, thumbf)
893 self.to_screen('[%s] %s: Writing thumbnail to: %s' %
894 (info_dict['extractor'], info_dict['id'], thumb_filename))
895 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
896 self.report_warning('Unable to download thumbnail "%s": %s' %
897 (info_dict['thumbnail'], compat_str(err)))
898
899 if not self.params.get('skip_download', False):
900 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
901 success = True
902 else:
903 try:
904 def dl(name, info):
905 fd = get_suitable_downloader(info)(self, self.params)
906 for ph in self._progress_hooks:
907 fd.add_progress_hook(ph)
908 return fd.download(name, info)
909 if info_dict.get('requested_formats') is not None:
910 downloaded = []
911 success = True
912 merger = FFmpegMergerPP(self)
913 if not merger._get_executable():
914 postprocessors = []
915 self.report_warning('You have requested multiple '
916 'formats but ffmpeg or avconv are not installed.'
917 ' The formats won\'t be merged')
918 else:
919 postprocessors = [merger]
920 for f in info_dict['requested_formats']:
921 new_info = dict(info_dict)
922 new_info.update(f)
923 fname = self.prepare_filename(new_info)
924 fname = prepend_extension(fname, 'f%s' % f['format_id'])
925 downloaded.append(fname)
926 partial_success = dl(fname, new_info)
927 success = success and partial_success
928 info_dict['__postprocessors'] = postprocessors
929 info_dict['__files_to_merge'] = downloaded
930 else:
931 # Just a single file
932 success = dl(filename, info_dict)
933 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
934 self.report_error('unable to download video data: %s' % str(err))
935 return
936 except (OSError, IOError) as err:
937 raise UnavailableVideoError(err)
938 except (ContentTooShortError, ) as err:
939 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
940 return
941
942 if success:
943 try:
944 self.post_process(filename, info_dict)
945 except (PostProcessingError) as err:
946 self.report_error('postprocessing: %s' % str(err))
947 return
948
949 self.record_download_archive(info_dict)
950
951 def download(self, url_list):
952 """Download a given list of URLs."""
953 if (len(url_list) > 1 and
954 '%' not in self.params['outtmpl']
955 and self.params.get('max_downloads') != 1):
956 raise SameFileError(self.params['outtmpl'])
957
958 for url in url_list:
959 try:
960 #It also downloads the videos
961 self.extract_info(url)
962 except UnavailableVideoError:
963 self.report_error('unable to download video')
964 except MaxDownloadsReached:
965 self.to_screen('[info] Maximum number of downloaded files reached.')
966 raise
967
968 return self._download_retcode
969
970 def download_with_info_file(self, info_filename):
971 with io.open(info_filename, 'r', encoding='utf-8') as f:
972 info = json.load(f)
973 try:
974 self.process_ie_result(info, download=True)
975 except DownloadError:
976 webpage_url = info.get('webpage_url')
977 if webpage_url is not None:
978 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
979 return self.download([webpage_url])
980 else:
981 raise
982 return self._download_retcode
983
984 def post_process(self, filename, ie_info):
985 """Run all the postprocessors on the given file."""
986 info = dict(ie_info)
987 info['filepath'] = filename
988 keep_video = None
989 pps_chain = []
990 if ie_info.get('__postprocessors') is not None:
991 pps_chain.extend(ie_info['__postprocessors'])
992 pps_chain.extend(self._pps)
993 for pp in pps_chain:
994 try:
995 keep_video_wish, new_info = pp.run(info)
996 if keep_video_wish is not None:
997 if keep_video_wish:
998 keep_video = keep_video_wish
999 elif keep_video is None:
1000 # No clear decision yet, let IE decide
1001 keep_video = keep_video_wish
1002 except PostProcessingError as e:
1003 self.report_error(e.msg)
1004 if keep_video is False and not self.params.get('keepvideo', False):
1005 try:
1006 self.to_screen('Deleting original file %s (pass -k to keep)' % filename)
1007 os.remove(encodeFilename(filename))
1008 except (IOError, OSError):
1009 self.report_warning('Unable to remove downloaded video file')
1010
1011 def _make_archive_id(self, info_dict):
1012 # Future-proof against any change in case
1013 # and backwards compatibility with prior versions
1014 extractor = info_dict.get('extractor_key')
1015 if extractor is None:
1016 if 'id' in info_dict:
1017 extractor = info_dict.get('ie_key') # key in a playlist
1018 if extractor is None:
1019 return None # Incomplete video information
1020 return extractor.lower() + ' ' + info_dict['id']
1021
1022 def in_download_archive(self, info_dict):
1023 fn = self.params.get('download_archive')
1024 if fn is None:
1025 return False
1026
1027 vid_id = self._make_archive_id(info_dict)
1028 if vid_id is None:
1029 return False # Incomplete video information
1030
1031 try:
1032 with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1033 for line in archive_file:
1034 if line.strip() == vid_id:
1035 return True
1036 except IOError as ioe:
1037 if ioe.errno != errno.ENOENT:
1038 raise
1039 return False
1040
1041 def record_download_archive(self, info_dict):
1042 fn = self.params.get('download_archive')
1043 if fn is None:
1044 return
1045 vid_id = self._make_archive_id(info_dict)
1046 assert vid_id
1047 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1048 archive_file.write(vid_id + '\n')
1049
1050 @staticmethod
1051 def format_resolution(format, default='unknown'):
1052 if format.get('vcodec') == 'none':
1053 return 'audio only'
1054 if format.get('resolution') is not None:
1055 return format['resolution']
1056 if format.get('height') is not None:
1057 if format.get('width') is not None:
1058 res = '%sx%s' % (format['width'], format['height'])
1059 else:
1060 res = '%sp' % format['height']
1061 elif format.get('width') is not None:
1062 res = '?x%d' % format['width']
1063 else:
1064 res = default
1065 return res
1066
    def list_formats(self, info_dict):
        """Print a table of all available formats for the given video."""
        def format_note(fdict):
            # Build a short free-text summary (codecs, bitrates, filesize)
            # for one format dict; returns '' when nothing is known.
            res = ''
            if fdict.get('ext') in ['f4f', 'f4m']:
                res += '(unsupported) '
            if fdict.get('format_note') is not None:
                res += fdict['format_note'] + ' '
            if fdict.get('tbr') is not None:
                res += '%4dk ' % fdict['tbr']
            if (fdict.get('vcodec') is not None and
                    fdict.get('vcodec') != 'none'):
                res += '%-5s' % fdict['vcodec']
                # The trailing '@' glues the video codec to the vbr appended below.
                if fdict.get('vbr') is not None:
                    res += '@'
            elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
                # No codec name known, but separate video/audio bitrates exist.
                res += 'video@'
            if fdict.get('vbr') is not None:
                res += '%4dk' % fdict['vbr']
            if fdict.get('acodec') is not None:
                if res:
                    res += ', '
                res += '%-5s' % fdict['acodec']
            elif fdict.get('abr') is not None:
                if res:
                    res += ', '
                res += 'audio'
            if fdict.get('abr') is not None:
                res += '@%3dk' % fdict['abr']
            if fdict.get('filesize') is not None:
                if res:
                    res += ', '
                res += format_bytes(fdict['filesize'])
            return res

        def line(format, idlen=20):
            # One table row: format id, extension, resolution, note,
            # with the id column padded to idlen+1 characters.
            return (('%-' + compat_str(idlen + 1) + 's%-10s%-12s%s') % (
                format['format_id'],
                format['ext'],
                self.format_resolution(format),
                format_note(format),
            ))

        # A plain (non-multi-format) info_dict is listed as its own single format.
        formats = info_dict.get('formats', [info_dict])
        idlen = max(len('format code'),
                    max(len(f['format_id']) for f in formats))
        formats_s = [line(f, idlen) for f in formats]
        # Formats are assumed sorted ascending by quality: first is worst, last is best.
        if len(formats) > 1:
            formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)'
            formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'

        header_line = line({
            'format_id': 'format code', 'ext': 'extension',
            'resolution': 'resolution', 'format_note': 'note'}, idlen=idlen)
        self.to_screen('[info] Available formats for %s:\n%s\n%s' %
                       (info_dict['id'], header_line, '\n'.join(formats_s)))
1122
1123 def urlopen(self, req):
1124 """ Start an HTTP download """
1125 return self._opener.open(req)
1126
1127 def print_debug_header(self):
1128 if not self.params.get('verbose'):
1129 return
1130 write_string('[debug] youtube-dl version ' + __version__ + '\n')
1131 try:
1132 sp = subprocess.Popen(
1133 ['git', 'rev-parse', '--short', 'HEAD'],
1134 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
1135 cwd=os.path.dirname(os.path.abspath(__file__)))
1136 out, err = sp.communicate()
1137 out = out.decode().strip()
1138 if re.match('[0-9a-f]+', out):
1139 write_string('[debug] Git HEAD: ' + out + '\n')
1140 except:
1141 try:
1142 sys.exc_clear()
1143 except:
1144 pass
1145 write_string('[debug] Python version %s - %s' %
1146 (platform.python_version(), platform_name()) + '\n')
1147
1148 proxy_map = {}
1149 for handler in self._opener.handlers:
1150 if hasattr(handler, 'proxies'):
1151 proxy_map.update(handler.proxies)
1152 write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
1153
    def _setup_opener(self):
        """Build and install the urllib opener used for all HTTP requests.

        Side effects: sets self.cookiejar and self._opener, installs the
        opener globally, and sets the global socket default timeout.
        """
        timeout_val = self.params.get('socket_timeout')
        # Generous 600 s default when no explicit socket timeout was given.
        timeout = 600 if timeout_val is None else float(timeout_val)

        opts_cookiefile = self.params.get('cookiefile')
        opts_proxy = self.params.get('proxy')

        if opts_cookiefile is None:
            # In-memory cookies only.
            self.cookiejar = compat_cookiejar.CookieJar()
        else:
            # Persistent Mozilla-format cookie file; load it if readable
            # (a not-yet-existing file is fine, it will be created on save).
            self.cookiejar = compat_cookiejar.MozillaCookieJar(
                opts_cookiefile)
            if os.access(opts_cookiefile, os.R_OK):
                self.cookiejar.load()

        cookie_processor = compat_urllib_request.HTTPCookieProcessor(
            self.cookiejar)
        if opts_proxy is not None:
            # An empty --proxy string explicitly disables all proxies.
            if opts_proxy == '':
                proxies = {}
            else:
                proxies = {'http': opts_proxy, 'https': opts_proxy}
        else:
            # No explicit proxy: honour the environment (http_proxy etc.).
            proxies = compat_urllib_request.getproxies()
            # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
            if 'http' in proxies and 'https' not in proxies:
                proxies['https'] = proxies['http']
        proxy_handler = compat_urllib_request.ProxyHandler(proxies)

        # debuglevel=1 makes the HTTP(S) handlers dump request/response traffic.
        debuglevel = 1 if self.params.get('debug_printtraffic') else 0
        https_handler = make_HTTPS_handler(
            self.params.get('nocheckcertificate', False), debuglevel=debuglevel)
        ydlh = YoutubeDLHandler(debuglevel=debuglevel)
        opener = compat_urllib_request.build_opener(
            https_handler, proxy_handler, cookie_processor, ydlh)
        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        self._opener = opener

        # TODO remove this global modification
        compat_urllib_request.install_opener(opener)
        socket.setdefaulttimeout(timeout)