]> jfr.im git - yt-dlp.git/blob - youtube_dl/YoutubeDL.py
Use the option in preparing the merge output filename
[yt-dlp.git] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import datetime
8 import errno
9 import io
10 import itertools
11 import json
12 import locale
13 import os
14 import platform
15 import re
16 import shutil
17 import subprocess
18 import socket
19 import sys
20 import time
21 import traceback
22
23 if os.name == 'nt':
24 import ctypes
25
26 from .compat import (
27 compat_cookiejar,
28 compat_expanduser,
29 compat_http_client,
30 compat_kwargs,
31 compat_str,
32 compat_urllib_error,
33 compat_urllib_request,
34 )
35 from .utils import (
36 escape_url,
37 ContentTooShortError,
38 date_from_str,
39 DateRange,
40 DEFAULT_OUTTMPL,
41 determine_ext,
42 DownloadError,
43 encodeFilename,
44 ExtractorError,
45 format_bytes,
46 formatSeconds,
47 get_term_width,
48 locked_file,
49 make_HTTPS_handler,
50 MaxDownloadsReached,
51 PagedList,
52 PostProcessingError,
53 platform_name,
54 preferredencoding,
55 SameFileError,
56 sanitize_filename,
57 subtitles_filename,
58 takewhile_inclusive,
59 UnavailableVideoError,
60 url_basename,
61 write_json_file,
62 write_string,
63 YoutubeDLHandler,
64 prepend_extension,
65 args_to_str,
66 age_restricted,
67 )
68 from .cache import Cache
69 from .extractor import get_info_extractor, gen_extractors
70 from .downloader import get_suitable_downloader
71 from .downloader.rtmp import rtmpdump_version
72 from .postprocessor import (
73 FFmpegMergerPP,
74 FFmpegPostProcessor,
75 get_postprocessor,
76 )
77 from .version import __version__
78
79
80 class YoutubeDL(object):
81 """YoutubeDL class.
82
83 YoutubeDL objects are the ones responsible of downloading the
84 actual video file and writing it to disk if the user has requested
85 it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information — a task that InfoExtractors
    handle — it has to pass the URL to one of them.
89
90 For this, YoutubeDL objects have a method that allows
91 InfoExtractors to be registered in a given order. When it is passed
92 a URL, the YoutubeDL object handles it to the first InfoExtractor it
93 finds that reports being able to handle it. The InfoExtractor extracts
94 all the information about the video or videos the URL refers to, and
95 YoutubeDL process the extracted information, possibly using a File
96 Downloader to download the video.
97
98 YoutubeDL objects accept a lot of parameters. In order not to saturate
99 the object constructor with arguments, it receives a dictionary of
100 options instead. These options are available through the params
101 attribute for the InfoExtractors to use. The YoutubeDL also
102 registers itself as the downloader in charge for the InfoExtractors
103 that are added to it, so this is a "mutual registration".
104
105 Available options:
106
107 username: Username for authentication purposes.
108 password: Password for authentication purposes.
    videopassword:     Password for accessing a video.
110 usenetrc: Use netrc for authentication instead.
111 verbose: Print additional info to stdout.
112 quiet: Do not print messages to stdout.
113 no_warnings: Do not print out anything for warnings.
114 forceurl: Force printing final URL.
115 forcetitle: Force printing title.
116 forceid: Force printing ID.
117 forcethumbnail: Force printing thumbnail URL.
118 forcedescription: Force printing description.
119 forcefilename: Force printing final filename.
120 forceduration: Force printing duration.
121 forcejson: Force printing info_dict as JSON.
122 dump_single_json: Force printing the info_dict of the whole playlist
123 (or video) as a single JSON line.
124 simulate: Do not download the video files.
125 format: Video format code. See options.py for more information.
126 format_limit: Highest quality format to try.
127 outtmpl: Template for output names.
128 restrictfilenames: Do not allow "&" and spaces in file names
129 ignoreerrors: Do not stop on download errors.
130 nooverwrites: Prevent overwriting files.
131 playliststart: Playlist item to start at.
132 playlistend: Playlist item to end at.
133 playlistreverse: Download playlist items in reverse order.
134 matchtitle: Download only matching titles.
135 rejecttitle: Reject downloads for matching titles.
136 logger: Log messages to a logging.Logger instance.
137 logtostderr: Log messages to stderr instead of stdout.
138 writedescription: Write the video description to a .description file
139 writeinfojson: Write the video description to a .info.json file
140 writeannotations: Write the video annotations to a .annotations.xml file
141 writethumbnail: Write the thumbnail image to a file
142 writesubtitles: Write the video subtitles to a file
143 writeautomaticsub: Write the automatic subtitles to a file
144 allsubtitles: Downloads all the subtitles of the video
145 (requires writesubtitles or writeautomaticsub)
146 listsubtitles: Lists all available subtitles for the video
147 subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt)
148 subtitleslangs: List of languages of the subtitles to download
149 keepvideo: Keep the video file after post-processing
150 daterange: A DateRange object, download only if the upload_date is in the range.
151 skip_download: Skip the actual download of the video file
152 cachedir: Location of the cache files in the filesystem.
153 False to disable filesystem cache.
154 noplaylist: Download single video instead of a playlist if in doubt.
155 age_limit: An integer representing the user's age in years.
156 Unsuitable videos for the given age are skipped.
157 min_views: An integer representing the minimum view count the video
158 must have in order to not be skipped.
159 Videos without view count information are always
160 downloaded. None for no limit.
161 max_views: An integer representing the maximum view count.
162 Videos that are more popular than that are not
163 downloaded.
164 Videos without view count information are always
165 downloaded. None for no limit.
166 download_archive: File name of a file where all downloads are recorded.
167 Videos already present in the file are not downloaded
168 again.
169 cookiefile: File name where cookies should be read from and dumped to.
170 nocheckcertificate:Do not verify SSL certificates
171 prefer_insecure: Use HTTP instead of HTTPS to retrieve information.
172 At the moment, this is only supported by YouTube.
173 proxy: URL of the proxy server to use
174 socket_timeout: Time to wait for unresponsive hosts, in seconds
175 bidi_workaround: Work around buggy terminals without bidirectional text
                       support, using fribidi
177 debug_printtraffic:Print out sent and received HTTP traffic
178 include_ads: Download ads as well
179 default_search: Prepend this string if an input url is not valid.
180 'auto' for elaborate guessing
181 encoding: Use this encoding instead of the system-specified.
182 extract_flat: Do not resolve URLs, return the immediate result.
183 Pass in 'in_playlist' to only show this behavior for
184 playlist items.
185 postprocessors: A list of dictionaries, each with an entry
186 * key: The name of the postprocessor. See
187 youtube_dl/postprocessor/__init__.py for a list.
188 as well as any further keyword arguments for the
189 postprocessor.
190 progress_hooks: A list of functions that get called on download
191 progress, with a dictionary with the entries
192 * filename: The final filename
193 * status: One of "downloading" and "finished"
194
195 The dict may also have some of the following entries:
196
197 * downloaded_bytes: Bytes on disk
198 * total_bytes: Size of the whole file, None if unknown
199 * tmpfilename: The filename we're currently writing to
200 * eta: The estimated time in seconds, None if unknown
201 * speed: The download speed in bytes/second, None if
202 unknown
203
204 Progress hooks are guaranteed to be called at least once
205 (with status "finished") if the download is successful.
206
207
208 The following parameters are not used by YoutubeDL itself, they are used by
209 the FileDownloader:
210 nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
211 noresizebuffer, retries, continuedl, noprogress, consoletitle
212
213 The following options are used by the post processors:
214 prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available,
215 otherwise prefer avconv.
216 exec_cmd: Arbitrary command to run after downloading
217 """
218
219 params = None
220 _ies = []
221 _pps = []
222 _download_retcode = None
223 _num_downloads = None
224 _screen_file = None
225
226 def __init__(self, params=None, auto_init=True):
227 """Create a FileDownloader object with the given options."""
228 if params is None:
229 params = {}
230 self._ies = []
231 self._ies_instances = {}
232 self._pps = []
233 self._progress_hooks = []
234 self._download_retcode = 0
235 self._num_downloads = 0
236 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
237 self._err_file = sys.stderr
238 self.params = params
239 self.cache = Cache(self)
240
241 if params.get('bidi_workaround', False):
242 try:
243 import pty
244 master, slave = pty.openpty()
245 width = get_term_width()
246 if width is None:
247 width_args = []
248 else:
249 width_args = ['-w', str(width)]
250 sp_kwargs = dict(
251 stdin=subprocess.PIPE,
252 stdout=slave,
253 stderr=self._err_file)
254 try:
255 self._output_process = subprocess.Popen(
256 ['bidiv'] + width_args, **sp_kwargs
257 )
258 except OSError:
259 self._output_process = subprocess.Popen(
260 ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
261 self._output_channel = os.fdopen(master, 'rb')
262 except OSError as ose:
263 if ose.errno == 2:
264 self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
265 else:
266 raise
267
268 if (sys.version_info >= (3,) and sys.platform != 'win32' and
269 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
270 and not params.get('restrictfilenames', False)):
271 # On Python 3, the Unicode filesystem API will throw errors (#1474)
272 self.report_warning(
273 'Assuming --restrict-filenames since file system encoding '
274 'cannot encode all characters. '
275 'Set the LC_ALL environment variable to fix this.')
276 self.params['restrictfilenames'] = True
277
278 if '%(stitle)s' in self.params.get('outtmpl', ''):
279 self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
280
281 self._setup_opener()
282
283 if auto_init:
284 self.print_debug_header()
285 self.add_default_info_extractors()
286
287 for pp_def_raw in self.params.get('postprocessors', []):
288 pp_class = get_postprocessor(pp_def_raw['key'])
289 pp_def = dict(pp_def_raw)
290 del pp_def['key']
291 pp = pp_class(self, **compat_kwargs(pp_def))
292 self.add_post_processor(pp)
293
294 for ph in self.params.get('progress_hooks', []):
295 self.add_progress_hook(ph)
296
297 def warn_if_short_id(self, argv):
298 # short YouTube ID starting with dash?
299 idxs = [
300 i for i, a in enumerate(argv)
301 if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
302 if idxs:
303 correct_argv = (
304 ['youtube-dl'] +
305 [a for i, a in enumerate(argv) if i not in idxs] +
306 ['--'] + [argv[i] for i in idxs]
307 )
308 self.report_warning(
309 'Long argument string detected. '
310 'Use -- to separate parameters and URLs, like this:\n%s\n' %
311 args_to_str(correct_argv))
312
313 def add_info_extractor(self, ie):
314 """Add an InfoExtractor object to the end of the list."""
315 self._ies.append(ie)
316 self._ies_instances[ie.ie_key()] = ie
317 ie.set_downloader(self)
318
319 def get_info_extractor(self, ie_key):
320 """
321 Get an instance of an IE with name ie_key, it will try to get one from
322 the _ies list, if there's no instance it will create a new one and add
323 it to the extractor list.
324 """
325 ie = self._ies_instances.get(ie_key)
326 if ie is None:
327 ie = get_info_extractor(ie_key)()
328 self.add_info_extractor(ie)
329 return ie
330
331 def add_default_info_extractors(self):
332 """
333 Add the InfoExtractors returned by gen_extractors to the end of the list
334 """
335 for ie in gen_extractors():
336 self.add_info_extractor(ie)
337
338 def add_post_processor(self, pp):
339 """Add a PostProcessor object to the end of the chain."""
340 self._pps.append(pp)
341 pp.set_downloader(self)
342
343 def add_progress_hook(self, ph):
344 """Add the progress hook (currently only for the file downloader)"""
345 self._progress_hooks.append(ph)
346
347 def _bidi_workaround(self, message):
348 if not hasattr(self, '_output_channel'):
349 return message
350
351 assert hasattr(self, '_output_process')
352 assert isinstance(message, compat_str)
353 line_count = message.count('\n') + 1
354 self._output_process.stdin.write((message + '\n').encode('utf-8'))
355 self._output_process.stdin.flush()
356 res = ''.join(self._output_channel.readline().decode('utf-8')
357 for _ in range(line_count))
358 return res[:-len('\n')]
359
360 def to_screen(self, message, skip_eol=False):
361 """Print message to stdout if not in quiet mode."""
362 return self.to_stdout(message, skip_eol, check_quiet=True)
363
364 def _write_string(self, s, out=None):
365 write_string(s, out=out, encoding=self.params.get('encoding'))
366
367 def to_stdout(self, message, skip_eol=False, check_quiet=False):
368 """Print message to stdout if not in quiet mode."""
369 if self.params.get('logger'):
370 self.params['logger'].debug(message)
371 elif not check_quiet or not self.params.get('quiet', False):
372 message = self._bidi_workaround(message)
373 terminator = ['\n', ''][skip_eol]
374 output = message + terminator
375
376 self._write_string(output, self._screen_file)
377
378 def to_stderr(self, message):
379 """Print message to stderr."""
380 assert isinstance(message, compat_str)
381 if self.params.get('logger'):
382 self.params['logger'].error(message)
383 else:
384 message = self._bidi_workaround(message)
385 output = message + '\n'
386 self._write_string(output, self._err_file)
387
388 def to_console_title(self, message):
389 if not self.params.get('consoletitle', False):
390 return
391 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
392 # c_wchar_p() might not be necessary if `message` is
393 # already of type unicode()
394 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
395 elif 'TERM' in os.environ:
396 self._write_string('\033]0;%s\007' % message, self._screen_file)
397
398 def save_console_title(self):
399 if not self.params.get('consoletitle', False):
400 return
401 if 'TERM' in os.environ:
402 # Save the title on stack
403 self._write_string('\033[22;0t', self._screen_file)
404
405 def restore_console_title(self):
406 if not self.params.get('consoletitle', False):
407 return
408 if 'TERM' in os.environ:
409 # Restore the title from stack
410 self._write_string('\033[23;0t', self._screen_file)
411
412 def __enter__(self):
413 self.save_console_title()
414 return self
415
416 def __exit__(self, *args):
417 self.restore_console_title()
418
419 if self.params.get('cookiefile') is not None:
420 self.cookiejar.save()
421
422 def trouble(self, message=None, tb=None):
423 """Determine action to take when a download problem appears.
424
425 Depending on if the downloader has been configured to ignore
426 download errors or not, this method may throw an exception or
427 not when errors are found, after printing the message.
428
429 tb, if given, is additional traceback information.
430 """
431 if message is not None:
432 self.to_stderr(message)
433 if self.params.get('verbose'):
434 if tb is None:
435 if sys.exc_info()[0]: # if .trouble has been called from an except block
436 tb = ''
437 if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
438 tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
439 tb += compat_str(traceback.format_exc())
440 else:
441 tb_data = traceback.format_list(traceback.extract_stack())
442 tb = ''.join(tb_data)
443 self.to_stderr(tb)
444 if not self.params.get('ignoreerrors', False):
445 if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
446 exc_info = sys.exc_info()[1].exc_info
447 else:
448 exc_info = sys.exc_info()
449 raise DownloadError(message, exc_info)
450 self._download_retcode = 1
451
452 def report_warning(self, message):
453 '''
454 Print the message to stderr, it will be prefixed with 'WARNING:'
455 If stderr is a tty file the 'WARNING:' will be colored
456 '''
457 if self.params.get('logger') is not None:
458 self.params['logger'].warning(message)
459 else:
460 if self.params.get('no_warnings'):
461 return
462 if self._err_file.isatty() and os.name != 'nt':
463 _msg_header = '\033[0;33mWARNING:\033[0m'
464 else:
465 _msg_header = 'WARNING:'
466 warning_message = '%s %s' % (_msg_header, message)
467 self.to_stderr(warning_message)
468
469 def report_error(self, message, tb=None):
470 '''
471 Do the same as trouble, but prefixes the message with 'ERROR:', colored
472 in red if stderr is a tty file.
473 '''
474 if self._err_file.isatty() and os.name != 'nt':
475 _msg_header = '\033[0;31mERROR:\033[0m'
476 else:
477 _msg_header = 'ERROR:'
478 error_message = '%s %s' % (_msg_header, message)
479 self.trouble(error_message, tb)
480
481 def report_file_already_downloaded(self, file_name):
482 """Report file has already been fully downloaded."""
483 try:
484 self.to_screen('[download] %s has already been downloaded' % file_name)
485 except UnicodeEncodeError:
486 self.to_screen('[download] The file has already been downloaded')
487
488 def prepare_filename(self, info_dict):
489 """Generate the output filename."""
490 try:
491 template_dict = dict(info_dict)
492
493 template_dict['epoch'] = int(time.time())
494 autonumber_size = self.params.get('autonumber_size')
495 if autonumber_size is None:
496 autonumber_size = 5
497 autonumber_templ = '%0' + str(autonumber_size) + 'd'
498 template_dict['autonumber'] = autonumber_templ % self._num_downloads
499 if template_dict.get('playlist_index') is not None:
500 template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index'])
501 if template_dict.get('resolution') is None:
502 if template_dict.get('width') and template_dict.get('height'):
503 template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
504 elif template_dict.get('height'):
505 template_dict['resolution'] = '%sp' % template_dict['height']
506 elif template_dict.get('width'):
507 template_dict['resolution'] = '?x%d' % template_dict['width']
508
509 sanitize = lambda k, v: sanitize_filename(
510 compat_str(v),
511 restricted=self.params.get('restrictfilenames'),
512 is_id=(k == 'id'))
513 template_dict = dict((k, sanitize(k, v))
514 for k, v in template_dict.items()
515 if v is not None)
516 template_dict = collections.defaultdict(lambda: 'NA', template_dict)
517
518 outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
519 tmpl = compat_expanduser(outtmpl)
520 filename = tmpl % template_dict
521 return filename
522 except ValueError as err:
523 self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
524 return None
525
526 def _match_entry(self, info_dict):
527 """ Returns None iff the file should be downloaded """
528
529 video_title = info_dict.get('title', info_dict.get('id', 'video'))
530 if 'title' in info_dict:
531 # This can happen when we're just evaluating the playlist
532 title = info_dict['title']
533 matchtitle = self.params.get('matchtitle', False)
534 if matchtitle:
535 if not re.search(matchtitle, title, re.IGNORECASE):
536 return '"' + title + '" title did not match pattern "' + matchtitle + '"'
537 rejecttitle = self.params.get('rejecttitle', False)
538 if rejecttitle:
539 if re.search(rejecttitle, title, re.IGNORECASE):
540 return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
541 date = info_dict.get('upload_date', None)
542 if date is not None:
543 dateRange = self.params.get('daterange', DateRange())
544 if date not in dateRange:
545 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
546 view_count = info_dict.get('view_count', None)
547 if view_count is not None:
548 min_views = self.params.get('min_views')
549 if min_views is not None and view_count < min_views:
550 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
551 max_views = self.params.get('max_views')
552 if max_views is not None and view_count > max_views:
553 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
554 if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
555 return 'Skipping "%s" because it is age restricted' % title
556 if self.in_download_archive(info_dict):
557 return '%s has already been recorded in archive' % video_title
558 return None
559
560 @staticmethod
561 def add_extra_info(info_dict, extra_info):
562 '''Set the keys from extra_info in info dict if they are missing'''
563 for key, value in extra_info.items():
564 info_dict.setdefault(key, value)
565
566 def extract_info(self, url, download=True, ie_key=None, extra_info={},
567 process=True):
568 '''
569 Returns a list with a dictionary for each video we find.
570 If 'download', also downloads the videos.
571 extra_info is a dict containing the extra values to add to each result
572 '''
573
574 if ie_key:
575 ies = [self.get_info_extractor(ie_key)]
576 else:
577 ies = self._ies
578
579 for ie in ies:
580 if not ie.suitable(url):
581 continue
582
583 if not ie.working():
584 self.report_warning('The program functionality for this site has been marked as broken, '
585 'and will probably not work.')
586
587 try:
588 ie_result = ie.extract(url)
589 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
590 break
591 if isinstance(ie_result, list):
592 # Backwards compatibility: old IE result format
593 ie_result = {
594 '_type': 'compat_list',
595 'entries': ie_result,
596 }
597 self.add_default_extra_info(ie_result, ie, url)
598 if process:
599 return self.process_ie_result(ie_result, download, extra_info)
600 else:
601 return ie_result
602 except ExtractorError as de: # An error we somewhat expected
603 self.report_error(compat_str(de), de.format_traceback())
604 break
605 except MaxDownloadsReached:
606 raise
607 except Exception as e:
608 if self.params.get('ignoreerrors', False):
609 self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
610 break
611 else:
612 raise
613 else:
614 self.report_error('no suitable InfoExtractor for URL %s' % url)
615
616 def add_default_extra_info(self, ie_result, ie, url):
617 self.add_extra_info(ie_result, {
618 'extractor': ie.IE_NAME,
619 'webpage_url': url,
620 'webpage_url_basename': url_basename(url),
621 'extractor_key': ie.ie_key(),
622 })
623
624 def process_ie_result(self, ie_result, download=True, extra_info={}):
625 """
626 Take the result of the ie(may be modified) and resolve all unresolved
627 references (URLs, playlist items).
628
629 It will also download the videos if 'download'.
630 Returns the resolved ie_result.
631 """
632
633 result_type = ie_result.get('_type', 'video')
634
635 if result_type in ('url', 'url_transparent'):
636 extract_flat = self.params.get('extract_flat', False)
637 if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
638 extract_flat is True):
639 if self.params.get('forcejson', False):
640 self.to_stdout(json.dumps(ie_result))
641 return ie_result
642
643 if result_type == 'video':
644 self.add_extra_info(ie_result, extra_info)
645 return self.process_video_result(ie_result, download=download)
646 elif result_type == 'url':
647 # We have to add extra_info to the results because it may be
648 # contained in a playlist
649 return self.extract_info(ie_result['url'],
650 download,
651 ie_key=ie_result.get('ie_key'),
652 extra_info=extra_info)
653 elif result_type == 'url_transparent':
654 # Use the information from the embedding page
655 info = self.extract_info(
656 ie_result['url'], ie_key=ie_result.get('ie_key'),
657 extra_info=extra_info, download=False, process=False)
658
659 force_properties = dict(
660 (k, v) for k, v in ie_result.items() if v is not None)
661 for f in ('_type', 'url'):
662 if f in force_properties:
663 del force_properties[f]
664 new_result = info.copy()
665 new_result.update(force_properties)
666
667 assert new_result.get('_type') != 'url_transparent'
668
669 return self.process_ie_result(
670 new_result, download=download, extra_info=extra_info)
671 elif result_type == 'playlist' or result_type == 'multi_video':
672 # We process each entry in the playlist
673 playlist = ie_result.get('title', None) or ie_result.get('id', None)
674 self.to_screen('[download] Downloading playlist: %s' % playlist)
675
676 playlist_results = []
677
678 playliststart = self.params.get('playliststart', 1) - 1
679 playlistend = self.params.get('playlistend', None)
680 # For backwards compatibility, interpret -1 as whole list
681 if playlistend == -1:
682 playlistend = None
683
684 ie_entries = ie_result['entries']
685 if isinstance(ie_entries, list):
686 n_all_entries = len(ie_entries)
687 entries = ie_entries[playliststart:playlistend]
688 n_entries = len(entries)
689 self.to_screen(
690 "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
691 (ie_result['extractor'], playlist, n_all_entries, n_entries))
692 elif isinstance(ie_entries, PagedList):
693 entries = ie_entries.getslice(
694 playliststart, playlistend)
695 n_entries = len(entries)
696 self.to_screen(
697 "[%s] playlist %s: Downloading %d videos" %
698 (ie_result['extractor'], playlist, n_entries))
699 else: # iterable
700 entries = list(itertools.islice(
701 ie_entries, playliststart, playlistend))
702 n_entries = len(entries)
703 self.to_screen(
704 "[%s] playlist %s: Downloading %d videos" %
705 (ie_result['extractor'], playlist, n_entries))
706
707 if self.params.get('playlistreverse', False):
708 entries = entries[::-1]
709
710 for i, entry in enumerate(entries, 1):
711 self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
712 extra = {
713 'n_entries': n_entries,
714 'playlist': playlist,
715 'playlist_id': ie_result.get('id'),
716 'playlist_title': ie_result.get('title'),
717 'playlist_index': i + playliststart,
718 'extractor': ie_result['extractor'],
719 'webpage_url': ie_result['webpage_url'],
720 'webpage_url_basename': url_basename(ie_result['webpage_url']),
721 'extractor_key': ie_result['extractor_key'],
722 }
723
724 reason = self._match_entry(entry)
725 if reason is not None:
726 self.to_screen('[download] ' + reason)
727 continue
728
729 entry_result = self.process_ie_result(entry,
730 download=download,
731 extra_info=extra)
732 playlist_results.append(entry_result)
733 ie_result['entries'] = playlist_results
734 return ie_result
735 elif result_type == 'compat_list':
736 self.report_warning(
737 'Extractor %s returned a compat_list result. '
738 'It needs to be updated.' % ie_result.get('extractor'))
739
740 def _fixup(r):
741 self.add_extra_info(
742 r,
743 {
744 'extractor': ie_result['extractor'],
745 'webpage_url': ie_result['webpage_url'],
746 'webpage_url_basename': url_basename(ie_result['webpage_url']),
747 'extractor_key': ie_result['extractor_key'],
748 }
749 )
750 return r
751 ie_result['entries'] = [
752 self.process_ie_result(_fixup(r), download, extra_info)
753 for r in ie_result['entries']
754 ]
755 return ie_result
756 else:
757 raise Exception('Invalid result type: %s' % result_type)
758
759 def select_format(self, format_spec, available_formats):
760 if format_spec == 'best' or format_spec is None:
761 return available_formats[-1]
762 elif format_spec == 'worst':
763 return available_formats[0]
764 elif format_spec == 'bestaudio':
765 audio_formats = [
766 f for f in available_formats
767 if f.get('vcodec') == 'none']
768 if audio_formats:
769 return audio_formats[-1]
770 elif format_spec == 'worstaudio':
771 audio_formats = [
772 f for f in available_formats
773 if f.get('vcodec') == 'none']
774 if audio_formats:
775 return audio_formats[0]
776 elif format_spec == 'bestvideo':
777 video_formats = [
778 f for f in available_formats
779 if f.get('acodec') == 'none']
780 if video_formats:
781 return video_formats[-1]
782 elif format_spec == 'worstvideo':
783 video_formats = [
784 f for f in available_formats
785 if f.get('acodec') == 'none']
786 if video_formats:
787 return video_formats[0]
788 else:
789 extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
790 if format_spec in extensions:
791 filter_f = lambda f: f['ext'] == format_spec
792 else:
793 filter_f = lambda f: f['format_id'] == format_spec
794 matches = list(filter(filter_f, available_formats))
795 if matches:
796 return matches[-1]
797 return None
798
799 def process_video_result(self, info_dict, download=True):
800 assert info_dict.get('_type', 'video') == 'video'
801
802 if 'id' not in info_dict:
803 raise ExtractorError('Missing "id" field in extractor result')
804 if 'title' not in info_dict:
805 raise ExtractorError('Missing "title" field in extractor result')
806
807 if 'playlist' not in info_dict:
808 # It isn't part of a playlist
809 info_dict['playlist'] = None
810 info_dict['playlist_index'] = None
811
812 thumbnails = info_dict.get('thumbnails')
813 if thumbnails:
814 thumbnails.sort(key=lambda t: (
815 t.get('width'), t.get('height'), t.get('url')))
816 for t in thumbnails:
817 if 'width' in t and 'height' in t:
818 t['resolution'] = '%dx%d' % (t['width'], t['height'])
819
820 if thumbnails and 'thumbnail' not in info_dict:
821 info_dict['thumbnail'] = thumbnails[-1]['url']
822
823 if 'display_id' not in info_dict and 'id' in info_dict:
824 info_dict['display_id'] = info_dict['id']
825
826 if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
827 # Working around negative timestamps in Windows
828 # (see http://bugs.python.org/issue1646728)
829 if info_dict['timestamp'] < 0 and os.name == 'nt':
830 info_dict['timestamp'] = 0
831 upload_date = datetime.datetime.utcfromtimestamp(
832 info_dict['timestamp'])
833 info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
834
835 # This extractors handle format selection themselves
836 if info_dict['extractor'] in ['Youku']:
837 if download:
838 self.process_info(info_dict)
839 return info_dict
840
841 # We now pick which formats have to be downloaded
842 if info_dict.get('formats') is None:
843 # There's only one format available
844 formats = [info_dict]
845 else:
846 formats = info_dict['formats']
847
848 if not formats:
849 raise ExtractorError('No video formats found!')
850
851 # We check that all the formats have the format and format_id fields
852 for i, format in enumerate(formats):
853 if 'url' not in format:
854 raise ExtractorError('Missing "url" key in result (index %d)' % i)
855
856 if format.get('format_id') is None:
857 format['format_id'] = compat_str(i)
858 if format.get('format') is None:
859 format['format'] = '{id} - {res}{note}'.format(
860 id=format['format_id'],
861 res=self.format_resolution(format),
862 note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
863 )
864 # Automatically determine file extension if missing
865 if 'ext' not in format:
866 format['ext'] = determine_ext(format['url']).lower()
867
868 format_limit = self.params.get('format_limit', None)
869 if format_limit:
870 formats = list(takewhile_inclusive(
871 lambda f: f['format_id'] != format_limit, formats
872 ))
873
874 # TODO Central sorting goes here
875
876 if formats[0] is not info_dict:
877 # only set the 'formats' fields if the original info_dict list them
878 # otherwise we end up with a circular reference, the first (and unique)
879 # element in the 'formats' field in info_dict is info_dict itself,
880 # wich can't be exported to json
881 info_dict['formats'] = formats
882 if self.params.get('listformats', None):
883 self.list_formats(info_dict)
884 return
885
886 req_format = self.params.get('format')
887 if req_format is None:
888 req_format = 'best'
889 formats_to_download = []
890 # The -1 is for supporting YoutubeIE
891 if req_format in ('-1', 'all'):
892 formats_to_download = formats
893 else:
894 for rfstr in req_format.split(','):
895 # We can accept formats requested in the format: 34/5/best, we pick
896 # the first that is available, starting from left
897 req_formats = rfstr.split('/')
898 for rf in req_formats:
899 if re.match(r'.+?\+.+?', rf) is not None:
900 # Two formats have been requested like '137+139'
901 format_1, format_2 = rf.split('+')
902 formats_info = (self.select_format(format_1, formats),
903 self.select_format(format_2, formats))
904 if all(formats_info):
905 # The first format must contain the video and the
906 # second the audio
907 if formats_info[0].get('vcodec') == 'none':
908 self.report_error('The first format must '
909 'contain the video, try using '
910 '"-f %s+%s"' % (format_2, format_1))
911 return
912 selected_format = {
913 'requested_formats': formats_info,
914 'format': rf,
915 'ext': self.params['merge_output_format'] if self.params['merge_output_format'] is not None else formats_info[0]['ext'],
916 }
917 else:
918 selected_format = None
919 else:
920 selected_format = self.select_format(rf, formats)
921 if selected_format is not None:
922 formats_to_download.append(selected_format)
923 break
924 if not formats_to_download:
925 raise ExtractorError('requested format not available',
926 expected=True)
927
928 if download:
929 if len(formats_to_download) > 1:
930 self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
931 for format in formats_to_download:
932 new_info = dict(info_dict)
933 new_info.update(format)
934 self.process_info(new_info)
935 # We update the info dict with the best quality format (backwards compatibility)
936 info_dict.update(formats_to_download[-1])
937 return info_dict
938
    def process_info(self, info_dict):
        """Process a single resolved IE result.

        Handles forced printings, side files (description, annotations,
        subtitles, info JSON, thumbnail), the media download itself
        (including multi-format downloads to be merged), postprocessing,
        and recording the entry in the download archive.
        """

        assert info_dict.get('_type', 'video') == 'video'

        # Honor --max-downloads before doing any work for this entry
        max_downloads = self.params.get('max_downloads')
        if max_downloads is not None:
            if self._num_downloads >= int(max_downloads):
                raise MaxDownloadsReached()

        info_dict['fulltitle'] = info_dict['title']
        # Overlong titles are truncated (with ellipsis) to keep filenames sane
        if len(info_dict['title']) > 200:
            info_dict['title'] = info_dict['title'][:197] + '...'

        # Keep for backwards compatibility
        info_dict['stitle'] = info_dict['title']

        if 'format' not in info_dict:
            info_dict['format'] = info_dict['ext']

        # Apply the user's filters (--match-title, --date, etc.)
        reason = self._match_entry(info_dict)
        if reason is not None:
            self.to_screen('[download] ' + reason)
            return

        self._num_downloads += 1

        filename = self.prepare_filename(info_dict)

        # Forced printings
        if self.params.get('forcetitle', False):
            self.to_stdout(info_dict['fulltitle'])
        if self.params.get('forceid', False):
            self.to_stdout(info_dict['id'])
        if self.params.get('forceurl', False):
            if info_dict.get('requested_formats') is not None:
                for f in info_dict['requested_formats']:
                    self.to_stdout(f['url'] + f.get('play_path', ''))
            else:
                # For RTMP URLs, also include the playpath
                self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
        if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
            self.to_stdout(info_dict['thumbnail'])
        if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
            self.to_stdout(info_dict['description'])
        if self.params.get('forcefilename', False) and filename is not None:
            self.to_stdout(filename)
        if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
            self.to_stdout(formatSeconds(info_dict['duration']))
        if self.params.get('forceformat', False):
            self.to_stdout(info_dict['format'])
        if self.params.get('forcejson', False):
            info_dict['_filename'] = filename
            self.to_stdout(json.dumps(info_dict))
        if self.params.get('dump_single_json', False):
            # Only annotate; the JSON itself is printed by download()
            info_dict['_filename'] = filename

        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            return

        if filename is None:
            return

        try:
            dn = os.path.dirname(encodeFilename(filename))
            if dn and not os.path.exists(dn):
                os.makedirs(dn)
        except (OSError, IOError) as err:
            self.report_error('unable to create directory ' + compat_str(err))
            return

        if self.params.get('writedescription', False):
            descfn = filename + '.description'
            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
                self.to_screen('[info] Video description is already present')
            elif info_dict.get('description') is None:
                self.report_warning('There\'s no description to write.')
            else:
                try:
                    self.to_screen('[info] Writing video description to: ' + descfn)
                    with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
                        descfile.write(info_dict['description'])
                except (OSError, IOError):
                    self.report_error('Cannot write description file ' + descfn)
                    return

        if self.params.get('writeannotations', False):
            annofn = filename + '.annotations.xml'
            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
                self.to_screen('[info] Video annotations are already present')
            else:
                try:
                    self.to_screen('[info] Writing video annotations to: ' + annofn)
                    with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
                        annofile.write(info_dict['annotations'])
                except (KeyError, TypeError):
                    # 'annotations' missing or not a string
                    self.report_warning('There are no annotations to write.')
                except (OSError, IOError):
                    self.report_error('Cannot write annotations file: ' + annofn)
                    return

        subtitles_are_requested = any([self.params.get('writesubtitles', False),
                                       self.params.get('writeautomaticsub')])

        if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
            # subtitles download errors are already managed as troubles in relevant IE
            # that way it will silently go on when used with unsupporting IE
            subtitles = info_dict['subtitles']
            sub_format = self.params.get('subtitlesformat', 'srt')
            for sub_lang in subtitles.keys():
                sub = subtitles[sub_lang]
                if sub is None:
                    continue
                try:
                    sub_filename = subtitles_filename(filename, sub_lang, sub_format)
                    if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
                        self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
                    else:
                        self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
                        with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
                            subfile.write(sub)
                except (OSError, IOError):
                    self.report_error('Cannot write subtitles file ' + sub_filename)
                    return

        if self.params.get('writeinfojson', False):
            infofn = os.path.splitext(filename)[0] + '.info.json'
            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
                self.to_screen('[info] Video description metadata is already present')
            else:
                self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
                try:
                    write_json_file(info_dict, infofn)
                except (OSError, IOError):
                    self.report_error('Cannot write metadata to JSON file ' + infofn)
                    return

        if self.params.get('writethumbnail', False):
            if info_dict.get('thumbnail') is not None:
                thumb_format = determine_ext(info_dict['thumbnail'], 'jpg')
                thumb_filename = os.path.splitext(filename)[0] + '.' + thumb_format
                if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
                    self.to_screen('[%s] %s: Thumbnail is already present' %
                                   (info_dict['extractor'], info_dict['id']))
                else:
                    self.to_screen('[%s] %s: Downloading thumbnail ...' %
                                   (info_dict['extractor'], info_dict['id']))
                    try:
                        uf = self.urlopen(info_dict['thumbnail'])
                        with open(thumb_filename, 'wb') as thumbf:
                            shutil.copyfileobj(uf, thumbf)
                        self.to_screen('[%s] %s: Writing thumbnail to: %s' %
                                       (info_dict['extractor'], info_dict['id'], thumb_filename))
                    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                        # A failed thumbnail is not fatal to the download
                        self.report_warning('Unable to download thumbnail "%s": %s' %
                                            (info_dict['thumbnail'], compat_str(err)))

        if not self.params.get('skip_download', False):
            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
                success = True
            else:
                try:
                    # Helper: pick the right FileDownloader for this info
                    # dict, attach progress hooks, and run it
                    def dl(name, info):
                        fd = get_suitable_downloader(info)(self, self.params)
                        for ph in self._progress_hooks:
                            fd.add_progress_hook(ph)
                        if self.params.get('verbose'):
                            self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
                        return fd.download(name, info)
                    if info_dict.get('requested_formats') is not None:
                        # Multiple formats (e.g. '137+139'): download each to
                        # a 'fN' intermediate file and merge afterwards
                        downloaded = []
                        success = True
                        merger = FFmpegMergerPP(self, not self.params.get('keepvideo'))
                        if not merger._executable:
                            postprocessors = []
                            self.report_warning('You have requested multiple '
                                                'formats but ffmpeg or avconv are not installed.'
                                                ' The formats won\'t be merged')
                        else:
                            postprocessors = [merger]
                        for f in info_dict['requested_formats']:
                            new_info = dict(info_dict)
                            new_info.update(f)
                            fname = self.prepare_filename(new_info)
                            fname = prepend_extension(fname, 'f%s' % f['format_id'])
                            downloaded.append(fname)
                            partial_success = dl(fname, new_info)
                            success = success and partial_success
                        info_dict['__postprocessors'] = postprocessors
                        info_dict['__files_to_merge'] = downloaded
                    else:
                        # Just a single file
                        success = dl(filename, info_dict)
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self.report_error('unable to download video data: %s' % str(err))
                    return
                except (OSError, IOError) as err:
                    raise UnavailableVideoError(err)
                except (ContentTooShortError, ) as err:
                    self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                    return

            if success:
                try:
                    self.post_process(filename, info_dict)
                except (PostProcessingError) as err:
                    self.report_error('postprocessing: %s' % str(err))
                    return
        self.record_download_archive(info_dict)
1149
1150 def download(self, url_list):
1151 """Download a given list of URLs."""
1152 outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1153 if (len(url_list) > 1 and
1154 '%' not in outtmpl
1155 and self.params.get('max_downloads') != 1):
1156 raise SameFileError(outtmpl)
1157
1158 for url in url_list:
1159 try:
1160 # It also downloads the videos
1161 res = self.extract_info(url)
1162 except UnavailableVideoError:
1163 self.report_error('unable to download video')
1164 except MaxDownloadsReached:
1165 self.to_screen('[info] Maximum number of downloaded files reached.')
1166 raise
1167 else:
1168 if self.params.get('dump_single_json', False):
1169 self.to_stdout(json.dumps(res))
1170
1171 return self._download_retcode
1172
1173 def download_with_info_file(self, info_filename):
1174 with io.open(info_filename, 'r', encoding='utf-8') as f:
1175 info = json.load(f)
1176 try:
1177 self.process_ie_result(info, download=True)
1178 except DownloadError:
1179 webpage_url = info.get('webpage_url')
1180 if webpage_url is not None:
1181 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1182 return self.download([webpage_url])
1183 else:
1184 raise
1185 return self._download_retcode
1186
1187 def post_process(self, filename, ie_info):
1188 """Run all the postprocessors on the given file."""
1189 info = dict(ie_info)
1190 info['filepath'] = filename
1191 keep_video = None
1192 pps_chain = []
1193 if ie_info.get('__postprocessors') is not None:
1194 pps_chain.extend(ie_info['__postprocessors'])
1195 pps_chain.extend(self._pps)
1196 for pp in pps_chain:
1197 try:
1198 keep_video_wish, new_info = pp.run(info)
1199 if keep_video_wish is not None:
1200 if keep_video_wish:
1201 keep_video = keep_video_wish
1202 elif keep_video is None:
1203 # No clear decision yet, let IE decide
1204 keep_video = keep_video_wish
1205 except PostProcessingError as e:
1206 self.report_error(e.msg)
1207 if keep_video is False and not self.params.get('keepvideo', False):
1208 try:
1209 self.to_screen('Deleting original file %s (pass -k to keep)' % filename)
1210 os.remove(encodeFilename(filename))
1211 except (IOError, OSError):
1212 self.report_warning('Unable to remove downloaded video file')
1213
1214 def _make_archive_id(self, info_dict):
1215 # Future-proof against any change in case
1216 # and backwards compatibility with prior versions
1217 extractor = info_dict.get('extractor_key')
1218 if extractor is None:
1219 if 'id' in info_dict:
1220 extractor = info_dict.get('ie_key') # key in a playlist
1221 if extractor is None:
1222 return None # Incomplete video information
1223 return extractor.lower() + ' ' + info_dict['id']
1224
1225 def in_download_archive(self, info_dict):
1226 fn = self.params.get('download_archive')
1227 if fn is None:
1228 return False
1229
1230 vid_id = self._make_archive_id(info_dict)
1231 if vid_id is None:
1232 return False # Incomplete video information
1233
1234 try:
1235 with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1236 for line in archive_file:
1237 if line.strip() == vid_id:
1238 return True
1239 except IOError as ioe:
1240 if ioe.errno != errno.ENOENT:
1241 raise
1242 return False
1243
1244 def record_download_archive(self, info_dict):
1245 fn = self.params.get('download_archive')
1246 if fn is None:
1247 return
1248 vid_id = self._make_archive_id(info_dict)
1249 assert vid_id
1250 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1251 archive_file.write(vid_id + '\n')
1252
1253 @staticmethod
1254 def format_resolution(format, default='unknown'):
1255 if format.get('vcodec') == 'none':
1256 return 'audio only'
1257 if format.get('resolution') is not None:
1258 return format['resolution']
1259 if format.get('height') is not None:
1260 if format.get('width') is not None:
1261 res = '%sx%s' % (format['width'], format['height'])
1262 else:
1263 res = '%sp' % format['height']
1264 elif format.get('width') is not None:
1265 res = '?x%d' % format['width']
1266 else:
1267 res = default
1268 return res
1269
1270 def _format_note(self, fdict):
1271 res = ''
1272 if fdict.get('ext') in ['f4f', 'f4m']:
1273 res += '(unsupported) '
1274 if fdict.get('format_note') is not None:
1275 res += fdict['format_note'] + ' '
1276 if fdict.get('tbr') is not None:
1277 res += '%4dk ' % fdict['tbr']
1278 if fdict.get('container') is not None:
1279 if res:
1280 res += ', '
1281 res += '%s container' % fdict['container']
1282 if (fdict.get('vcodec') is not None and
1283 fdict.get('vcodec') != 'none'):
1284 if res:
1285 res += ', '
1286 res += fdict['vcodec']
1287 if fdict.get('vbr') is not None:
1288 res += '@'
1289 elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
1290 res += 'video@'
1291 if fdict.get('vbr') is not None:
1292 res += '%4dk' % fdict['vbr']
1293 if fdict.get('fps') is not None:
1294 res += ', %sfps' % fdict['fps']
1295 if fdict.get('acodec') is not None:
1296 if res:
1297 res += ', '
1298 if fdict['acodec'] == 'none':
1299 res += 'video only'
1300 else:
1301 res += '%-5s' % fdict['acodec']
1302 elif fdict.get('abr') is not None:
1303 if res:
1304 res += ', '
1305 res += 'audio'
1306 if fdict.get('abr') is not None:
1307 res += '@%3dk' % fdict['abr']
1308 if fdict.get('asr') is not None:
1309 res += ' (%5dHz)' % fdict['asr']
1310 if fdict.get('filesize') is not None:
1311 if res:
1312 res += ', '
1313 res += format_bytes(fdict['filesize'])
1314 elif fdict.get('filesize_approx') is not None:
1315 if res:
1316 res += ', '
1317 res += '~' + format_bytes(fdict['filesize_approx'])
1318 return res
1319
1320 def list_formats(self, info_dict):
1321 def line(format, idlen=20):
1322 return (('%-' + compat_str(idlen + 1) + 's%-10s%-12s%s') % (
1323 format['format_id'],
1324 format['ext'],
1325 self.format_resolution(format),
1326 self._format_note(format),
1327 ))
1328
1329 formats = info_dict.get('formats', [info_dict])
1330 idlen = max(len('format code'),
1331 max(len(f['format_id']) for f in formats))
1332 formats_s = [
1333 line(f, idlen) for f in formats
1334 if f.get('preference') is None or f['preference'] >= -1000]
1335 if len(formats) > 1:
1336 formats_s[0] += (' ' if self._format_note(formats[0]) else '') + '(worst)'
1337 formats_s[-1] += (' ' if self._format_note(formats[-1]) else '') + '(best)'
1338
1339 header_line = line({
1340 'format_id': 'format code', 'ext': 'extension',
1341 'resolution': 'resolution', 'format_note': 'note'}, idlen=idlen)
1342 self.to_screen('[info] Available formats for %s:\n%s\n%s' %
1343 (info_dict['id'], header_line, '\n'.join(formats_s)))
1344
1345 def urlopen(self, req):
1346 """ Start an HTTP download """
1347
1348 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1349 # always respected by websites, some tend to give out URLs with non percent-encoded
1350 # non-ASCII characters (see telemb.py, ard.py [#3412])
1351 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1352 # To work around aforementioned issue we will replace request's original URL with
1353 # percent-encoded one
1354 req_is_string = isinstance(req, basestring if sys.version_info < (3, 0) else compat_str)
1355 url = req if req_is_string else req.get_full_url()
1356 url_escaped = escape_url(url)
1357
1358 # Substitute URL if any change after escaping
1359 if url != url_escaped:
1360 if req_is_string:
1361 req = url_escaped
1362 else:
1363 req = compat_urllib_request.Request(
1364 url_escaped, data=req.data, headers=req.headers,
1365 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
1366
1367 return self._opener.open(req, timeout=self._socket_timeout)
1368
1369 def print_debug_header(self):
1370 if not self.params.get('verbose'):
1371 return
1372
1373 if type('') is not compat_str:
1374 # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
1375 self.report_warning(
1376 'Your Python is broken! Update to a newer and supported version')
1377
1378 stdout_encoding = getattr(
1379 sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
1380 encoding_str = (
1381 '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
1382 locale.getpreferredencoding(),
1383 sys.getfilesystemencoding(),
1384 stdout_encoding,
1385 self.get_encoding()))
1386 write_string(encoding_str, encoding=None)
1387
1388 self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
1389 try:
1390 sp = subprocess.Popen(
1391 ['git', 'rev-parse', '--short', 'HEAD'],
1392 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
1393 cwd=os.path.dirname(os.path.abspath(__file__)))
1394 out, err = sp.communicate()
1395 out = out.decode().strip()
1396 if re.match('[0-9a-f]+', out):
1397 self._write_string('[debug] Git HEAD: ' + out + '\n')
1398 except:
1399 try:
1400 sys.exc_clear()
1401 except:
1402 pass
1403 self._write_string('[debug] Python version %s - %s\n' % (
1404 platform.python_version(), platform_name()))
1405
1406 exe_versions = FFmpegPostProcessor.get_versions()
1407 exe_versions['rtmpdump'] = rtmpdump_version()
1408 exe_str = ', '.join(
1409 '%s %s' % (exe, v)
1410 for exe, v in sorted(exe_versions.items())
1411 if v
1412 )
1413 if not exe_str:
1414 exe_str = 'none'
1415 self._write_string('[debug] exe versions: %s\n' % exe_str)
1416
1417 proxy_map = {}
1418 for handler in self._opener.handlers:
1419 if hasattr(handler, 'proxies'):
1420 proxy_map.update(handler.proxies)
1421 self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
1422
1423 def _setup_opener(self):
1424 timeout_val = self.params.get('socket_timeout')
1425 self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
1426
1427 opts_cookiefile = self.params.get('cookiefile')
1428 opts_proxy = self.params.get('proxy')
1429
1430 if opts_cookiefile is None:
1431 self.cookiejar = compat_cookiejar.CookieJar()
1432 else:
1433 self.cookiejar = compat_cookiejar.MozillaCookieJar(
1434 opts_cookiefile)
1435 if os.access(opts_cookiefile, os.R_OK):
1436 self.cookiejar.load()
1437
1438 cookie_processor = compat_urllib_request.HTTPCookieProcessor(
1439 self.cookiejar)
1440 if opts_proxy is not None:
1441 if opts_proxy == '':
1442 proxies = {}
1443 else:
1444 proxies = {'http': opts_proxy, 'https': opts_proxy}
1445 else:
1446 proxies = compat_urllib_request.getproxies()
1447 # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
1448 if 'http' in proxies and 'https' not in proxies:
1449 proxies['https'] = proxies['http']
1450 proxy_handler = compat_urllib_request.ProxyHandler(proxies)
1451
1452 debuglevel = 1 if self.params.get('debug_printtraffic') else 0
1453 https_handler = make_HTTPS_handler(
1454 self.params.get('nocheckcertificate', False), debuglevel=debuglevel)
1455 ydlh = YoutubeDLHandler(debuglevel=debuglevel)
1456 opener = compat_urllib_request.build_opener(
1457 https_handler, proxy_handler, cookie_processor, ydlh)
1458 # Delete the default user-agent header, which would otherwise apply in
1459 # cases where our custom HTTP handler doesn't come into play
1460 # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
1461 opener.addheaders = []
1462 self._opener = opener
1463
1464 def encode(self, s):
1465 if isinstance(s, bytes):
1466 return s # Already encoded
1467
1468 try:
1469 return s.encode(self.get_encoding())
1470 except UnicodeEncodeError as err:
1471 err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
1472 raise
1473
1474 def get_encoding(self):
1475 encoding = self.params.get('encoding')
1476 if encoding is None:
1477 encoding = preferredencoding()
1478 return encoding