yt_dlp/YoutubeDL.py

   1 #!/usr/bin/env python3
   2 # coding: utf-8
   3
   4 from __future__ import absolute_import, unicode_literals
   5
   6 import collections
   7 import contextlib
   8 import copy
   9 import datetime
  10 import errno
  11 import fileinput
  12 import io
  13 import itertools
  14 import json
  15 import locale
  16 import operator
  17 import os
  18 import platform
  19 import re
  20 import shutil
  21 import subprocess
  22 import sys
  23 import tempfile
  24 import time
  25 import tokenize
  26 import traceback
  27 import random
  28
  29 from string import ascii_letters
  30 from zipimport import zipimporter
  31
  32 from .compat import (
  33     compat_basestring,
  34     compat_get_terminal_size,
  35     compat_kwargs,
  36     compat_numeric_types,
  37     compat_os_name,
  38     compat_shlex_quote,
  39     compat_str,
  40     compat_tokenize_tokenize,
  41     compat_urllib_error,
  42     compat_urllib_request,
  43     compat_urllib_request_DataHandler,
  44 )
  45 from .cookies import load_cookies
  46 from .utils import (
  47     age_restricted,
  48     args_to_str,
  49     ContentTooShortError,
  50     date_from_str,
  51     DateRange,
  52     DEFAULT_OUTTMPL,
  53     determine_ext,
  54     determine_protocol,
  55     DOT_DESKTOP_LINK_TEMPLATE,
  56     DOT_URL_LINK_TEMPLATE,
  57     DOT_WEBLOC_LINK_TEMPLATE,
  58     DownloadError,
  59     encode_compat_str,
  60     encodeFilename,
  61     EntryNotInPlaylist,
  62     error_to_compat_str,
  63     ExistingVideoReached,
  64     expand_path,
  65     ExtractorError,
  66     float_or_none,
  67     format_bytes,
  68     format_field,
  69     STR_FORMAT_RE_TMPL,
  70     STR_FORMAT_TYPES,
  71     formatSeconds,
  72     GeoRestrictedError,
  73     HEADRequest,
  74     int_or_none,
  75     iri_to_uri,
  76     ISO3166Utils,
  77     LazyList,
  78     locked_file,
  79     make_dir,
  80     make_HTTPS_handler,
  81     MaxDownloadsReached,
  82     network_exceptions,
  83     orderedSet,
  84     OUTTMPL_TYPES,
  85     PagedList,
  86     parse_filesize,
  87     PerRequestProxyHandler,
  88     platform_name,
  89     PostProcessingError,
  90     preferredencoding,
  91     prepend_extension,
  92     process_communicate_or_kill,
  93     register_socks_protocols,
  94     RejectedVideoReached,
  95     render_table,
  96     replace_extension,
  97     SameFileError,
  98     sanitize_filename,
  99     sanitize_path,
 100     sanitize_url,
 101     sanitized_Request,
 102     std_headers,
 103     str_or_none,
 104     strftime_or_none,
 105     subtitles_filename,
 106     ThrottledDownload,
 107     to_high_limit_path,
 108     traverse_obj,
 109     try_get,
 110     UnavailableVideoError,
 111     url_basename,
 112     variadic,
 113     version_tuple,
 114     write_json_file,
 115     write_string,
 116     YoutubeDLCookieProcessor,
 117     YoutubeDLHandler,
 118     YoutubeDLRedirectHandler,
 119 )
 120 from .cache import Cache
 121 from .extractor import (
 122     gen_extractor_classes,
 123     get_info_extractor,
 124     _LAZY_LOADER,
 125     _PLUGIN_CLASSES
 126 )
 127 from .extractor.openload import PhantomJSwrapper
 128 from .downloader import (
 129     FFmpegFD,
 130     get_suitable_downloader,
 131     shorten_protocol_name
 132 )
 133 from .downloader.rtmp import rtmpdump_version
 134 from .postprocessor import (
 135     get_postprocessor,
 136     FFmpegFixupDurationPP,
 137     FFmpegFixupM3u8PP,
 138     FFmpegFixupM4aPP,
 139     FFmpegFixupStretchedPP,
 140     FFmpegFixupTimestampPP,
 141     FFmpegMergerPP,
 142     FFmpegPostProcessor,
 143     MoveFilesAfterDownloadPP,
 144 )
 145 from .version import __version__
 146
 147 if compat_os_name == 'nt':
 148     import ctypes
 149
 150
 151 class YoutubeDL(object):
 152     """YoutubeDL class.
 153
 154     YoutubeDL objects are the ones responsible of downloading the
 155     actual video file and writing it to disk if the user has requested
 156     it, among some other tasks. In most cases there should be one per
 157     program. As, given a video URL, the downloader doesn't know how to
 158     extract all the needed information, task that InfoExtractors do, it
 159     has to pass the URL to one of them.
 160
 161     For this, YoutubeDL objects have a method that allows
 162     InfoExtractors to be registered in a given order. When it is passed
 163     a URL, the YoutubeDL object handles it to the first InfoExtractor it
 164     finds that reports being able to handle it. The InfoExtractor extracts
 165     all the information about the video or videos the URL refers to, and
 166     YoutubeDL process the extracted information, possibly using a File
 167     Downloader to download the video.
 168
 169     YoutubeDL objects accept a lot of parameters. In order not to saturate
 170     the object constructor with arguments, it receives a dictionary of
 171     options instead. These options are available through the params
 172     attribute for the InfoExtractors to use. The YoutubeDL also
 173     registers itself as the downloader in charge for the InfoExtractors
 174     that are added to it, so this is a "mutual registration".
 175
 176     Available options:
 177
 178     username:          Username for authentication purposes.
 179     password:          Password for authentication purposes.
 180     videopassword:     Password for accessing a video.
 181     ap_mso:            Adobe Pass multiple-system operator identifier.
 182     ap_username:       Multiple-system operator account username.
 183     ap_password:       Multiple-system operator account password.
 184     usenetrc:          Use netrc for authentication instead.
 185     verbose:           Print additional info to stdout.
 186     quiet:             Do not print messages to stdout.
 187     no_warnings:       Do not print out anything for warnings.
 188     forceprint:        A list of templates to force print
 189     forceurl:          Force printing final URL. (Deprecated)
 190     forcetitle:        Force printing title. (Deprecated)
 191     forceid:           Force printing ID. (Deprecated)
 192     forcethumbnail:    Force printing thumbnail URL. (Deprecated)
 193     forcedescription:  Force printing description. (Deprecated)
 194     forcefilename:     Force printing final filename. (Deprecated)
 195     forceduration:     Force printing duration. (Deprecated)
 196     forcejson:         Force printing info_dict as JSON.
 197     dump_single_json:  Force printing the info_dict of the whole playlist
 198                        (or video) as a single JSON line.
 199     force_write_download_archive: Force writing download archive regardless
 200                        of 'skip_download' or 'simulate'.
 201     simulate:          Do not download the video files.
 202     format:            Video format code. see "FORMAT SELECTION" for more details.
 203     allow_unplayable_formats:   Allow unplayable formats to be extracted and downloaded.
 204     ignore_no_formats_error: Ignore "No video formats" error. Usefull for
 205                        extracting metadata even if the video is not actually
 206                        available for download (experimental)
 207     format_sort:       How to sort the video formats. see "Sorting Formats"
 208                        for more details.
 209     format_sort_force: Force the given format_sort. see "Sorting Formats"
 210                        for more details.
 211     allow_multiple_video_streams:   Allow multiple video streams to be merged
 212                        into a single file
 213     allow_multiple_audio_streams:   Allow multiple audio streams to be merged
 214                        into a single file
 215     check_formats      Whether to test if the formats are downloadable.
 216                        Can be True (check all), False (check none)
 217                        or None (check only if requested by extractor)
 218     paths:             Dictionary of output paths. The allowed keys are 'home'
 219                        'temp' and the keys of OUTTMPL_TYPES (in utils.py)
 220     outtmpl:           Dictionary of templates for output names. Allowed keys
 221                        are 'default' and the keys of OUTTMPL_TYPES (in utils.py).
 222                        A string a also accepted for backward compatibility
 223     outtmpl_na_placeholder: Placeholder for unavailable meta fields.
 224     restrictfilenames: Do not allow "&" and spaces in file names
 225     trim_file_name:    Limit length of filename (extension excluded)
 226     windowsfilenames:  Force the filenames to be windows compatible
 227     ignoreerrors:      Do not stop on download errors
 228                        (Default True when running yt-dlp,
 229                        but False when directly accessing YoutubeDL class)
 230     skip_playlist_after_errors: Number of allowed failures until the rest of
 231                        the playlist is skipped
 232     force_generic_extractor: Force downloader to use the generic extractor
 233     overwrites:        Overwrite all video and metadata files if True,
 234                        overwrite only non-video files if None
 235                        and don't overwrite any file if False
 236     playliststart:     Playlist item to start at.
 237     playlistend:       Playlist item to end at.
 238     playlist_items:    Specific indices of playlist to download.
 239     playlistreverse:   Download playlist items in reverse order.
 240     playlistrandom:    Download playlist items in random order.
 241     matchtitle:        Download only matching titles.
 242     rejecttitle:       Reject downloads for matching titles.
 243     logger:            Log messages to a logging.Logger instance.
 244     logtostderr:       Log messages to stderr instead of stdout.
 245     writedescription:  Write the video description to a .description file
 246     writeinfojson:     Write the video description to a .info.json file
 247     clean_infojson:    Remove private fields from the infojson
 248     writecomments:     Extract video comments. This will not be written to disk
 249                        unless writeinfojson is also given
 250     writeannotations:  Write the video annotations to a .annotations.xml file
 251     writethumbnail:    Write the thumbnail image to a file
 252     allow_playlist_files: Whether to write playlists' description, infojson etc
 253                        also to disk when using the 'write*' options
 254     write_all_thumbnails:  Write all thumbnail formats to files
 255     writelink:         Write an internet shortcut file, depending on the
 256                        current platform (.url/.webloc/.desktop)
 257     writeurllink:      Write a Windows internet shortcut file (.url)
 258     writewebloclink:   Write a macOS internet shortcut file (.webloc)
 259     writedesktoplink:  Write a Linux internet shortcut file (.desktop)
 260     writesubtitles:    Write the video subtitles to a file
 261     writeautomaticsub: Write the automatically generated subtitles to a file
 262     allsubtitles:      Deprecated - Use subtitleslangs = ['all']
 263                        Downloads all the subtitles of the video
 264                        (requires writesubtitles or writeautomaticsub)
 265     listsubtitles:     Lists all available subtitles for the video
 266     subtitlesformat:   The format code for subtitles
 267     subtitleslangs:    List of languages of the subtitles to download (can be regex).
 268                        The list may contain "all" to refer to all the available
 269                        subtitles. The language can be prefixed with a "-" to
 270                        exclude it from the requested languages. Eg: ['all', '-live_chat']
 271     keepvideo:         Keep the video file after post-processing
 272     daterange:         A DateRange object, download only if the upload_date is in the range.
 273     skip_download:     Skip the actual download of the video file
 274     cachedir:          Location of the cache files in the filesystem.
 275                        False to disable filesystem cache.
 276     noplaylist:        Download single video instead of a playlist if in doubt.
 277     age_limit:         An integer representing the user's age in years.
 278                        Unsuitable videos for the given age are skipped.
 279     min_views:         An integer representing the minimum view count the video
 280                        must have in order to not be skipped.
 281                        Videos without view count information are always
 282                        downloaded. None for no limit.
 283     max_views:         An integer representing the maximum view count.
 284                        Videos that are more popular than that are not
 285                        downloaded.
 286                        Videos without view count information are always
 287                        downloaded. None for no limit.
 288     download_archive:  File name of a file where all downloads are recorded.
 289                        Videos already present in the file are not downloaded
 290                        again.
 291     break_on_existing: Stop the download process after attempting to download a
 292                        file that is in the archive.
 293     break_on_reject:   Stop the download process when encountering a video that
 294                        has been filtered out.
 295     cookiefile:        File name where cookies should be read from and dumped to
 296     cookiesfrombrowser: A tuple containing the name of the browser and the profile
 297                        name/path from where cookies are loaded.
 298                        Eg: ('chrome', ) or (vivaldi, 'default')
 299     nocheckcertificate:Do not verify SSL certificates
 300     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
 301                        At the moment, this is only supported by YouTube.
 302     proxy:             URL of the proxy server to use
 303     geo_verification_proxy:  URL of the proxy to use for IP address verification
 304                        on geo-restricted sites.
 305     socket_timeout:    Time to wait for unresponsive hosts, in seconds
 306     bidi_workaround:   Work around buggy terminals without bidirectional text
 307                        support, using fridibi
 308     debug_printtraffic:Print out sent and received HTTP traffic
 309     include_ads:       Download ads as well
 310     default_search:    Prepend this string if an input url is not valid.
 311                        'auto' for elaborate guessing
 312     encoding:          Use this encoding instead of the system-specified.
 313     extract_flat:      Do not resolve URLs, return the immediate result.
 314                        Pass in 'in_playlist' to only show this behavior for
 315                        playlist items.
 316     postprocessors:    A list of dictionaries, each with an entry
 317                        * key:  The name of the postprocessor. See
 318                                yt_dlp/postprocessor/__init__.py for a list.
 319                        * when: When to run the postprocessor. Can be one of
 320                                pre_process|before_dl|post_process|after_move.
 321                                Assumed to be 'post_process' if not given
 322     post_hooks:        A list of functions that get called as the final step
 323                        for each video file, after all postprocessors have been
 324                        called. The filename will be passed as the only argument.
 325     progress_hooks:    A list of functions that get called on download
 326                        progress, with a dictionary with the entries
 327                        * status: One of "downloading", "error", or "finished".
 328                                  Check this first and ignore unknown values.
 329                        * info_dict: The extracted info_dict
 330
 331                        If status is one of "downloading", or "finished", the
 332                        following properties may also be present:
 333                        * filename: The final filename (always present)
 334                        * tmpfilename: The filename we're currently writing to
 335                        * downloaded_bytes: Bytes on disk
 336                        * total_bytes: Size of the whole file, None if unknown
 337                        * total_bytes_estimate: Guess of the eventual file size,
 338                                                None if unavailable.
 339                        * elapsed: The number of seconds since download started.
 340                        * eta: The estimated time in seconds, None if unknown
 341                        * speed: The download speed in bytes/second, None if
 342                                 unknown
 343                        * fragment_index: The counter of the currently
 344                                          downloaded video fragment.
 345                        * fragment_count: The number of fragments (= individual
 346                                          files that will be merged)
 347
 348                        Progress hooks are guaranteed to be called at least once
 349                        (with status "finished") if the download is successful.
 350     merge_output_format: Extension to use when merging formats.
 351     final_ext:         Expected final extension; used to detect when the file was
 352                        already downloaded and converted. "merge_output_format" is
 353                        replaced by this extension when given
 354     fixup:             Automatically correct known faults of the file.
 355                        One of:
 356                        - "never": do nothing
 357                        - "warn": only emit a warning
 358                        - "detect_or_warn": check whether we can do anything
 359                                            about it, warn otherwise (default)
 360     source_address:    Client-side IP address to bind to.
 361     call_home:         Boolean, true iff we are allowed to contact the
 362                        yt-dlp servers for debugging. (BROKEN)
 363     sleep_interval_requests: Number of seconds to sleep between requests
 364                        during extraction
 365     sleep_interval:    Number of seconds to sleep before each download when
 366                        used alone or a lower bound of a range for randomized
 367                        sleep before each download (minimum possible number
 368                        of seconds to sleep) when used along with
 369                        max_sleep_interval.
 370     max_sleep_interval:Upper bound of a range for randomized sleep before each
 371                        download (maximum possible number of seconds to sleep).
 372                        Must only be used along with sleep_interval.
 373                        Actual sleep time will be a random float from range
 374                        [sleep_interval; max_sleep_interval].
 375     sleep_interval_subtitles: Number of seconds to sleep before each subtitle download
 376     listformats:       Print an overview of available video formats and exit.
 377     list_thumbnails:   Print a table of all thumbnails and exit.
 378     match_filter:      A function that gets called with the info_dict of
 379                        every video.
 380                        If it returns a message, the video is ignored.
 381                        If it returns None, the video is downloaded.
 382                        match_filter_func in utils.py is one example for this.
 383     no_color:          Do not emit color codes in output.
 384     geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
 385                        HTTP header
 386     geo_bypass_country:
 387                        Two-letter ISO 3166-2 country code that will be used for
 388                        explicit geographic restriction bypassing via faking
 389                        X-Forwarded-For HTTP header
 390     geo_bypass_ip_block:
 391                        IP range in CIDR notation that will be used similarly to
 392                        geo_bypass_country
 393
 394     The following options determine which downloader is picked:
 395     external_downloader: A dictionary of protocol keys and the executable of the
 396                        external downloader to use for it. The allowed protocols
 397                        are default|http|ftp|m3u8|dash|rtsp|rtmp|mms.
 398                        Set the value to 'native' to use the native downloader
 399     hls_prefer_native: Deprecated - Use external_downloader = {'m3u8': 'native'}
 400                        or {'m3u8': 'ffmpeg'} instead.
 401                        Use the native HLS downloader instead of ffmpeg/avconv
 402                        if True, otherwise use ffmpeg/avconv if False, otherwise
 403                        use downloader suggested by extractor if None.
 404     compat_opts:       Compatibility options. See "Differences in default behavior".
 405                        The following options do not work when used through the API:
 406                        filename, abort-on-error, multistreams, no-live-chat,
 407                        no-clean-infojson, no-playlist-metafiles.
 408                        Refer __init__.py for their implementation
 409
 410     The following parameters are not used by YoutubeDL itself, they are used by
 411     the downloader (see yt_dlp/downloader/common.py):
 412     nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize,
 413     max_filesize, test, noresizebuffer, retries, continuedl, noprogress, consoletitle,
 414     xattr_set_filesize, external_downloader_args, hls_use_mpegts, http_chunk_size.
 415
 416     The following options are used by the post processors:
 417     prefer_ffmpeg:     If False, use avconv instead of ffmpeg if both are available,
 418                        otherwise prefer ffmpeg. (avconv support is deprecated)
 419     ffmpeg_location:   Location of the ffmpeg/avconv binary; either the path
 420                        to the binary or its containing directory.
 421     postprocessor_args: A dictionary of postprocessor/executable keys (in lower case)
 422                         and a list of additional command-line arguments for the
 423                         postprocessor/executable. The dict can also have "PP+EXE" keys
 424                         which are used when the given exe is used by the given PP.
 425                         Use 'default' as the name for arguments to passed to all PP
 426
 427     The following options are used by the extractors:
 428     extractor_retries: Number of times to retry for known errors
 429     dynamic_mpd:       Whether to process dynamic DASH manifests (default: True)
 430     hls_split_discontinuity: Split HLS playlists to different formats at
 431                        discontinuities such as ad breaks (default: False)
 432     extractor_args:    A dictionary of arguments to be passed to the extractors.
 433                        See "EXTRACTOR ARGUMENTS" for details.
 434                        Eg: {'youtube': {'skip': ['dash', 'hls']}}
 435     youtube_include_dash_manifest: Deprecated - Use extractor_args instead.
 436                        If True (default), DASH manifests and related
 437                        data will be downloaded and processed by extractor.
 438                        You can reduce network I/O by disabling it if you don't
 439                        care about DASH. (only for youtube)
 440     youtube_include_hls_manifest: Deprecated - Use extractor_args instead.
 441                        If True (default), HLS manifests and related
 442                        data will be downloaded and processed by extractor.
 443                        You can reduce network I/O by disabling it if you don't
 444                        care about HLS. (only for youtube)
 445     """
 446
 447     _NUMERIC_FIELDS = set((
 448         'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
 449         'timestamp', 'upload_year', 'upload_month', 'upload_day',
 450         'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
 451         'average_rating', 'comment_count', 'age_limit',
 452         'start_time', 'end_time',
 453         'chapter_number', 'season_number', 'episode_number',
 454         'track_number', 'disc_number', 'release_year',
 455         'playlist_index',
 456     ))
 457
 458     params = None
 459     _ies = []
 460     _pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
 461     _printed_messages = set()
 462     _first_webpage_request = True
 463     _download_retcode = None
 464     _num_downloads = None
 465     _playlist_level = 0
 466     _playlist_urls = set()
 467     _screen_file = None
 468
 469     def __init__(self, params=None, auto_init=True):
 470         """Create a FileDownloader object with the given options."""
 471         if params is None:
 472             params = {}
 473         self._ies = []
 474         self._ies_instances = {}
 475         self._pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
 476         self._printed_messages = set()
 477         self._first_webpage_request = True
 478         self._post_hooks = []
 479         self._progress_hooks = []
 480         self._download_retcode = 0
 481         self._num_downloads = 0
 482         self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 483         self._err_file = sys.stderr
 484         self.params = {
 485             # Default parameters
 486             'nocheckcertificate': False,
 487         }
 488         self.params.update(params)
 489         self.cache = Cache(self)
 490
 491         if sys.version_info < (3, 6):
 492             self.report_warning(
 493                 'Python version %d.%d is not supported! Please update to Python 3.6 or above' % sys.version_info[:2])
 494
 495         def check_deprecated(param, option, suggestion):
 496             if self.params.get(param) is not None:
 497                 self.report_warning('%s is deprecated. Use %s instead' % (option, suggestion))
 498                 return True
 499             return False
 500
 501         if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
 502             if self.params.get('geo_verification_proxy') is None:
 503                 self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']
 504
 505         check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
 506         check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
 507         check_deprecated('useid', '--id', '-o "%(id)s.%(ext)s"')
 508
 509         for msg in self.params.get('warnings', []):
 510             self.report_warning(msg)
 511
 512         if self.params.get('final_ext'):
 513             if self.params.get('merge_output_format'):
 514                 self.report_warning('--merge-output-format will be ignored since --remux-video or --recode-video is given')
 515             self.params['merge_output_format'] = self.params['final_ext']
 516
 517         if 'overwrites' in self.params and self.params['overwrites'] is None:
 518             del self.params['overwrites']
 519
 520         if params.get('bidi_workaround', False):
 521             try:
 522                 import pty
 523                 master, slave = pty.openpty()
 524                 width = compat_get_terminal_size().columns
 525                 if width is None:
 526                     width_args = []
 527                 else:
 528                     width_args = ['-w', str(width)]
 529                 sp_kwargs = dict(
 530                     stdin=subprocess.PIPE,
 531                     stdout=slave,
 532                     stderr=self._err_file)
 533                 try:
 534                     self._output_process = subprocess.Popen(
 535                         ['bidiv'] + width_args, **sp_kwargs
 536                     )
 537                 except OSError:
 538                     self._output_process = subprocess.Popen(
 539                         ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
 540                 self._output_channel = os.fdopen(master, 'rb')
 541             except OSError as ose:
 542                 if ose.errno == errno.ENOENT:
 543                     self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
 544                 else:
 545                     raise
 546
 547         if (sys.platform != 'win32'
 548                 and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
 549                 and not params.get('restrictfilenames', False)):
 550             # Unicode filesystem API will throw errors (#1474, #13027)
 551             self.report_warning(
 552                 'Assuming --restrict-filenames since file system encoding '
 553                 'cannot encode all characters. '
 554                 'Set the LC_ALL environment variable to fix this.')
 555             self.params['restrictfilenames'] = True
 556
 557         self.outtmpl_dict = self.parse_outtmpl()
 558
 559         # Creating format selector here allows us to catch syntax errors before the extraction
 560         self.format_selector = (
 561             None if self.params.get('format') is None
 562             else self.build_format_selector(self.params['format']))
 563
 564         self._setup_opener()
 565
 566         """Preload the archive, if any is specified"""
 567         def preload_download_archive(fn):
 568             if fn is None:
 569                 return False
 570             self.write_debug('Loading archive file %r\n' % fn)
 571             try:
 572                 with locked_file(fn, 'r', encoding='utf-8') as archive_file:
 573                     for line in archive_file:
 574                         self.archive.add(line.strip())
 575             except IOError as ioe:
 576                 if ioe.errno != errno.ENOENT:
 577                     raise
 578                 return False
 579             return True
 580
 581         self.archive = set()
 582         preload_download_archive(self.params.get('download_archive'))
 583
 584         if auto_init:
 585             self.print_debug_header()
 586             self.add_default_info_extractors()
 587
 588         for pp_def_raw in self.params.get('postprocessors', []):
 589             pp_def = dict(pp_def_raw)
 590             when = pp_def.pop('when', 'post_process')
 591             pp_class = get_postprocessor(pp_def.pop('key'))
 592             pp = pp_class(self, **compat_kwargs(pp_def))
 593             self.add_post_processor(pp, when=when)
 594
 595         for ph in self.params.get('post_hooks', []):
 596             self.add_post_hook(ph)
 597
 598         for ph in self.params.get('progress_hooks', []):
 599             self.add_progress_hook(ph)
 600
 601         register_socks_protocols()
 602
 603     def warn_if_short_id(self, argv):
 604         # short YouTube ID starting with dash?
 605         idxs = [
 606             i for i, a in enumerate(argv)
 607             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
 608         if idxs:
 609             correct_argv = (
 610                 ['yt-dlp']
 611                 + [a for i, a in enumerate(argv) if i not in idxs]
 612                 + ['--'] + [argv[i] for i in idxs]
 613             )
 614             self.report_warning(
 615                 'Long argument string detected. '
 616                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
 617                 args_to_str(correct_argv))
 618
 619     def add_info_extractor(self, ie):
 620         """Add an InfoExtractor object to the end of the list."""
 621         self._ies.append(ie)
 622         if not isinstance(ie, type):
 623             self._ies_instances[ie.ie_key()] = ie
 624             ie.set_downloader(self)
 625
 626     def get_info_extractor(self, ie_key):
 627         """
 628         Get an instance of an IE with name ie_key, it will try to get one from
 629         the _ies list, if there's no instance it will create a new one and add
 630         it to the extractor list.
 631         """
 632         ie = self._ies_instances.get(ie_key)
 633         if ie is None:
 634             ie = get_info_extractor(ie_key)()
 635             self.add_info_extractor(ie)
 636         return ie
 637
 638     def add_default_info_extractors(self):
 639         """
 640         Add the InfoExtractors returned by gen_extractors to the end of the list
 641         """
 642         for ie in gen_extractor_classes():
 643             self.add_info_extractor(ie)
 644
 645     def add_post_processor(self, pp, when='post_process'):
 646         """Add a PostProcessor object to the end of the chain."""
 647         self._pps[when].append(pp)
 648         pp.set_downloader(self)
 649
 650     def add_post_hook(self, ph):
 651         """Add the post hook"""
 652         self._post_hooks.append(ph)
 653
 654     def add_progress_hook(self, ph):
 655         """Add the progress hook (currently only for the file downloader)"""
 656         self._progress_hooks.append(ph)
 657
 658     def _bidi_workaround(self, message):
 659         if not hasattr(self, '_output_channel'):
 660             return message
 661
 662         assert hasattr(self, '_output_process')
 663         assert isinstance(message, compat_str)
 664         line_count = message.count('\n') + 1
 665         self._output_process.stdin.write((message + '\n').encode('utf-8'))
 666         self._output_process.stdin.flush()
 667         res = ''.join(self._output_channel.readline().decode('utf-8')
 668                       for _ in range(line_count))
 669         return res[:-len('\n')]
 670
 671     def _write_string(self, message, out=None, only_once=False):
 672         if only_once:
 673             if message in self._printed_messages:
 674                 return
 675             self._printed_messages.add(message)
 676         write_string(message, out=out, encoding=self.params.get('encoding'))
 677
 678     def to_stdout(self, message, skip_eol=False, quiet=False):
 679         """Print message to stdout"""
 680         if self.params.get('logger'):
 681             self.params['logger'].debug(message)
 682         elif not quiet or self.params.get('verbose'):
 683             self._write_string(
 684                 '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')),
 685                 self._err_file if quiet else self._screen_file)
 686
 687     def to_stderr(self, message, only_once=False):
 688         """Print message to stderr"""
 689         assert isinstance(message, compat_str)
 690         if self.params.get('logger'):
 691             self.params['logger'].error(message)
 692         else:
 693             self._write_string('%s\n' % self._bidi_workaround(message), self._err_file, only_once=only_once)
 694
 695     def to_console_title(self, message):
 696         if not self.params.get('consoletitle', False):
 697             return
 698         if compat_os_name == 'nt':
 699             if ctypes.windll.kernel32.GetConsoleWindow():
 700                 # c_wchar_p() might not be necessary if `message` is
 701                 # already of type unicode()
 702                 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 703         elif 'TERM' in os.environ:
 704             self._write_string('\033]0;%s\007' % message, self._screen_file)
 705
 706     def save_console_title(self):
 707         if not self.params.get('consoletitle', False):
 708             return
 709         if self.params.get('simulate', False):
 710             return
 711         if compat_os_name != 'nt' and 'TERM' in os.environ:
 712             # Save the title on stack
 713             self._write_string('\033[22;0t', self._screen_file)
 714
 715     def restore_console_title(self):
 716         if not self.params.get('consoletitle', False):
 717             return
 718         if self.params.get('simulate', False):
 719             return
 720         if compat_os_name != 'nt' and 'TERM' in os.environ:
 721             # Restore the title from stack
 722             self._write_string('\033[23;0t', self._screen_file)
 723
 724     def __enter__(self):
 725         self.save_console_title()
 726         return self
 727
 728     def __exit__(self, *args):
 729         self.restore_console_title()
 730
 731         if self.params.get('cookiefile') is not None:
 732             self.cookiejar.save(ignore_discard=True, ignore_expires=True)
 733
 734     def trouble(self, message=None, tb=None):
 735         """Determine action to take when a download problem appears.
 736
 737         Depending on if the downloader has been configured to ignore
 738         download errors or not, this method may throw an exception or
 739         not when errors are found, after printing the message.
 740
 741         tb, if given, is additional traceback information.
 742         """
 743         if message is not None:
 744             self.to_stderr(message)
 745         if self.params.get('verbose'):
 746             if tb is None:
 747                 if sys.exc_info()[0]:  # if .trouble has been called from an except block
 748                     tb = ''
 749                     if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 750                         tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
 751                     tb += encode_compat_str(traceback.format_exc())
 752                 else:
 753                     tb_data = traceback.format_list(traceback.extract_stack())
 754                     tb = ''.join(tb_data)
 755             if tb:
 756                 self.to_stderr(tb)
 757         if not self.params.get('ignoreerrors', False):
 758             if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 759                 exc_info = sys.exc_info()[1].exc_info
 760             else:
 761                 exc_info = sys.exc_info()
 762             raise DownloadError(message, exc_info)
 763         self._download_retcode = 1
 764
 765     def to_screen(self, message, skip_eol=False):
 766         """Print message to stdout if not in quiet mode"""
 767         self.to_stdout(
 768             message, skip_eol, quiet=self.params.get('quiet', False))
 769
 770     def report_warning(self, message, only_once=False):
 771         '''
 772         Print the message to stderr, it will be prefixed with 'WARNING:'
 773         If stderr is a tty file the 'WARNING:' will be colored
 774         '''
 775         if self.params.get('logger') is not None:
 776             self.params['logger'].warning(message)
 777         else:
 778             if self.params.get('no_warnings'):
 779                 return
 780             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
 781                 _msg_header = '\033[0;33mWARNING:\033[0m'
 782             else:
 783                 _msg_header = 'WARNING:'
 784             warning_message = '%s %s' % (_msg_header, message)
 785             self.to_stderr(warning_message, only_once)
 786
 787     def report_error(self, message, tb=None):
 788         '''
 789         Do the same as trouble, but prefixes the message with 'ERROR:', colored
 790         in red if stderr is a tty file.
 791         '''
 792         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
 793             _msg_header = '\033[0;31mERROR:\033[0m'
 794         else:
 795             _msg_header = 'ERROR:'
 796         error_message = '%s %s' % (_msg_header, message)
 797         self.trouble(error_message, tb)
 798
 799     def write_debug(self, message, only_once=False):
 800         '''Log debug message or Print message to stderr'''
 801         if not self.params.get('verbose', False):
 802             return
 803         message = '[debug] %s' % message
 804         if self.params.get('logger'):
 805             self.params['logger'].debug(message)
 806         else:
 807             self.to_stderr(message, only_once)
 808
 809     def report_file_already_downloaded(self, file_name):
 810         """Report file has already been fully downloaded."""
 811         try:
 812             self.to_screen('[download] %s has already been downloaded' % file_name)
 813         except UnicodeEncodeError:
 814             self.to_screen('[download] The file has already been downloaded')
 815
 816     def report_file_delete(self, file_name):
 817         """Report that existing file will be deleted."""
 818         try:
 819             self.to_screen('Deleting existing file %s' % file_name)
 820         except UnicodeEncodeError:
 821             self.to_screen('Deleting existing file')
 822
 823     def parse_outtmpl(self):
 824         outtmpl_dict = self.params.get('outtmpl', {})
 825         if not isinstance(outtmpl_dict, dict):
 826             outtmpl_dict = {'default': outtmpl_dict}
 827         outtmpl_dict.update({
 828             k: v for k, v in DEFAULT_OUTTMPL.items()
 829             if not outtmpl_dict.get(k)})
 830         for key, val in outtmpl_dict.items():
 831             if isinstance(val, bytes):
 832                 self.report_warning(
 833                     'Parameter outtmpl is bytes, but should be a unicode string. '
 834                     'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')
 835         return outtmpl_dict
 836
 837     def get_output_path(self, dir_type='', filename=None):
 838         paths = self.params.get('paths', {})
 839         assert isinstance(paths, dict)
 840         path = os.path.join(
 841             expand_path(paths.get('home', '').strip()),
 842             expand_path(paths.get(dir_type, '').strip()) if dir_type else '',
 843             filename or '')
 844
 845         # Temporary fix for #4787
 846         # 'Treat' all problem characters by passing filename through preferredencoding
 847         # to workaround encoding issues with subprocess on python2 @ Windows
 848         if sys.version_info < (3, 0) and sys.platform == 'win32':
 849             path = encodeFilename(path, True).decode(preferredencoding())
 850         return sanitize_path(path, force=self.params.get('windowsfilenames'))
 851
 852     @staticmethod
 853     def _outtmpl_expandpath(outtmpl):
 854         # expand_path translates '%%' into '%' and '$$' into '$'
 855         # correspondingly that is not what we want since we need to keep
 856         # '%%' intact for template dict substitution step. Working around
 857         # with boundary-alike separator hack.
 858         sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
 859         outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
 860
 861         # outtmpl should be expand_path'ed before template dict substitution
 862         # because meta fields may contain env variables we don't want to
 863         # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
 864         # title "Hello $PATH", we don't want `$PATH` to be expanded.
 865         return expand_path(outtmpl).replace(sep, '')
 866
 867     @staticmethod
 868     def escape_outtmpl(outtmpl):
 869         ''' Escape any remaining strings like %s, %abc% etc. '''
 870         return re.sub(
 871             STR_FORMAT_RE_TMPL.format('', '(?![%(\0])'),
 872             lambda mobj: ('' if mobj.group('has_key') else '%') + mobj.group(0),
 873             outtmpl)
 874
 875     @classmethod
 876     def validate_outtmpl(cls, outtmpl):
 877         ''' @return None or Exception object '''
 878         outtmpl = re.sub(
 879             STR_FORMAT_RE_TMPL.format('[^)]*', '[ljq]'),
 880             lambda mobj: f'{mobj.group(0)[:-1]}s',
 881             cls._outtmpl_expandpath(outtmpl))
 882         try:
 883             cls.escape_outtmpl(outtmpl) % collections.defaultdict(int)
 884             return None
 885         except ValueError as err:
 886             return err
 887
 888     def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None):
 889         """ Make the template and info_dict suitable for substitution : ydl.outtmpl_escape(outtmpl) % info_dict """
 890         info_dict = dict(info_dict)
 891         na = self.params.get('outtmpl_na_placeholder', 'NA')
 892
 893         info_dict['duration_string'] = (  # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
 894             formatSeconds(info_dict['duration'], '-' if sanitize else ':')
 895             if info_dict.get('duration', None) is not None
 896             else None)
 897         info_dict['epoch'] = int(time.time())
 898         info_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
 899         if info_dict.get('resolution') is None:
 900             info_dict['resolution'] = self.format_resolution(info_dict, default=None)
 901
 902         # For fields playlist_index and autonumber convert all occurrences
 903         # of %(field)s to %(field)0Nd for backward compatibility
 904         field_size_compat_map = {
 905             'playlist_index': len(str(info_dict.get('_last_playlist_index') or '')),
 906             'autonumber': self.params.get('autonumber_size') or 5,
 907         }
 908
 909         TMPL_DICT = {}
 910         EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljq]'))
 911         MATH_FUNCTIONS = {
 912             '+': float.__add__,
 913             '-': float.__sub__,
 914         }
 915         # Field is of the form key1.key2...
 916         # where keys (except first) can be string, int or slice
 917         FIELD_RE = r'\w+(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)')
 918         MATH_FIELD_RE = r'''{field}|{num}'''.format(field=FIELD_RE, num=r'-?\d+(?:.\d+)?')
 919         MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys()))
 920         INTERNAL_FORMAT_RE = re.compile(r'''(?x)
 921             (?P<negate>-)?
 922             (?P<fields>{field})
 923             (?P<maths>(?:{math_op}{math_field})*)
 924             (?:>(?P<strf_format>.+?))?
 925             (?:\|(?P<default>.*?))?
 926             $'''.format(field=FIELD_RE, math_op=MATH_OPERATORS_RE, math_field=MATH_FIELD_RE))
 927
 928         get_key = lambda k: traverse_obj(
 929             info_dict, k.split('.'), is_user_input=True, traverse_string=True)
 930
 931         def get_value(mdict):
 932             # Object traversal
 933             value = get_key(mdict['fields'])
 934             # Negative
 935             if mdict['negate']:
 936                 value = float_or_none(value)
 937                 if value is not None:
 938                     value *= -1
 939             # Do maths
 940             offset_key = mdict['maths']
 941             if offset_key:
 942                 value = float_or_none(value)
 943                 operator = None
 944                 while offset_key:
 945                     item = re.match(
 946                         MATH_FIELD_RE if operator else MATH_OPERATORS_RE,
 947                         offset_key).group(0)
 948                     offset_key = offset_key[len(item):]
 949                     if operator is None:
 950                         operator = MATH_FUNCTIONS[item]
 951                         continue
 952                     item, multiplier = (item[1:], -1) if item[0] == '-' else (item, 1)
 953                     offset = float_or_none(item)
 954                     if offset is None:
 955                         offset = float_or_none(get_key(item))
 956                     try:
 957                         value = operator(value, multiplier * offset)
 958                     except (TypeError, ZeroDivisionError):
 959                         return None
 960                     operator = None
 961             # Datetime formatting
 962             if mdict['strf_format']:
 963                 value = strftime_or_none(value, mdict['strf_format'])
 964
 965             return value
 966
 967         def create_key(outer_mobj):
 968             if not outer_mobj.group('has_key'):
 969                 return f'%{outer_mobj.group(0)}'
 970
 971             prefix = outer_mobj.group('prefix')
 972             key = outer_mobj.group('key')
 973             original_fmt = fmt = outer_mobj.group('format')
 974             mobj = re.match(INTERNAL_FORMAT_RE, key)
 975             if mobj is None:
 976                 value, default, mobj = None, na, {'fields': ''}
 977             else:
 978                 mobj = mobj.groupdict()
 979                 default = mobj['default'] if mobj['default'] is not None else na
 980                 value = get_value(mobj)
 981
 982             if fmt == 's' and value is not None and key in field_size_compat_map.keys():
 983                 fmt = '0{:d}d'.format(field_size_compat_map[key])
 984
 985             value = default if value is None else value
 986
 987             str_fmt = f'{fmt[:-1]}s'
 988             if fmt[-1] == 'l':
 989                 value, fmt = ', '.join(variadic(value)), str_fmt
 990             elif fmt[-1] == 'j':
 991                 value, fmt = json.dumps(value), str_fmt
 992             elif fmt[-1] == 'q':
 993                 value, fmt = compat_shlex_quote(str(value)), str_fmt
 994             elif fmt[-1] == 'c':
 995                 value = str(value)
 996                 if value is None:
 997                     value, fmt = default, 's'
 998                 else:
 999                     value = value[0]
1000             elif fmt[-1] not in 'rs':  # numeric
1001                 value = float_or_none(value)
1002                 if value is None:
1003                     value, fmt = default, 's'
1004
1005             if sanitize:
1006                 if fmt[-1] == 'r':
1007                     # If value is an object, sanitize might convert it to a string
1008                     # So we convert it to repr first
1009                     value, fmt = repr(value), str_fmt
1010                 if fmt[-1] in 'csr':
1011                     value = sanitize(mobj['fields'].split('.')[-1], value)
1012
1013             key = '%s\0%s' % (key.replace('%', '%\0'), original_fmt)
1014             TMPL_DICT[key] = value
1015             return f'{prefix}%({key}){fmt}'
1016
1017         return EXTERNAL_FORMAT_RE.sub(create_key, outtmpl), TMPL_DICT
1018
1019     def _prepare_filename(self, info_dict, tmpl_type='default'):
1020         try:
1021             sanitize = lambda k, v: sanitize_filename(
1022                 compat_str(v),
1023                 restricted=self.params.get('restrictfilenames'),
1024                 is_id=(k == 'id' or k.endswith('_id')))
1025             outtmpl = self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default'])
1026             outtmpl, template_dict = self.prepare_outtmpl(outtmpl, info_dict, sanitize)
1027             outtmpl = self.escape_outtmpl(self._outtmpl_expandpath(outtmpl))
1028             filename = outtmpl % template_dict
1029
1030             force_ext = OUTTMPL_TYPES.get(tmpl_type)
1031             if force_ext is not None:
1032                 filename = replace_extension(filename, force_ext, info_dict.get('ext'))
1033
1034             # https://github.com/blackjack4494/youtube-dlc/issues/85
1035             trim_file_name = self.params.get('trim_file_name', False)
1036             if trim_file_name:
1037                 fn_groups = filename.rsplit('.')
1038                 ext = fn_groups[-1]
1039                 sub_ext = ''
1040                 if len(fn_groups) > 2:
1041                     sub_ext = fn_groups[-2]
1042                 filename = '.'.join(filter(None, [fn_groups[0][:trim_file_name], sub_ext, ext]))
1043
1044             return filename
1045         except ValueError as err:
1046             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
1047             return None
1048
1049     def prepare_filename(self, info_dict, dir_type='', warn=False):
1050         """Generate the output filename."""
1051
1052         filename = self._prepare_filename(info_dict, dir_type or 'default')
1053
1054         if warn:
1055             if not self.params.get('paths'):
1056                 pass
1057             elif filename == '-':
1058                 self.report_warning('--paths is ignored when an outputting to stdout', only_once=True)
1059             elif os.path.isabs(filename):
1060                 self.report_warning('--paths is ignored since an absolute path is given in output template', only_once=True)
1061             self.__prepare_filename_warned = True
1062         if filename == '-' or not filename:
1063             return filename
1064
1065         return self.get_output_path(dir_type, filename)
1066
1067     def _match_entry(self, info_dict, incomplete=False, silent=False):
1068         """ Returns None if the file should be downloaded """
1069
1070         video_title = info_dict.get('title', info_dict.get('id', 'video'))
1071
1072         def check_filter():
1073             if 'title' in info_dict:
1074                 # This can happen when we're just evaluating the playlist
1075                 title = info_dict['title']
1076                 matchtitle = self.params.get('matchtitle', False)
1077                 if matchtitle:
1078                     if not re.search(matchtitle, title, re.IGNORECASE):
1079                         return '"' + title + '" title did not match pattern "' + matchtitle + '"'
1080                 rejecttitle = self.params.get('rejecttitle', False)
1081                 if rejecttitle:
1082                     if re.search(rejecttitle, title, re.IGNORECASE):
1083                         return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
1084             date = info_dict.get('upload_date')
1085             if date is not None:
1086                 dateRange = self.params.get('daterange', DateRange())
1087                 if date not in dateRange:
1088                     return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
1089             view_count = info_dict.get('view_count')
1090             if view_count is not None:
1091                 min_views = self.params.get('min_views')
1092                 if min_views is not None and view_count < min_views:
1093                     return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
1094                 max_views = self.params.get('max_views')
1095                 if max_views is not None and view_count > max_views:
1096                     return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
1097             if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
1098                 return 'Skipping "%s" because it is age restricted' % video_title
1099
1100             if not incomplete:
1101                 match_filter = self.params.get('match_filter')
1102                 if match_filter is not None:
1103                     ret = match_filter(info_dict)
1104                     if ret is not None:
1105                         return ret
1106             return None
1107
1108         if self.in_download_archive(info_dict):
1109             reason = '%s has already been recorded in the archive' % video_title
1110             break_opt, break_err = 'break_on_existing', ExistingVideoReached
1111         else:
1112             reason = check_filter()
1113             break_opt, break_err = 'break_on_reject', RejectedVideoReached
1114         if reason is not None:
1115             if not silent:
1116                 self.to_screen('[download] ' + reason)
1117             if self.params.get(break_opt, False):
1118                 raise break_err()
1119         return reason
1120
1121     @staticmethod
1122     def add_extra_info(info_dict, extra_info):
1123         '''Set the keys from extra_info in info dict if they are missing'''
1124         for key, value in extra_info.items():
1125             info_dict.setdefault(key, value)
1126
1127     def extract_info(self, url, download=True, ie_key=None, extra_info={},
1128                      process=True, force_generic_extractor=False):
1129         """
1130         Return a list with a dictionary for each video extracted.
1131
1132         Arguments:
1133         url -- URL to extract
1134
1135         Keyword arguments:
1136         download -- whether to download videos during extraction
1137         ie_key -- extractor key hint
1138         extra_info -- dictionary containing the extra values to add to each result
1139         process -- whether to resolve all unresolved references (URLs, playlist items),
1140             must be True for download to work.
1141         force_generic_extractor -- force using the generic extractor
1142         """
1143
1144         if not ie_key and force_generic_extractor:
1145             ie_key = 'Generic'
1146
1147         if ie_key:
1148             ies = [self.get_info_extractor(ie_key)]
1149         else:
1150             ies = self._ies
1151
1152         for ie in ies:
1153             if not ie.suitable(url):
1154                 continue
1155
1156             ie_key = ie.ie_key()
1157             ie = self.get_info_extractor(ie_key)
1158             if not ie.working():
1159                 self.report_warning('The program functionality for this site has been marked as broken, '
1160                                     'and will probably not work.')
1161
1162             try:
1163                 temp_id = str_or_none(
1164                     ie.extract_id(url) if callable(getattr(ie, 'extract_id', None))
1165                     else ie._match_id(url))
1166             except (AssertionError, IndexError, AttributeError):
1167                 temp_id = None
1168             if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
1169                 self.to_screen("[%s] %s: has already been recorded in archive" % (
1170                                ie_key, temp_id))
1171                 break
1172             return self.__extract_info(url, ie, download, extra_info, process)
1173         else:
1174             self.report_error('no suitable InfoExtractor for URL %s' % url)
1175
1176     def __handle_extraction_exceptions(func, handle_all_errors=True):
1177         def wrapper(self, *args, **kwargs):
1178             try:
1179                 return func(self, *args, **kwargs)
1180             except GeoRestrictedError as e:
1181                 msg = e.msg
1182                 if e.countries:
1183                     msg += '\nThis video is available in %s.' % ', '.join(
1184                         map(ISO3166Utils.short2full, e.countries))
1185                 msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
1186                 self.report_error(msg)
1187             except ExtractorError as e:  # An error we somewhat expected
1188                 self.report_error(compat_str(e), e.format_traceback())
1189             except ThrottledDownload:
1190                 self.to_stderr('\r')
1191                 self.report_warning('The download speed is below throttle limit. Re-extracting data')
1192                 return wrapper(self, *args, **kwargs)
1193             except (MaxDownloadsReached, ExistingVideoReached, RejectedVideoReached):
1194                 raise
1195             except Exception as e:
1196                 if handle_all_errors and self.params.get('ignoreerrors', False):
1197                     self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
1198                 else:
1199                     raise
1200         return wrapper
1201
1202     @__handle_extraction_exceptions
1203     def __extract_info(self, url, ie, download, extra_info, process):
1204         ie_result = ie.extract(url)
1205         if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
1206             return
1207         if isinstance(ie_result, list):
1208             # Backwards compatibility: old IE result format
1209             ie_result = {
1210                 '_type': 'compat_list',
1211                 'entries': ie_result,
1212             }
1213         if extra_info.get('original_url'):
1214             ie_result.setdefault('original_url', extra_info['original_url'])
1215         self.add_default_extra_info(ie_result, ie, url)
1216         if process:
1217             return self.process_ie_result(ie_result, download, extra_info)
1218         else:
1219             return ie_result
1220
1221     def add_default_extra_info(self, ie_result, ie, url):
1222         if url is not None:
1223             self.add_extra_info(ie_result, {
1224                 'webpage_url': url,
1225                 'original_url': url,
1226                 'webpage_url_basename': url_basename(url),
1227             })
1228         if ie is not None:
1229             self.add_extra_info(ie_result, {
1230                 'extractor': ie.IE_NAME,
1231                 'extractor_key': ie.ie_key(),
1232             })
1233
1234     def process_ie_result(self, ie_result, download=True, extra_info={}):
1235         """
1236         Take the result of the ie(may be modified) and resolve all unresolved
1237         references (URLs, playlist items).
1238
1239         It will also download the videos if 'download'.
1240         Returns the resolved ie_result.
1241         """
1242         result_type = ie_result.get('_type', 'video')
1243
1244         if result_type in ('url', 'url_transparent'):
1245             ie_result['url'] = sanitize_url(ie_result['url'])
1246             if ie_result.get('original_url'):
1247                 extra_info.setdefault('original_url', ie_result['original_url'])
1248
1249             extract_flat = self.params.get('extract_flat', False)
1250             if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
1251                     or extract_flat is True):
1252                 info_copy = ie_result.copy()
1253                 self.add_extra_info(info_copy, extra_info)
1254                 ie = try_get(ie_result.get('ie_key'), self.get_info_extractor)
1255                 self.add_default_extra_info(info_copy, ie, ie_result['url'])
1256                 self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True)
1257                 return ie_result
1258
1259         if result_type == 'video':
1260             self.add_extra_info(ie_result, extra_info)
1261             ie_result = self.process_video_result(ie_result, download=download)
1262             additional_urls = (ie_result or {}).get('additional_urls')
1263             if additional_urls:
1264                 # TODO: Improve MetadataFromFieldPP to allow setting a list
1265                 if isinstance(additional_urls, compat_str):
1266                     additional_urls = [additional_urls]
1267                 self.to_screen(
1268                     '[info] %s: %d additional URL(s) requested' % (ie_result['id'], len(additional_urls)))
1269                 self.write_debug('Additional URLs: "%s"' % '", "'.join(additional_urls))
1270                 ie_result['additional_entries'] = [
1271                     self.extract_info(
1272                         url, download, extra_info,
1273                         force_generic_extractor=self.params.get('force_generic_extractor'))
1274                     for url in additional_urls
1275                 ]
1276             return ie_result
1277         elif result_type == 'url':
1278             # We have to add extra_info to the results because it may be
1279             # contained in a playlist
1280             return self.extract_info(
1281                 ie_result['url'], download,
1282                 ie_key=ie_result.get('ie_key'),
1283                 extra_info=extra_info)
1284         elif result_type == 'url_transparent':
1285             # Use the information from the embedding page
1286             info = self.extract_info(
1287                 ie_result['url'], ie_key=ie_result.get('ie_key'),
1288                 extra_info=extra_info, download=False, process=False)
1289
1290             # extract_info may return None when ignoreerrors is enabled and
1291             # extraction failed with an error, don't crash and return early
1292             # in this case
1293             if not info:
1294                 return info
1295
1296             force_properties = dict(
1297                 (k, v) for k, v in ie_result.items() if v is not None)
1298             for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
1299                 if f in force_properties:
1300                     del force_properties[f]
1301             new_result = info.copy()
1302             new_result.update(force_properties)
1303
1304             # Extracted info may not be a video result (i.e.
1305             # info.get('_type', 'video') != video) but rather an url or
1306             # url_transparent. In such cases outer metadata (from ie_result)
1307             # should be propagated to inner one (info). For this to happen
1308             # _type of info should be overridden with url_transparent. This
1309             # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
1310             if new_result.get('_type') == 'url':
1311                 new_result['_type'] = 'url_transparent'
1312
1313             return self.process_ie_result(
1314                 new_result, download=download, extra_info=extra_info)
1315         elif result_type in ('playlist', 'multi_video'):
1316             # Protect from infinite recursion due to recursively nested playlists
1317             # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
1318             webpage_url = ie_result['webpage_url']
1319             if webpage_url in self._playlist_urls:
1320                 self.to_screen(
1321                     '[download] Skipping already downloaded playlist: %s'
1322                     % ie_result.get('title') or ie_result.get('id'))
1323                 return
1324
1325             self._playlist_level += 1
1326             self._playlist_urls.add(webpage_url)
1327             self._sanitize_thumbnails(ie_result)
1328             try:
1329                 return self.__process_playlist(ie_result, download)
1330             finally:
1331                 self._playlist_level -= 1
1332                 if not self._playlist_level:
1333                     self._playlist_urls.clear()
1334         elif result_type == 'compat_list':
1335             self.report_warning(
1336                 'Extractor %s returned a compat_list result. '
1337                 'It needs to be updated.' % ie_result.get('extractor'))
1338
1339             def _fixup(r):
1340                 self.add_extra_info(
1341                     r,
1342                     {
1343                         'extractor': ie_result['extractor'],
1344                         'webpage_url': ie_result['webpage_url'],
1345                         'webpage_url_basename': url_basename(ie_result['webpage_url']),
1346                         'extractor_key': ie_result['extractor_key'],
1347                     }
1348                 )
1349                 return r
1350             ie_result['entries'] = [
1351                 self.process_ie_result(_fixup(r), download, extra_info)
1352                 for r in ie_result['entries']
1353             ]
1354             return ie_result
1355         else:
1356             raise Exception('Invalid result type: %s' % result_type)
1357
1358     def _ensure_dir_exists(self, path):
1359         return make_dir(path, self.report_error)
1360
1361     def __process_playlist(self, ie_result, download):
1362         # We process each entry in the playlist
1363         playlist = ie_result.get('title') or ie_result.get('id')
1364         self.to_screen('[download] Downloading playlist: %s' % playlist)
1365
1366         if 'entries' not in ie_result:
1367             raise EntryNotInPlaylist()
1368         incomplete_entries = bool(ie_result.get('requested_entries'))
1369         if incomplete_entries:
1370             def fill_missing_entries(entries, indexes):
1371                 ret = [None] * max(*indexes)
1372                 for i, entry in zip(indexes, entries):
1373                     ret[i - 1] = entry
1374                 return ret
1375             ie_result['entries'] = fill_missing_entries(ie_result['entries'], ie_result['requested_entries'])
1376
1377         playlist_results = []
1378
1379         playliststart = self.params.get('playliststart', 1)
1380         playlistend = self.params.get('playlistend')
1381         # For backwards compatibility, interpret -1 as whole list
1382         if playlistend == -1:
1383             playlistend = None
1384
1385         playlistitems_str = self.params.get('playlist_items')
1386         playlistitems = None
1387         if playlistitems_str is not None:
1388             def iter_playlistitems(format):
1389                 for string_segment in format.split(','):
1390                     if '-' in string_segment:
1391                         start, end = string_segment.split('-')
1392                         for item in range(int(start), int(end) + 1):
1393                             yield int(item)
1394                     else:
1395                         yield int(string_segment)
1396             playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
1397
1398         ie_entries = ie_result['entries']
1399         msg = (
1400             'Downloading %d videos' if not isinstance(ie_entries, list)
1401             else 'Collected %d videos; downloading %%d of them' % len(ie_entries))
1402         if not isinstance(ie_entries, (list, PagedList)):
1403             ie_entries = LazyList(ie_entries)
1404
1405         def get_entry(i):
1406             return YoutubeDL.__handle_extraction_exceptions(
1407                 lambda self, i: ie_entries[i - 1],
1408                 False
1409             )(self, i)
1410
1411         entries = []
1412         for i in playlistitems or itertools.count(playliststart):
1413             if playlistitems is None and playlistend is not None and playlistend < i:
1414                 break
1415             entry = None
1416             try:
1417                 entry = get_entry(i)
1418                 if entry is None:
1419                     raise EntryNotInPlaylist()
1420             except (IndexError, EntryNotInPlaylist):
1421                 if incomplete_entries:
1422                     raise EntryNotInPlaylist()
1423                 elif not playlistitems:
1424                     break
1425             entries.append(entry)
1426             try:
1427                 if entry is not None:
1428                     self._match_entry(entry, incomplete=True, silent=True)
1429             except (ExistingVideoReached, RejectedVideoReached):
1430                 break
1431         ie_result['entries'] = entries
1432
1433         # Save playlist_index before re-ordering
1434         entries = [
1435             ((playlistitems[i - 1] if playlistitems else i), entry)
1436             for i, entry in enumerate(entries, 1)
1437             if entry is not None]
1438         n_entries = len(entries)
1439
1440         if not playlistitems and (playliststart or playlistend):
1441             playlistitems = list(range(playliststart, playliststart + n_entries))
1442         ie_result['requested_entries'] = playlistitems
1443
1444         if self.params.get('allow_playlist_files', True):
1445             ie_copy = {
1446                 'playlist': playlist,
1447                 'playlist_id': ie_result.get('id'),
1448                 'playlist_title': ie_result.get('title'),
1449                 'playlist_uploader': ie_result.get('uploader'),
1450                 'playlist_uploader_id': ie_result.get('uploader_id'),
1451                 'playlist_index': 0,
1452             }
1453             ie_copy.update(dict(ie_result))
1454
1455             if self.params.get('writeinfojson', False):
1456                 infofn = self.prepare_filename(ie_copy, 'pl_infojson')
1457                 if not self._ensure_dir_exists(encodeFilename(infofn)):
1458                     return
1459                 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(infofn)):
1460                     self.to_screen('[info] Playlist metadata is already present')
1461                 else:
1462                     self.to_screen('[info] Writing playlist metadata as JSON to: ' + infofn)
1463                     try:
1464                         write_json_file(self.filter_requested_info(ie_result, self.params.get('clean_infojson', True)), infofn)
1465                     except (OSError, IOError):
1466                         self.report_error('Cannot write playlist metadata to JSON file ' + infofn)
1467
1468             # TODO: This should be passed to ThumbnailsConvertor if necessary
1469             self._write_thumbnails(ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail'))
1470
1471             if self.params.get('writedescription', False):
1472                 descfn = self.prepare_filename(ie_copy, 'pl_description')
1473                 if not self._ensure_dir_exists(encodeFilename(descfn)):
1474                     return
1475                 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(descfn)):
1476                     self.to_screen('[info] Playlist description is already present')
1477                 elif ie_result.get('description') is None:
1478                     self.report_warning('There\'s no playlist description to write.')
1479                 else:
1480                     try:
1481                         self.to_screen('[info] Writing playlist description to: ' + descfn)
1482                         with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1483                             descfile.write(ie_result['description'])
1484                     except (OSError, IOError):
1485                         self.report_error('Cannot write playlist description file ' + descfn)
1486                         return
1487
1488         if self.params.get('playlistreverse', False):
1489             entries = entries[::-1]
1490         if self.params.get('playlistrandom', False):
1491             random.shuffle(entries)
1492
1493         x_forwarded_for = ie_result.get('__x_forwarded_for_ip')
1494
1495         self.to_screen('[%s] playlist %s: %s' % (ie_result['extractor'], playlist, msg % n_entries))
1496         failures = 0
1497         max_failures = self.params.get('skip_playlist_after_errors') or float('inf')
1498         for i, entry_tuple in enumerate(entries, 1):
1499             playlist_index, entry = entry_tuple
1500             if 'playlist_index' in self.params.get('compat_options', []):
1501                 playlist_index = playlistitems[i - 1] if playlistitems else i
1502             self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
1503             # This __x_forwarded_for_ip thing is a bit ugly but requires
1504             # minimal changes
1505             if x_forwarded_for:
1506                 entry['__x_forwarded_for_ip'] = x_forwarded_for
1507             extra = {
1508                 'n_entries': n_entries,
1509                 '_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries),
1510                 'playlist_index': playlist_index,
1511                 'playlist_autonumber': i,
1512                 'playlist': playlist,
1513                 'playlist_id': ie_result.get('id'),
1514                 'playlist_title': ie_result.get('title'),
1515                 'playlist_uploader': ie_result.get('uploader'),
1516                 'playlist_uploader_id': ie_result.get('uploader_id'),
1517                 'extractor': ie_result['extractor'],
1518                 'webpage_url': ie_result['webpage_url'],
1519                 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1520                 'extractor_key': ie_result['extractor_key'],
1521             }
1522
1523             if self._match_entry(entry, incomplete=True) is not None:
1524                 continue
1525
1526             entry_result = self.__process_iterable_entry(entry, download, extra)
1527             if not entry_result:
1528                 failures += 1
1529             if failures >= max_failures:
1530                 self.report_error(
1531                     'Skipping the remaining entries in playlist "%s" since %d items failed extraction' % (playlist, failures))
1532                 break
1533             # TODO: skip failed (empty) entries?
1534             playlist_results.append(entry_result)
1535         ie_result['entries'] = playlist_results
1536         self.to_screen('[download] Finished downloading playlist: %s' % playlist)
1537         return ie_result
1538
1539     @__handle_extraction_exceptions
1540     def __process_iterable_entry(self, entry, download, extra_info):
1541         return self.process_ie_result(
1542             entry, download=download, extra_info=extra_info)
1543
1544     def _build_format_filter(self, filter_spec):
1545         " Returns a function to filter the formats according to the filter_spec "
1546
1547         OPERATORS = {
1548             '<': operator.lt,
1549             '<=': operator.le,
1550             '>': operator.gt,
1551             '>=': operator.ge,
1552             '=': operator.eq,
1553             '!=': operator.ne,
1554         }
1555         operator_rex = re.compile(r'''(?x)\s*
1556             (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)\s*
1557             (?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1558             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s*
1559             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1560         m = operator_rex.fullmatch(filter_spec)
1561         if m:
1562             try:
1563                 comparison_value = int(m.group('value'))
1564             except ValueError:
1565                 comparison_value = parse_filesize(m.group('value'))
1566                 if comparison_value is None:
1567                     comparison_value = parse_filesize(m.group('value') + 'B')
1568                 if comparison_value is None:
1569                     raise ValueError(
1570                         'Invalid value %r in format specification %r' % (
1571                             m.group('value'), filter_spec))
1572             op = OPERATORS[m.group('op')]
1573
1574         if not m:
1575             STR_OPERATORS = {
1576                 '=': operator.eq,
1577                 '^=': lambda attr, value: attr.startswith(value),
1578                 '$=': lambda attr, value: attr.endswith(value),
1579                 '*=': lambda attr, value: value in attr,
1580             }
1581             str_operator_rex = re.compile(r'''(?x)\s*
1582                 (?P<key>[a-zA-Z0-9._-]+)\s*
1583                 (?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1584                 (?P<value>[a-zA-Z0-9._-]+)\s*
1585                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1586             m = str_operator_rex.fullmatch(filter_spec)
1587             if m:
1588                 comparison_value = m.group('value')
1589                 str_op = STR_OPERATORS[m.group('op')]
1590                 if m.group('negation'):
1591                     op = lambda attr, value: not str_op(attr, value)
1592                 else:
1593                     op = str_op
1594
1595         if not m:
1596             raise SyntaxError('Invalid filter specification %r' % filter_spec)
1597
1598         def _filter(f):
1599             actual_value = f.get(m.group('key'))
1600             if actual_value is None:
1601                 return m.group('none_inclusive')
1602             return op(actual_value, comparison_value)
1603         return _filter
1604
1605     def _default_format_spec(self, info_dict, download=True):
1606
1607         def can_merge():
1608             merger = FFmpegMergerPP(self)
1609             return merger.available and merger.can_merge()
1610
1611         prefer_best = (
1612             not self.params.get('simulate', False)
1613             and download
1614             and (
1615                 not can_merge()
1616                 or info_dict.get('is_live', False)
1617                 or self.outtmpl_dict['default'] == '-'))
1618         compat = (
1619             prefer_best
1620             or self.params.get('allow_multiple_audio_streams', False)
1621             or 'format-spec' in self.params.get('compat_opts', []))
1622
1623         return (
1624             'best/bestvideo+bestaudio' if prefer_best
1625             else 'bestvideo*+bestaudio/best' if not compat
1626             else 'bestvideo+bestaudio/best')
1627
1628     def build_format_selector(self, format_spec):
1629         def syntax_error(note, start):
1630             message = (
1631                 'Invalid format specification: '
1632                 '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
1633             return SyntaxError(message)
1634
1635         PICKFIRST = 'PICKFIRST'
1636         MERGE = 'MERGE'
1637         SINGLE = 'SINGLE'
1638         GROUP = 'GROUP'
1639         FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
1640
1641         allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False),
1642                                   'video': self.params.get('allow_multiple_video_streams', False)}
1643
1644         check_formats = self.params.get('check_formats')
1645
1646         def _parse_filter(tokens):
1647             filter_parts = []
1648             for type, string, start, _, _ in tokens:
1649                 if type == tokenize.OP and string == ']':
1650                     return ''.join(filter_parts)
1651                 else:
1652                     filter_parts.append(string)
1653
1654         def _remove_unused_ops(tokens):
1655             # Remove operators that we don't use and join them with the surrounding strings
1656             # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1657             ALLOWED_OPS = ('/', '+', ',', '(', ')')
1658             last_string, last_start, last_end, last_line = None, None, None, None
1659             for type, string, start, end, line in tokens:
1660                 if type == tokenize.OP and string == '[':
1661                     if last_string:
1662                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1663                         last_string = None
1664                     yield type, string, start, end, line
1665                     # everything inside brackets will be handled by _parse_filter
1666                     for type, string, start, end, line in tokens:
1667                         yield type, string, start, end, line
1668                         if type == tokenize.OP and string == ']':
1669                             break
1670                 elif type == tokenize.OP and string in ALLOWED_OPS:
1671                     if last_string:
1672                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1673                         last_string = None
1674                     yield type, string, start, end, line
1675                 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1676                     if not last_string:
1677                         last_string = string
1678                         last_start = start
1679                         last_end = end
1680                     else:
1681                         last_string += string
1682             if last_string:
1683                 yield tokenize.NAME, last_string, last_start, last_end, last_line
1684
1685         def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1686             selectors = []
1687             current_selector = None
1688             for type, string, start, _, _ in tokens:
1689                 # ENCODING is only defined in python 3.x
1690                 if type == getattr(tokenize, 'ENCODING', None):
1691                     continue
1692                 elif type in [tokenize.NAME, tokenize.NUMBER]:
1693                     current_selector = FormatSelector(SINGLE, string, [])
1694                 elif type == tokenize.OP:
1695                     if string == ')':
1696                         if not inside_group:
1697                             # ')' will be handled by the parentheses group
1698                             tokens.restore_last_token()
1699                         break
1700                     elif inside_merge and string in ['/', ',']:
1701                         tokens.restore_last_token()
1702                         break
1703                     elif inside_choice and string == ',':
1704                         tokens.restore_last_token()
1705                         break
1706                     elif string == ',':
1707                         if not current_selector:
1708                             raise syntax_error('"," must follow a format selector', start)
1709                         selectors.append(current_selector)
1710                         current_selector = None
1711                     elif string == '/':
1712                         if not current_selector:
1713                             raise syntax_error('"/" must follow a format selector', start)
1714                         first_choice = current_selector
1715                         second_choice = _parse_format_selection(tokens, inside_choice=True)
1716                         current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1717                     elif string == '[':
1718                         if not current_selector:
1719                             current_selector = FormatSelector(SINGLE, 'best', [])
1720                         format_filter = _parse_filter(tokens)
1721                         current_selector.filters.append(format_filter)
1722                     elif string == '(':
1723                         if current_selector:
1724                             raise syntax_error('Unexpected "("', start)
1725                         group = _parse_format_selection(tokens, inside_group=True)
1726                         current_selector = FormatSelector(GROUP, group, [])
1727                     elif string == '+':
1728                         if not current_selector:
1729                             raise syntax_error('Unexpected "+"', start)
1730                         selector_1 = current_selector
1731                         selector_2 = _parse_format_selection(tokens, inside_merge=True)
1732                         if not selector_2:
1733                             raise syntax_error('Expected a selector', start)
1734                         current_selector = FormatSelector(MERGE, (selector_1, selector_2), [])
1735                     else:
1736                         raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
1737                 elif type == tokenize.ENDMARKER:
1738                     break
1739             if current_selector:
1740                 selectors.append(current_selector)
1741             return selectors
1742
1743         def _merge(formats_pair):
1744             format_1, format_2 = formats_pair
1745
1746             formats_info = []
1747             formats_info.extend(format_1.get('requested_formats', (format_1,)))
1748             formats_info.extend(format_2.get('requested_formats', (format_2,)))
1749
1750             if not allow_multiple_streams['video'] or not allow_multiple_streams['audio']:
1751                 get_no_more = {'video': False, 'audio': False}
1752                 for (i, fmt_info) in enumerate(formats_info):
1753                     if fmt_info.get('acodec') == fmt_info.get('vcodec') == 'none':
1754                         formats_info.pop(i)
1755                         continue
1756                     for aud_vid in ['audio', 'video']:
1757                         if not allow_multiple_streams[aud_vid] and fmt_info.get(aud_vid[0] + 'codec') != 'none':
1758                             if get_no_more[aud_vid]:
1759                                 formats_info.pop(i)
1760                                 break
1761                             get_no_more[aud_vid] = True
1762
1763             if len(formats_info) == 1:
1764                 return formats_info[0]
1765
1766             video_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('vcodec') != 'none']
1767             audio_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('acodec') != 'none']
1768
1769             the_only_video = video_fmts[0] if len(video_fmts) == 1 else None
1770             the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None
1771
1772             output_ext = self.params.get('merge_output_format')
1773             if not output_ext:
1774                 if the_only_video:
1775                     output_ext = the_only_video['ext']
1776                 elif the_only_audio and not video_fmts:
1777                     output_ext = the_only_audio['ext']
1778                 else:
1779                     output_ext = 'mkv'
1780
1781             new_dict = {
1782                 'requested_formats': formats_info,
1783                 'format': '+'.join(fmt_info.get('format') for fmt_info in formats_info),
1784                 'format_id': '+'.join(fmt_info.get('format_id') for fmt_info in formats_info),
1785                 'ext': output_ext,
1786             }
1787
1788             if the_only_video:
1789                 new_dict.update({
1790                     'width': the_only_video.get('width'),
1791                     'height': the_only_video.get('height'),
1792                     'resolution': the_only_video.get('resolution') or self.format_resolution(the_only_video),
1793                     'fps': the_only_video.get('fps'),
1794                     'vcodec': the_only_video.get('vcodec'),
1795                     'vbr': the_only_video.get('vbr'),
1796                     'stretched_ratio': the_only_video.get('stretched_ratio'),
1797                 })
1798
1799             if the_only_audio:
1800                 new_dict.update({
1801                     'acodec': the_only_audio.get('acodec'),
1802                     'abr': the_only_audio.get('abr'),
1803                 })
1804
1805             return new_dict
1806
1807         def _check_formats(formats):
1808             if not check_formats:
1809                 yield from formats
1810                 return
1811             for f in formats:
1812                 self.to_screen('[info] Testing format %s' % f['format_id'])
1813                 temp_file = tempfile.NamedTemporaryFile(
1814                     suffix='.tmp', delete=False,
1815                     dir=self.get_output_path('temp') or None)
1816                 temp_file.close()
1817                 try:
1818                     success, _ = self.dl(temp_file.name, f, test=True)
1819                 except (DownloadError, IOError, OSError, ValueError) + network_exceptions:
1820                     success = False
1821                 finally:
1822                     if os.path.exists(temp_file.name):
1823                         try:
1824                             os.remove(temp_file.name)
1825                         except OSError:
1826                             self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
1827                 if success:
1828                     yield f
1829                 else:
1830                     self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])
1831
1832         def _build_selector_function(selector):
1833             if isinstance(selector, list):  # ,
1834                 fs = [_build_selector_function(s) for s in selector]
1835
1836                 def selector_function(ctx):
1837                     for f in fs:
1838                         yield from f(ctx)
1839                 return selector_function
1840
1841             elif selector.type == GROUP:  # ()
1842                 selector_function = _build_selector_function(selector.selector)
1843
1844             elif selector.type == PICKFIRST:  # /
1845                 fs = [_build_selector_function(s) for s in selector.selector]
1846
1847                 def selector_function(ctx):
1848                     for f in fs:
1849                         picked_formats = list(f(ctx))
1850                         if picked_formats:
1851                             return picked_formats
1852                     return []
1853
1854             elif selector.type == MERGE:  # +
1855                 selector_1, selector_2 = map(_build_selector_function, selector.selector)
1856
1857                 def selector_function(ctx):
1858                     for pair in itertools.product(
1859                             selector_1(copy.deepcopy(ctx)), selector_2(copy.deepcopy(ctx))):
1860                         yield _merge(pair)
1861
1862             elif selector.type == SINGLE:  # atom
1863                 format_spec = selector.selector or 'best'
1864
1865                 # TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector
1866                 if format_spec == 'all':
1867                     def selector_function(ctx):
1868                         yield from _check_formats(ctx['formats'])
1869                 elif format_spec == 'mergeall':
1870                     def selector_function(ctx):
1871                         formats = list(_check_formats(ctx['formats']))
1872                         if not formats:
1873                             return
1874                         merged_format = formats[-1]
1875                         for f in formats[-2::-1]:
1876                             merged_format = _merge((merged_format, f))
1877                         yield merged_format
1878
1879                 else:
1880                     format_fallback, format_reverse, format_idx = False, True, 1
1881                     mobj = re.match(
1882                         r'(?P<bw>best|worst|b|w)(?P<type>video|audio|v|a)?(?P<mod>\*)?(?:\.(?P<n>[1-9]\d*))?$',
1883                         format_spec)
1884                     if mobj is not None:
1885                         format_idx = int_or_none(mobj.group('n'), default=1)
1886                         format_reverse = mobj.group('bw')[0] == 'b'
1887                         format_type = (mobj.group('type') or [None])[0]
1888                         not_format_type = {'v': 'a', 'a': 'v'}.get(format_type)
1889                         format_modified = mobj.group('mod') is not None
1890
1891                         format_fallback = not format_type and not format_modified  # for b, w
1892                         _filter_f = (
1893                             (lambda f: f.get('%scodec' % format_type) != 'none')
1894                             if format_type and format_modified  # bv*, ba*, wv*, wa*
1895                             else (lambda f: f.get('%scodec' % not_format_type) == 'none')
1896                             if format_type  # bv, ba, wv, wa
1897                             else (lambda f: f.get('vcodec') != 'none' and f.get('acodec') != 'none')
1898                             if not format_modified  # b, w
1899                             else lambda f: True)  # b*, w*
1900                         filter_f = lambda f: _filter_f(f) and (
1901                             f.get('vcodec') != 'none' or f.get('acodec') != 'none')
1902                     else:
1903                         filter_f = ((lambda f: f.get('ext') == format_spec)
1904                                     if format_spec in ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']  # extension
1905                                     else (lambda f: f.get('format_id') == format_spec))  # id
1906
1907                     def selector_function(ctx):
1908                         formats = list(ctx['formats'])
1909                         matches = list(filter(filter_f, formats)) if filter_f is not None else formats
1910                         if format_fallback and ctx['incomplete_formats'] and not matches:
1911                             # for extractors with incomplete formats (audio only (soundcloud)
1912                             # or video only (imgur)) best/worst will fallback to
1913                             # best/worst {video,audio}-only format
1914                             matches = formats
1915                         matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1]))
1916                         try:
1917                             yield matches[format_idx - 1]
1918                         except IndexError:
1919                             return
1920
1921             filters = [self._build_format_filter(f) for f in selector.filters]
1922
1923             def final_selector(ctx):
1924                 ctx_copy = copy.deepcopy(ctx)
1925                 for _filter in filters:
1926                     ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
1927                 return selector_function(ctx_copy)
1928             return final_selector
1929
1930         stream = io.BytesIO(format_spec.encode('utf-8'))
1931         try:
1932             tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
1933         except tokenize.TokenError:
1934             raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
1935
1936         class TokenIterator(object):
1937             def __init__(self, tokens):
1938                 self.tokens = tokens
1939                 self.counter = 0
1940
1941             def __iter__(self):
1942                 return self
1943
1944             def __next__(self):
1945                 if self.counter >= len(self.tokens):
1946                     raise StopIteration()
1947                 value = self.tokens[self.counter]
1948                 self.counter += 1
1949                 return value
1950
1951             next = __next__
1952
1953             def restore_last_token(self):
1954                 self.counter -= 1
1955
1956         parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
1957         return _build_selector_function(parsed_selector)
1958
1959     def _calc_headers(self, info_dict):
1960         res = std_headers.copy()
1961
1962         add_headers = info_dict.get('http_headers')
1963         if add_headers:
1964             res.update(add_headers)
1965
1966         cookies = self._calc_cookies(info_dict)
1967         if cookies:
1968             res['Cookie'] = cookies
1969
1970         if 'X-Forwarded-For' not in res:
1971             x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
1972             if x_forwarded_for_ip:
1973                 res['X-Forwarded-For'] = x_forwarded_for_ip
1974
1975         return res
1976
1977     def _calc_cookies(self, info_dict):
1978         pr = sanitized_Request(info_dict['url'])
1979         self.cookiejar.add_cookie_header(pr)
1980         return pr.get_header('Cookie')
1981
1982     def _sanitize_thumbnails(self, info_dict):
1983         thumbnails = info_dict.get('thumbnails')
1984         if thumbnails is None:
1985             thumbnail = info_dict.get('thumbnail')
1986             if thumbnail:
1987                 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
1988         if thumbnails:
1989             thumbnails.sort(key=lambda t: (
1990                 t.get('preference') if t.get('preference') is not None else -1,
1991                 t.get('width') if t.get('width') is not None else -1,
1992                 t.get('height') if t.get('height') is not None else -1,
1993                 t.get('id') if t.get('id') is not None else '',
1994                 t.get('url')))
1995
1996             def thumbnail_tester():
1997                 if self.params.get('check_formats'):
1998                     test_all = True
1999                     to_screen = lambda msg: self.to_screen(f'[info] {msg}')
2000                 else:
2001                     test_all = False
2002                     to_screen = self.write_debug
2003
2004                 def test_thumbnail(t):
2005                     if not test_all and not t.get('_test_url'):
2006                         return True
2007                     to_screen('Testing thumbnail %s' % t['id'])
2008                     try:
2009                         self.urlopen(HEADRequest(t['url']))
2010                     except network_exceptions as err:
2011                         to_screen('Unable to connect to thumbnail %s URL "%s" - %s. Skipping...' % (
2012                             t['id'], t['url'], error_to_compat_str(err)))
2013                         return False
2014                     return True
2015
2016                 return test_thumbnail
2017
2018             for i, t in enumerate(thumbnails):
2019                 if t.get('id') is None:
2020                     t['id'] = '%d' % i
2021                 if t.get('width') and t.get('height'):
2022                     t['resolution'] = '%dx%d' % (t['width'], t['height'])
2023                 t['url'] = sanitize_url(t['url'])
2024
2025             if self.params.get('check_formats') is not False:
2026                 info_dict['thumbnails'] = LazyList(filter(thumbnail_tester(), thumbnails[::-1])).reverse()
2027             else:
2028                 info_dict['thumbnails'] = thumbnails
2029
2030     def process_video_result(self, info_dict, download=True):
2031         assert info_dict.get('_type', 'video') == 'video'
2032
2033         if 'id' not in info_dict:
2034             raise ExtractorError('Missing "id" field in extractor result')
2035         if 'title' not in info_dict:
2036             raise ExtractorError('Missing "title" field in extractor result')
2037
2038         def report_force_conversion(field, field_not, conversion):
2039             self.report_warning(
2040                 '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
2041                 % (field, field_not, conversion))
2042
2043         def sanitize_string_field(info, string_field):
2044             field = info.get(string_field)
2045             if field is None or isinstance(field, compat_str):
2046                 return
2047             report_force_conversion(string_field, 'a string', 'string')
2048             info[string_field] = compat_str(field)
2049
2050         def sanitize_numeric_fields(info):
2051             for numeric_field in self._NUMERIC_FIELDS:
2052                 field = info.get(numeric_field)
2053                 if field is None or isinstance(field, compat_numeric_types):
2054                     continue
2055                 report_force_conversion(numeric_field, 'numeric', 'int')
2056                 info[numeric_field] = int_or_none(field)
2057
2058         sanitize_string_field(info_dict, 'id')
2059         sanitize_numeric_fields(info_dict)
2060
2061         if 'playlist' not in info_dict:
2062             # It isn't part of a playlist
2063             info_dict['playlist'] = None
2064             info_dict['playlist_index'] = None
2065
2066         self._sanitize_thumbnails(info_dict)
2067
2068         thumbnail = info_dict.get('thumbnail')
2069         thumbnails = info_dict.get('thumbnails')
2070         if thumbnail:
2071             info_dict['thumbnail'] = sanitize_url(thumbnail)
2072         elif thumbnails:
2073             info_dict['thumbnail'] = thumbnails[-1]['url']
2074
2075         if info_dict.get('display_id') is None and 'id' in info_dict:
2076             info_dict['display_id'] = info_dict['id']
2077
2078         for ts_key, date_key in (
2079                 ('timestamp', 'upload_date'),
2080                 ('release_timestamp', 'release_date'),
2081         ):
2082             if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
2083                 # Working around out-of-range timestamp values (e.g. negative ones on Windows,
2084                 # see http://bugs.python.org/issue1646728)
2085                 try:
2086                     upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
2087                     info_dict[date_key] = upload_date.strftime('%Y%m%d')
2088                 except (ValueError, OverflowError, OSError):
2089                     pass
2090
2091         live_keys = ('is_live', 'was_live')
2092         live_status = info_dict.get('live_status')
2093         if live_status is None:
2094             for key in live_keys:
2095                 if info_dict.get(key) is False:
2096                     continue
2097                 if info_dict.get(key):
2098                     live_status = key
2099                 break
2100             if all(info_dict.get(key) is False for key in live_keys):
2101                 live_status = 'not_live'
2102         if live_status:
2103             info_dict['live_status'] = live_status
2104             for key in live_keys:
2105                 if info_dict.get(key) is None:
2106                     info_dict[key] = (live_status == key)
2107
2108         # Auto generate title fields corresponding to the *_number fields when missing
2109         # in order to always have clean titles. This is very common for TV series.
2110         for field in ('chapter', 'season', 'episode'):
2111             if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
2112                 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
2113
2114         for cc_kind in ('subtitles', 'automatic_captions'):
2115             cc = info_dict.get(cc_kind)
2116             if cc:
2117                 for _, subtitle in cc.items():
2118                     for subtitle_format in subtitle:
2119                         if subtitle_format.get('url'):
2120                             subtitle_format['url'] = sanitize_url(subtitle_format['url'])
2121                         if subtitle_format.get('ext') is None:
2122                             subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
2123
2124         automatic_captions = info_dict.get('automatic_captions')
2125         subtitles = info_dict.get('subtitles')
2126
2127         info_dict['requested_subtitles'] = self.process_subtitles(
2128             info_dict['id'], subtitles, automatic_captions)
2129
2130         # We now pick which formats have to be downloaded
2131         if info_dict.get('formats') is None:
2132             # There's only one format available
2133             formats = [info_dict]
2134         else:
2135             formats = info_dict['formats']
2136
2137         if not formats:
2138             if not self.params.get('ignore_no_formats_error'):
2139                 raise ExtractorError('No video formats found!')
2140             else:
2141                 self.report_warning('No video formats found!')
2142
2143         def is_wellformed(f):
2144             url = f.get('url')
2145             if not url:
2146                 self.report_warning(
2147                     '"url" field is missing or empty - skipping format, '
2148                     'there is an error in extractor')
2149                 return False
2150             if isinstance(url, bytes):
2151                 sanitize_string_field(f, 'url')
2152             return True
2153
2154         # Filter out malformed formats for better extraction robustness
2155         formats = list(filter(is_wellformed, formats))
2156
2157         formats_dict = {}
2158
2159         # We check that all the formats have the format and format_id fields
2160         for i, format in enumerate(formats):
2161             sanitize_string_field(format, 'format_id')
2162             sanitize_numeric_fields(format)
2163             format['url'] = sanitize_url(format['url'])
2164             if not format.get('format_id'):
2165                 format['format_id'] = compat_str(i)
2166             else:
2167                 # Sanitize format_id from characters used in format selector expression
2168                 format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
2169             format_id = format['format_id']
2170             if format_id not in formats_dict:
2171                 formats_dict[format_id] = []
2172             formats_dict[format_id].append(format)
2173
2174         # Make sure all formats have unique format_id
2175         for format_id, ambiguous_formats in formats_dict.items():
2176             if len(ambiguous_formats) > 1:
2177                 for i, format in enumerate(ambiguous_formats):
2178                     format['format_id'] = '%s-%d' % (format_id, i)
2179
2180         for i, format in enumerate(formats):
2181             if format.get('format') is None:
2182                 format['format'] = '{id} - {res}{note}'.format(
2183                     id=format['format_id'],
2184                     res=self.format_resolution(format),
2185                     note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
2186                 )
2187             # Automatically determine file extension if missing
2188             if format.get('ext') is None:
2189                 format['ext'] = determine_ext(format['url']).lower()
2190             # Automatically determine protocol if missing (useful for format
2191             # selection purposes)
2192             if format.get('protocol') is None:
2193                 format['protocol'] = determine_protocol(format)
2194             # Add HTTP headers, so that external programs can use them from the
2195             # json output
2196             full_format_info = info_dict.copy()
2197             full_format_info.update(format)
2198             format['http_headers'] = self._calc_headers(full_format_info)
2199         # Remove private housekeeping stuff
2200         if '__x_forwarded_for_ip' in info_dict:
2201             del info_dict['__x_forwarded_for_ip']
2202
2203         # TODO Central sorting goes here
2204
2205         if formats and formats[0] is not info_dict:
2206             # only set the 'formats' fields if the original info_dict list them
2207             # otherwise we end up with a circular reference, the first (and unique)
2208             # element in the 'formats' field in info_dict is info_dict itself,
2209             # which can't be exported to json
2210             info_dict['formats'] = formats
2211
2212         info_dict, _ = self.pre_process(info_dict)
2213
2214         list_only = self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles')
2215         if list_only:
2216             self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True)
2217             if self.params.get('list_thumbnails'):
2218                 self.list_thumbnails(info_dict)
2219             if self.params.get('listformats'):
2220                 if not info_dict.get('formats'):
2221                     raise ExtractorError('No video formats found', expected=True)
2222                 self.list_formats(info_dict)
2223             if self.params.get('listsubtitles'):
2224                 if 'automatic_captions' in info_dict:
2225                     self.list_subtitles(
2226                         info_dict['id'], automatic_captions, 'automatic captions')
2227                 self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
2228             return
2229
2230         format_selector = self.format_selector
2231         if format_selector is None:
2232             req_format = self._default_format_spec(info_dict, download=download)
2233             self.write_debug('Default format spec: %s' % req_format)
2234             format_selector = self.build_format_selector(req_format)
2235
2236         # While in format selection we may need to have an access to the original
2237         # format set in order to calculate some metrics or do some processing.
2238         # For now we need to be able to guess whether original formats provided
2239         # by extractor are incomplete or not (i.e. whether extractor provides only
2240         # video-only or audio-only formats) for proper formats selection for
2241         # extractors with such incomplete formats (see
2242         # https://github.com/ytdl-org/youtube-dl/pull/5556).
2243         # Since formats may be filtered during format selection and may not match
2244         # the original formats the results may be incorrect. Thus original formats
2245         # or pre-calculated metrics should be passed to format selection routines
2246         # as well.
2247         # We will pass a context object containing all necessary additional data
2248         # instead of just formats.
2249         # This fixes incorrect format selection issue (see
2250         # https://github.com/ytdl-org/youtube-dl/issues/10083).
2251         incomplete_formats = (
2252             # All formats are video-only or
2253             all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
2254             # all formats are audio-only
2255             or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))
2256
2257         ctx = {
2258             'formats': formats,
2259             'incomplete_formats': incomplete_formats,
2260         }
2261
2262         formats_to_download = list(format_selector(ctx))
2263         if not formats_to_download:
2264             if not self.params.get('ignore_no_formats_error'):
2265                 raise ExtractorError('Requested format is not available', expected=True)
2266             else:
2267                 self.report_warning('Requested format is not available')
2268                 # Process what we can, even without any available formats.
2269                 self.process_info(dict(info_dict))
2270         elif download:
2271             self.to_screen(
2272                 '[info] %s: Downloading %d format(s): %s' % (
2273                     info_dict['id'], len(formats_to_download),
2274                     ", ".join([f['format_id'] for f in formats_to_download])))
2275             for fmt in formats_to_download:
2276                 new_info = dict(info_dict)
2277                 # Save a reference to the original info_dict so that it can be modified in process_info if needed
2278                 new_info['__original_infodict'] = info_dict
2279                 new_info.update(fmt)
2280                 self.process_info(new_info)
2281         # We update the info dict with the best quality format (backwards compatibility)
2282         if formats_to_download:
2283             info_dict.update(formats_to_download[-1])
2284         return info_dict
2285
2286     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
2287         """Select the requested subtitles and their format"""
2288         available_subs = {}
2289         if normal_subtitles and self.params.get('writesubtitles'):
2290             available_subs.update(normal_subtitles)
2291         if automatic_captions and self.params.get('writeautomaticsub'):
2292             for lang, cap_info in automatic_captions.items():
2293                 if lang not in available_subs:
2294                     available_subs[lang] = cap_info
2295
2296         if (not self.params.get('writesubtitles') and not
2297                 self.params.get('writeautomaticsub') or not
2298                 available_subs):
2299             return None
2300
2301         all_sub_langs = available_subs.keys()
2302         if self.params.get('allsubtitles', False):
2303             requested_langs = all_sub_langs
2304         elif self.params.get('subtitleslangs', False):
2305             requested_langs = set()
2306             for lang in self.params.get('subtitleslangs'):
2307                 if lang == 'all':
2308                     requested_langs.update(all_sub_langs)
2309                     continue
2310                 discard = lang[0] == '-'
2311                 if discard:
2312                     lang = lang[1:]
2313                 current_langs = filter(re.compile(lang + '$').match, all_sub_langs)
2314                 if discard:
2315                     for lang in current_langs:
2316                         requested_langs.discard(lang)
2317                 else:
2318                     requested_langs.update(current_langs)
2319         elif 'en' in available_subs:
2320             requested_langs = ['en']
2321         else:
2322             requested_langs = [list(all_sub_langs)[0]]
2323         self.write_debug('Downloading subtitles: %s' % ', '.join(requested_langs))
2324
2325         formats_query = self.params.get('subtitlesformat', 'best')
2326         formats_preference = formats_query.split('/') if formats_query else []
2327         subs = {}
2328         for lang in requested_langs:
2329             formats = available_subs.get(lang)
2330             if formats is None:
2331                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
2332                 continue
2333             for ext in formats_preference:
2334                 if ext == 'best':
2335                     f = formats[-1]
2336                     break
2337                 matches = list(filter(lambda f: f['ext'] == ext, formats))
2338                 if matches:
2339                     f = matches[-1]
2340                     break
2341             else:
2342                 f = formats[-1]
2343                 self.report_warning(
2344                     'No subtitle format found matching "%s" for language %s, '
2345                     'using %s' % (formats_query, lang, f['ext']))
2346             subs[lang] = f
2347         return subs
2348
2349     def __forced_printings(self, info_dict, filename, incomplete):
2350         def print_mandatory(field, actual_field=None):
2351             if actual_field is None:
2352                 actual_field = field
2353             if (self.params.get('force%s' % field, False)
2354                     and (not incomplete or info_dict.get(actual_field) is not None)):
2355                 self.to_stdout(info_dict[actual_field])
2356
2357         def print_optional(field):
2358             if (self.params.get('force%s' % field, False)
2359                     and info_dict.get(field) is not None):
2360                 self.to_stdout(info_dict[field])
2361
2362         info_dict = info_dict.copy()
2363         if filename is not None:
2364             info_dict['filename'] = filename
2365         if info_dict.get('requested_formats') is not None:
2366             # For RTMP URLs, also include the playpath
2367             info_dict['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats'])
2368         elif 'url' in info_dict:
2369             info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '')
2370
2371         for tmpl in self.params.get('forceprint', []):
2372             if re.match(r'\w+$', tmpl):
2373                 tmpl = '%({})s'.format(tmpl)
2374             tmpl, info_copy = self.prepare_outtmpl(tmpl, info_dict)
2375             self.to_stdout(self.escape_outtmpl(tmpl) % info_copy)
2376
2377         print_mandatory('title')
2378         print_mandatory('id')
2379         print_mandatory('url', 'urls')
2380         print_optional('thumbnail')
2381         print_optional('description')
2382         print_optional('filename')
2383         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
2384             self.to_stdout(formatSeconds(info_dict['duration']))
2385         print_mandatory('format')
2386
2387         if self.params.get('forcejson', False):
2388             self.post_extract(info_dict)
2389             self.to_stdout(json.dumps(info_dict, default=repr))
2390
2391     def dl(self, name, info, subtitle=False, test=False):
2392
2393         if test:
2394             verbose = self.params.get('verbose')
2395             params = {
2396                 'test': True,
2397                 'quiet': not verbose,
2398                 'verbose': verbose,
2399                 'noprogress': not verbose,
2400                 'nopart': True,
2401                 'skip_unavailable_fragments': False,
2402                 'keep_fragments': False,
2403                 'overwrites': True,
2404                 '_no_ytdl_file': True,
2405             }
2406         else:
2407             params = self.params
2408         fd = get_suitable_downloader(info, params, to_stdout=(name == '-'))(self, params)
2409         if not test:
2410             for ph in self._progress_hooks:
2411                 fd.add_progress_hook(ph)
2412             urls = '", "'.join([f['url'] for f in info.get('requested_formats', [])] or [info['url']])
2413             self.write_debug('Invoking downloader on "%s"' % urls)
2414         new_info = dict(info)
2415         if new_info.get('http_headers') is None:
2416             new_info['http_headers'] = self._calc_headers(new_info)
2417         return fd.download(name, new_info, subtitle)
2418
2419     def process_info(self, info_dict):
2420         """Process a single resolved IE result."""
2421
2422         assert info_dict.get('_type', 'video') == 'video'
2423
2424         info_dict.setdefault('__postprocessors', [])
2425
2426         max_downloads = self.params.get('max_downloads')
2427         if max_downloads is not None:
2428             if self._num_downloads >= int(max_downloads):
2429                 raise MaxDownloadsReached()
2430
2431         # TODO: backward compatibility, to be removed
2432         info_dict['fulltitle'] = info_dict['title']
2433
2434         if 'format' not in info_dict and 'ext' in info_dict:
2435             info_dict['format'] = info_dict['ext']
2436
2437         if self._match_entry(info_dict) is not None:
2438             return
2439
2440         self.post_extract(info_dict)
2441         self._num_downloads += 1
2442
2443         # info_dict['_filename'] needs to be set for backward compatibility
2444         info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
2445         temp_filename = self.prepare_filename(info_dict, 'temp')
2446         files_to_move = {}
2447
2448         # Forced printings
2449         self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))
2450
2451         if self.params.get('simulate', False):
2452             if self.params.get('force_write_download_archive', False):
2453                 self.record_download_archive(info_dict)
2454
2455             # Do nothing else if in simulate mode
2456             return
2457
2458         if full_filename is None:
2459             return
2460
2461         if not self._ensure_dir_exists(encodeFilename(full_filename)):
2462             return
2463         if not self._ensure_dir_exists(encodeFilename(temp_filename)):
2464             return
2465
2466         if self.params.get('writedescription', False):
2467             descfn = self.prepare_filename(info_dict, 'description')
2468             if not self._ensure_dir_exists(encodeFilename(descfn)):
2469                 return
2470             if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(descfn)):
2471                 self.to_screen('[info] Video description is already present')
2472             elif info_dict.get('description') is None:
2473                 self.report_warning('There\'s no description to write.')
2474             else:
2475                 try:
2476                     self.to_screen('[info] Writing video description to: ' + descfn)
2477                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
2478                         descfile.write(info_dict['description'])
2479                 except (OSError, IOError):
2480                     self.report_error('Cannot write description file ' + descfn)
2481                     return
2482
2483         if self.params.get('writeannotations', False):
2484             annofn = self.prepare_filename(info_dict, 'annotation')
2485             if not self._ensure_dir_exists(encodeFilename(annofn)):
2486                 return
2487             if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(annofn)):
2488                 self.to_screen('[info] Video annotations are already present')
2489             elif not info_dict.get('annotations'):
2490                 self.report_warning('There are no annotations to write.')
2491             else:
2492                 try:
2493                     self.to_screen('[info] Writing video annotations to: ' + annofn)
2494                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
2495                         annofile.write(info_dict['annotations'])
2496                 except (KeyError, TypeError):
2497                     self.report_warning('There are no annotations to write.')
2498                 except (OSError, IOError):
2499                     self.report_error('Cannot write annotations file: ' + annofn)
2500                     return
2501
2502         subtitles_are_requested = any([self.params.get('writesubtitles', False),
2503                                        self.params.get('writeautomaticsub')])
2504
2505         if subtitles_are_requested and info_dict.get('requested_subtitles'):
2506             # subtitles download errors are already managed as troubles in relevant IE
2507             # that way it will silently go on when used with unsupporting IE
2508             subtitles = info_dict['requested_subtitles']
2509             # ie = self.get_info_extractor(info_dict['extractor_key'])
2510             for sub_lang, sub_info in subtitles.items():
2511                 sub_format = sub_info['ext']
2512                 sub_filename = subtitles_filename(temp_filename, sub_lang, sub_format, info_dict.get('ext'))
2513                 sub_filename_final = subtitles_filename(
2514                     self.prepare_filename(info_dict, 'subtitle'), sub_lang, sub_format, info_dict.get('ext'))
2515                 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(sub_filename)):
2516                     self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
2517                     sub_info['filepath'] = sub_filename
2518                     files_to_move[sub_filename] = sub_filename_final
2519                 else:
2520                     self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
2521                     if sub_info.get('data') is not None:
2522                         try:
2523                             # Use newline='' to prevent conversion of newline characters
2524                             # See https://github.com/ytdl-org/youtube-dl/issues/10268
2525                             with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
2526                                 subfile.write(sub_info['data'])
2527                             sub_info['filepath'] = sub_filename
2528                             files_to_move[sub_filename] = sub_filename_final
2529                         except (OSError, IOError):
2530                             self.report_error('Cannot write subtitles file ' + sub_filename)
2531                             return
2532                     else:
2533                         try:
2534                             self.dl(sub_filename, sub_info.copy(), subtitle=True)
2535                             sub_info['filepath'] = sub_filename
2536                             files_to_move[sub_filename] = sub_filename_final
2537                         except (ExtractorError, IOError, OSError, ValueError) + network_exceptions as err:
2538                             self.report_warning('Unable to download subtitle for "%s": %s' %
2539                                                 (sub_lang, error_to_compat_str(err)))
2540                             continue
2541
2542         if self.params.get('writeinfojson', False):
2543             infofn = self.prepare_filename(info_dict, 'infojson')
2544             if not self._ensure_dir_exists(encodeFilename(infofn)):
2545                 return
2546             if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(infofn)):
2547                 self.to_screen('[info] Video metadata is already present')
2548             else:
2549                 self.to_screen('[info] Writing video metadata as JSON to: ' + infofn)
2550                 try:
2551                     write_json_file(self.filter_requested_info(info_dict, self.params.get('clean_infojson', True)), infofn)
2552                 except (OSError, IOError):
2553                     self.report_error('Cannot write video metadata to JSON file ' + infofn)
2554                     return
2555             info_dict['__infojson_filename'] = infofn
2556
2557         for thumb_ext in self._write_thumbnails(info_dict, temp_filename):
2558             thumb_filename_temp = replace_extension(temp_filename, thumb_ext, info_dict.get('ext'))
2559             thumb_filename = replace_extension(
2560                 self.prepare_filename(info_dict, 'thumbnail'), thumb_ext, info_dict.get('ext'))
2561             files_to_move[thumb_filename_temp] = thumb_filename
2562
2563         # Write internet shortcut files
2564         url_link = webloc_link = desktop_link = False
2565         if self.params.get('writelink', False):
2566             if sys.platform == "darwin":  # macOS.
2567                 webloc_link = True
2568             elif sys.platform.startswith("linux"):
2569                 desktop_link = True
2570             else:  # if sys.platform in ['win32', 'cygwin']:
2571                 url_link = True
2572         if self.params.get('writeurllink', False):
2573             url_link = True
2574         if self.params.get('writewebloclink', False):
2575             webloc_link = True
2576         if self.params.get('writedesktoplink', False):
2577             desktop_link = True
2578
2579         if url_link or webloc_link or desktop_link:
2580             if 'webpage_url' not in info_dict:
2581                 self.report_error('Cannot write internet shortcut file because the "webpage_url" field is missing in the media information')
2582                 return
2583             ascii_url = iri_to_uri(info_dict['webpage_url'])
2584
2585         def _write_link_file(extension, template, newline, embed_filename):
2586             linkfn = replace_extension(full_filename, extension, info_dict.get('ext'))
2587             if self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)):
2588                 self.to_screen('[info] Internet shortcut is already present')
2589             else:
2590                 try:
2591                     self.to_screen('[info] Writing internet shortcut to: ' + linkfn)
2592                     with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', newline=newline) as linkfile:
2593                         template_vars = {'url': ascii_url}
2594                         if embed_filename:
2595                             template_vars['filename'] = linkfn[:-(len(extension) + 1)]
2596                         linkfile.write(template % template_vars)
2597                 except (OSError, IOError):
2598                     self.report_error('Cannot write internet shortcut ' + linkfn)
2599                     return False
2600             return True
2601
2602         if url_link:
2603             if not _write_link_file('url', DOT_URL_LINK_TEMPLATE, '\r\n', embed_filename=False):
2604                 return
2605         if webloc_link:
2606             if not _write_link_file('webloc', DOT_WEBLOC_LINK_TEMPLATE, '\n', embed_filename=False):
2607                 return
2608         if desktop_link:
2609             if not _write_link_file('desktop', DOT_DESKTOP_LINK_TEMPLATE, '\n', embed_filename=True):
2610                 return
2611
2612         try:
2613             info_dict, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
2614         except PostProcessingError as err:
2615             self.report_error('Preprocessing: %s' % str(err))
2616             return
2617
2618         must_record_download_archive = False
2619         if self.params.get('skip_download', False):
2620             info_dict['filepath'] = temp_filename
2621             info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
2622             info_dict['__files_to_move'] = files_to_move
2623             info_dict = self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict)
2624         else:
2625             # Download
2626             try:
2627
2628                 def existing_file(*filepaths):
2629                     ext = info_dict.get('ext')
2630                     final_ext = self.params.get('final_ext', ext)
2631                     existing_files = []
2632                     for file in orderedSet(filepaths):
2633                         if final_ext != ext:
2634                             converted = replace_extension(file, final_ext, ext)
2635                             if os.path.exists(encodeFilename(converted)):
2636                                 existing_files.append(converted)
2637                         if os.path.exists(encodeFilename(file)):
2638                             existing_files.append(file)
2639
2640                     if not existing_files or self.params.get('overwrites', False):
2641                         for file in orderedSet(existing_files):
2642                             self.report_file_delete(file)
2643                             os.remove(encodeFilename(file))
2644                         return None
2645
2646                     self.report_file_already_downloaded(existing_files[0])
2647                     info_dict['ext'] = os.path.splitext(existing_files[0])[1][1:]
2648                     return existing_files[0]
2649
2650                 success = True
2651                 if info_dict.get('requested_formats') is not None:
2652
2653                     def compatible_formats(formats):
2654                         # TODO: some formats actually allow this (mkv, webm, ogg, mp4), but not all of them.
2655                         video_formats = [format for format in formats if format.get('vcodec') != 'none']
2656                         audio_formats = [format for format in formats if format.get('acodec') != 'none']
2657                         if len(video_formats) > 2 or len(audio_formats) > 2:
2658                             return False
2659
2660                         # Check extension
2661                         exts = set(format.get('ext') for format in formats)
2662                         COMPATIBLE_EXTS = (
2663                             set(('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma')),
2664                             set(('webm',)),
2665                         )
2666                         for ext_sets in COMPATIBLE_EXTS:
2667                             if ext_sets.issuperset(exts):
2668                                 return True
2669                         # TODO: Check acodec/vcodec
2670                         return False
2671
2672                     requested_formats = info_dict['requested_formats']
2673                     old_ext = info_dict['ext']
2674                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
2675                         info_dict['ext'] = 'mkv'
2676                         self.report_warning(
2677                             'Requested formats are incompatible for merge and will be merged into mkv.')
2678
2679                     def correct_ext(filename):
2680                         if filename == '-':
2681                             return filename
2682                         filename_real_ext = os.path.splitext(filename)[1][1:]
2683                         filename_wo_ext = (
2684                             os.path.splitext(filename)[0]
2685                             if filename_real_ext == old_ext
2686                             else filename)
2687                         return '%s.%s' % (filename_wo_ext, info_dict['ext'])
2688
2689                     # Ensure filename always has a correct extension for successful merge
2690                     full_filename = correct_ext(full_filename)
2691                     temp_filename = correct_ext(temp_filename)
2692                     dl_filename = existing_file(full_filename, temp_filename)
2693                     info_dict['__real_download'] = False
2694
2695                     _protocols = set(determine_protocol(f) for f in requested_formats)
2696                     if len(_protocols) == 1:  # All requested formats have same protocol
2697                         info_dict['protocol'] = _protocols.pop()
2698                     directly_mergable = FFmpegFD.can_merge_formats(info_dict)
2699                     if dl_filename is not None:
2700                         pass
2701                     elif (directly_mergable and get_suitable_downloader(
2702                             info_dict, self.params, to_stdout=(temp_filename== '-')) == FFmpegFD):
2703                         info_dict['url'] = '\n'.join(f['url'] for f in requested_formats)
2704                         success, real_download = self.dl(temp_filename, info_dict)
2705                         info_dict['__real_download'] = real_download
2706                     else:
2707                         downloaded = []
2708                         merger = FFmpegMergerPP(self)
2709                         if self.params.get('allow_unplayable_formats'):
2710                             self.report_warning(
2711                                 'You have requested merging of multiple formats '
2712                                 'while also allowing unplayable formats to be downloaded. '
2713                                 'The formats won\'t be merged to prevent data corruption.')
2714                         elif not merger.available:
2715                             self.report_warning(
2716                                 'You have requested merging of multiple formats but ffmpeg is not installed. '
2717                                 'The formats won\'t be merged.')
2718
2719                         if temp_filename == '-':
2720                             reason = ('using a downloader other than ffmpeg' if directly_mergable
2721                                       else 'but the formats are incompatible for simultaneous download' if merger.available
2722                                       else 'but ffmpeg is not installed')
2723                             self.report_warning(
2724                                 f'You have requested downloading multiple formats to stdout {reason}. '
2725                                 'The formats will be streamed one after the other')
2726                             fname = temp_filename
2727                         for f in requested_formats:
2728                             new_info = dict(info_dict)
2729                             del new_info['requested_formats']
2730                             new_info.update(f)
2731                             if temp_filename != '-':
2732                                 fname = prepend_extension(temp_filename, 'f%s' % f['format_id'], new_info['ext'])
2733                                 if not self._ensure_dir_exists(fname):
2734                                     return
2735                                 downloaded.append(fname)
2736                             partial_success, real_download = self.dl(fname, new_info)
2737                             info_dict['__real_download'] = info_dict['__real_download'] or real_download
2738                             success = success and partial_success
2739                         if merger.available and not self.params.get('allow_unplayable_formats'):
2740                             info_dict['__postprocessors'].append(merger)
2741                             info_dict['__files_to_merge'] = downloaded
2742                             # Even if there were no downloads, it is being merged only now
2743                             info_dict['__real_download'] = True
2744                         else:
2745                             for file in downloaded:
2746                                 files_to_move[file] = None
2747                 else:
2748                     # Just a single file
2749                     dl_filename = existing_file(full_filename, temp_filename)
2750                     if dl_filename is None:
2751                         success, real_download = self.dl(temp_filename, info_dict)
2752                         info_dict['__real_download'] = real_download
2753
2754                 dl_filename = dl_filename or temp_filename
2755                 info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
2756
2757             except network_exceptions as err:
2758                 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
2759                 return
2760             except (OSError, IOError) as err:
2761                 raise UnavailableVideoError(err)
2762             except (ContentTooShortError, ) as err:
2763                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
2764                 return
2765
2766             if success and full_filename != '-':
2767
2768                 def fixup():
2769                     do_fixup = True
2770                     fixup_policy = self.params.get('fixup')
2771                     vid = info_dict['id']
2772
2773                     if fixup_policy in ('ignore', 'never'):
2774                         return
2775                     elif fixup_policy == 'warn':
2776                         do_fixup = False
2777                     elif fixup_policy != 'force':
2778                         assert fixup_policy in ('detect_or_warn', None)
2779                         if not info_dict.get('__real_download'):
2780                             do_fixup = False
2781
2782                     def ffmpeg_fixup(cndn, msg, cls):
2783                         if not cndn:
2784                             return
2785                         if not do_fixup:
2786                             self.report_warning(f'{vid}: {msg}')
2787                             return
2788                         pp = cls(self)
2789                         if pp.available:
2790                             info_dict['__postprocessors'].append(pp)
2791                         else:
2792                             self.report_warning(f'{vid}: {msg}. Install ffmpeg to fix this automatically')
2793
2794                     stretched_ratio = info_dict.get('stretched_ratio')
2795                     ffmpeg_fixup(
2796                         stretched_ratio not in (1, None),
2797                         f'Non-uniform pixel ratio {stretched_ratio}',
2798                         FFmpegFixupStretchedPP)
2799
2800                     ffmpeg_fixup(
2801                         (info_dict.get('requested_formats') is None
2802                          and info_dict.get('container') == 'm4a_dash'
2803                          and info_dict.get('ext') == 'm4a'),
2804                         'writing DASH m4a. Only some players support this container',
2805                         FFmpegFixupM4aPP)
2806
2807                     downloader = (get_suitable_downloader(info_dict, self.params).__name__
2808                                   if 'protocol' in info_dict else None)
2809                     ffmpeg_fixup(downloader == 'HlsFD', 'malformed AAC bitstream detected', FFmpegFixupM3u8PP)
2810                     ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed timestamps detected', FFmpegFixupTimestampPP)
2811                     ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed duration detected', FFmpegFixupDurationPP)
2812
2813                 fixup()
2814                 try:
2815                     info_dict = self.post_process(dl_filename, info_dict, files_to_move)
2816                 except PostProcessingError as err:
2817                     self.report_error('Postprocessing: %s' % str(err))
2818                     return
2819                 try:
2820                     for ph in self._post_hooks:
2821                         ph(info_dict['filepath'])
2822                 except Exception as err:
2823                     self.report_error('post hooks: %s' % str(err))
2824                     return
2825                 must_record_download_archive = True
2826
2827         if must_record_download_archive or self.params.get('force_write_download_archive', False):
2828             self.record_download_archive(info_dict)
2829         max_downloads = self.params.get('max_downloads')
2830         if max_downloads is not None and self._num_downloads >= int(max_downloads):
2831             raise MaxDownloadsReached()
2832
2833     def download(self, url_list):
2834         """Download a given list of URLs."""
2835         outtmpl = self.outtmpl_dict['default']
2836         if (len(url_list) > 1
2837                 and outtmpl != '-'
2838                 and '%' not in outtmpl
2839                 and self.params.get('max_downloads') != 1):
2840             raise SameFileError(outtmpl)
2841
2842         for url in url_list:
2843             try:
2844                 # It also downloads the videos
2845                 res = self.extract_info(
2846                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
2847             except UnavailableVideoError:
2848                 self.report_error('unable to download video')
2849             except MaxDownloadsReached:
2850                 self.to_screen('[info] Maximum number of downloaded files reached')
2851                 raise
2852             except ExistingVideoReached:
2853                 self.to_screen('[info] Encountered a file that is already in the archive, stopping due to --break-on-existing')
2854                 raise
2855             except RejectedVideoReached:
2856                 self.to_screen('[info] Encountered a file that did not match filter, stopping due to --break-on-reject')
2857                 raise
2858             else:
2859                 if self.params.get('dump_single_json', False):
2860                     self.post_extract(res)
2861                     self.to_stdout(json.dumps(res, default=repr))
2862
2863         return self._download_retcode
2864
2865     def download_with_info_file(self, info_filename):
2866         with contextlib.closing(fileinput.FileInput(
2867                 [info_filename], mode='r',
2868                 openhook=fileinput.hook_encoded('utf-8'))) as f:
2869             # FileInput doesn't have a read method, we can't call json.load
2870             info = self.filter_requested_info(json.loads('\n'.join(f)), self.params.get('clean_infojson', True))
2871         try:
2872             self.process_ie_result(info, download=True)
2873         except (DownloadError, EntryNotInPlaylist, ThrottledDownload):
2874             webpage_url = info.get('webpage_url')
2875             if webpage_url is not None:
2876                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
2877                 return self.download([webpage_url])
2878             else:
2879                 raise
2880         return self._download_retcode
2881
2882     @staticmethod
2883     def filter_requested_info(info_dict, actually_filter=True):
2884         remove_keys = ['__original_infodict']  # Always remove this since this may contain a copy of the entire dict
2885         keep_keys = ['_type'],  # Always keep this to facilitate load-info-json
2886         if actually_filter:
2887             remove_keys += ('requested_formats', 'requested_subtitles', 'requested_entries', 'filepath', 'entries', 'original_url')
2888             empty_values = (None, {}, [], set(), tuple())
2889             reject = lambda k, v: k not in keep_keys and (
2890                 k.startswith('_') or k in remove_keys or v in empty_values)
2891         else:
2892             info_dict['epoch'] = int(time.time())
2893             reject = lambda k, v: k in remove_keys
2894         filter_fn = lambda obj: (
2895             list(map(filter_fn, obj)) if isinstance(obj, (LazyList, list, tuple, set))
2896             else obj if not isinstance(obj, dict)
2897             else dict((k, filter_fn(v)) for k, v in obj.items() if not reject(k, v)))
2898         return filter_fn(info_dict)
2899
2900     def run_pp(self, pp, infodict):
2901         files_to_delete = []
2902         if '__files_to_move' not in infodict:
2903             infodict['__files_to_move'] = {}
2904         files_to_delete, infodict = pp.run(infodict)
2905         if not files_to_delete:
2906             return infodict
2907
2908         if self.params.get('keepvideo', False):
2909             for f in files_to_delete:
2910                 infodict['__files_to_move'].setdefault(f, '')
2911         else:
2912             for old_filename in set(files_to_delete):
2913                 self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
2914                 try:
2915                     os.remove(encodeFilename(old_filename))
2916                 except (IOError, OSError):
2917                     self.report_warning('Unable to remove downloaded original file')
2918                 if old_filename in infodict['__files_to_move']:
2919                     del infodict['__files_to_move'][old_filename]
2920         return infodict
2921
2922     @staticmethod
2923     def post_extract(info_dict):
2924         def actual_post_extract(info_dict):
2925             if info_dict.get('_type') in ('playlist', 'multi_video'):
2926                 for video_dict in info_dict.get('entries', {}):
2927                     actual_post_extract(video_dict or {})
2928                 return
2929
2930             post_extractor = info_dict.get('__post_extractor') or (lambda: {})
2931             extra = post_extractor().items()
2932             info_dict.update(extra)
2933             info_dict.pop('__post_extractor', None)
2934
2935             original_infodict = info_dict.get('__original_infodict') or {}
2936             original_infodict.update(extra)
2937             original_infodict.pop('__post_extractor', None)
2938
2939         actual_post_extract(info_dict or {})
2940
2941     def pre_process(self, ie_info, key='pre_process', files_to_move=None):
2942         info = dict(ie_info)
2943         info['__files_to_move'] = files_to_move or {}
2944         for pp in self._pps[key]:
2945             info = self.run_pp(pp, info)
2946         return info, info.pop('__files_to_move', None)
2947
2948     def post_process(self, filename, ie_info, files_to_move=None):
2949         """Run all the postprocessors on the given file."""
2950         info = dict(ie_info)
2951         info['filepath'] = filename
2952         info['__files_to_move'] = files_to_move or {}
2953
2954         for pp in ie_info.get('__postprocessors', []) + self._pps['post_process']:
2955             info = self.run_pp(pp, info)
2956         info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
2957         del info['__files_to_move']
2958         for pp in self._pps['after_move']:
2959             info = self.run_pp(pp, info)
2960         return info
2961
2962     def _make_archive_id(self, info_dict):
2963         video_id = info_dict.get('id')
2964         if not video_id:
2965             return
2966         # Future-proof against any change in case
2967         # and backwards compatibility with prior versions
2968         extractor = info_dict.get('extractor_key') or info_dict.get('ie_key')  # key in a playlist
2969         if extractor is None:
2970             url = str_or_none(info_dict.get('url'))
2971             if not url:
2972                 return
2973             # Try to find matching extractor for the URL and take its ie_key
2974             for ie in self._ies:
2975                 if ie.suitable(url):
2976                     extractor = ie.ie_key()
2977                     break
2978             else:
2979                 return
2980         return '%s %s' % (extractor.lower(), video_id)
2981
2982     def in_download_archive(self, info_dict):
2983         fn = self.params.get('download_archive')
2984         if fn is None:
2985             return False
2986
2987         vid_id = self._make_archive_id(info_dict)
2988         if not vid_id:
2989             return False  # Incomplete video information
2990
2991         return vid_id in self.archive
2992
2993     def record_download_archive(self, info_dict):
2994         fn = self.params.get('download_archive')
2995         if fn is None:
2996             return
2997         vid_id = self._make_archive_id(info_dict)
2998         assert vid_id
2999         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
3000             archive_file.write(vid_id + '\n')
3001         self.archive.add(vid_id)
3002
3003     @staticmethod
3004     def format_resolution(format, default='unknown'):
3005         if format.get('vcodec') == 'none':
3006             if format.get('acodec') == 'none':
3007                 return 'images'
3008             return 'audio only'
3009         if format.get('resolution') is not None:
3010             return format['resolution']
3011         if format.get('width') and format.get('height'):
3012             res = '%dx%d' % (format['width'], format['height'])
3013         elif format.get('height'):
3014             res = '%sp' % format['height']
3015         elif format.get('width'):
3016             res = '%dx?' % format['width']
3017         else:
3018             res = default
3019         return res
3020
3021     def _format_note(self, fdict):
3022         res = ''
3023         if fdict.get('ext') in ['f4f', 'f4m']:
3024             res += '(unsupported) '
3025         if fdict.get('language'):
3026             if res:
3027                 res += ' '
3028             res += '[%s] ' % fdict['language']
3029         if fdict.get('format_note') is not None:
3030             res += fdict['format_note'] + ' '
3031         if fdict.get('tbr') is not None:
3032             res += '%4dk ' % fdict['tbr']
3033         if fdict.get('container') is not None:
3034             if res:
3035                 res += ', '
3036             res += '%s container' % fdict['container']
3037         if (fdict.get('vcodec') is not None
3038                 and fdict.get('vcodec') != 'none'):
3039             if res:
3040                 res += ', '
3041             res += fdict['vcodec']
3042             if fdict.get('vbr') is not None:
3043                 res += '@'
3044         elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
3045             res += 'video@'
3046         if fdict.get('vbr') is not None:
3047             res += '%4dk' % fdict['vbr']
3048         if fdict.get('fps') is not None:
3049             if res:
3050                 res += ', '
3051             res += '%sfps' % fdict['fps']
3052         if fdict.get('acodec') is not None:
3053             if res:
3054                 res += ', '
3055             if fdict['acodec'] == 'none':
3056                 res += 'video only'
3057             else:
3058                 res += '%-5s' % fdict['acodec']
3059         elif fdict.get('abr') is not None:
3060             if res:
3061                 res += ', '
3062             res += 'audio'
3063         if fdict.get('abr') is not None:
3064             res += '@%3dk' % fdict['abr']
3065         if fdict.get('asr') is not None:
3066             res += ' (%5dHz)' % fdict['asr']
3067         if fdict.get('filesize') is not None:
3068             if res:
3069                 res += ', '
3070             res += format_bytes(fdict['filesize'])
3071         elif fdict.get('filesize_approx') is not None:
3072             if res:
3073                 res += ', '
3074             res += '~' + format_bytes(fdict['filesize_approx'])
3075         return res
3076
3077     def list_formats(self, info_dict):
3078         formats = info_dict.get('formats', [info_dict])
3079         new_format = (
3080             'list-formats' not in self.params.get('compat_opts', [])
3081             and self.params.get('listformats_table', True) is not False)
3082         if new_format:
3083             table = [
3084                 [
3085                     format_field(f, 'format_id'),
3086                     format_field(f, 'ext'),
3087                     self.format_resolution(f),
3088                     format_field(f, 'fps', '%d'),
3089                     '|',
3090                     format_field(f, 'filesize', ' %s', func=format_bytes) + format_field(f, 'filesize_approx', '~%s', func=format_bytes),
3091                     format_field(f, 'tbr', '%4dk'),
3092                     shorten_protocol_name(f.get('protocol', '').replace("native", "n")),
3093                     '|',
3094                     format_field(f, 'vcodec', default='unknown').replace('none', ''),
3095                     format_field(f, 'vbr', '%4dk'),
3096                     format_field(f, 'acodec', default='unknown').replace('none', ''),
3097                     format_field(f, 'abr', '%3dk'),
3098                     format_field(f, 'asr', '%5dHz'),
3099                     ', '.join(filter(None, (
3100                         'UNSUPPORTED' if f.get('ext') in ('f4f', 'f4m') else '',
3101                         format_field(f, 'language', '[%s]'),
3102                         format_field(f, 'format_note'),
3103                         format_field(f, 'container', ignore=(None, f.get('ext'))),
3104                     ))),
3105                 ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
3106             header_line = ['ID', 'EXT', 'RESOLUTION', 'FPS', '|', ' FILESIZE', '  TBR', 'PROTO',
3107                            '|', 'VCODEC', '  VBR', 'ACODEC', ' ABR', ' ASR', 'MORE INFO']
3108         else:
3109             table = [
3110                 [
3111                     format_field(f, 'format_id'),
3112                     format_field(f, 'ext'),
3113                     self.format_resolution(f),
3114                     self._format_note(f)]
3115                 for f in formats
3116                 if f.get('preference') is None or f['preference'] >= -1000]
3117             header_line = ['format code', 'extension', 'resolution', 'note']
3118
3119         self.to_screen(
3120             '[info] Available formats for %s:' % info_dict['id'])
3121         self.to_stdout(render_table(
3122             header_line, table, delim=new_format, extraGap=(0 if new_format else 1), hideEmpty=new_format))
3123
3124     def list_thumbnails(self, info_dict):
3125         thumbnails = list(info_dict.get('thumbnails'))
3126         if not thumbnails:
3127             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
3128             return
3129
3130         self.to_screen(
3131             '[info] Thumbnails for %s:' % info_dict['id'])
3132         self.to_stdout(render_table(
3133             ['ID', 'width', 'height', 'URL'],
3134             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
3135
3136     def list_subtitles(self, video_id, subtitles, name='subtitles'):
3137         if not subtitles:
3138             self.to_screen('%s has no %s' % (video_id, name))
3139             return
3140         self.to_screen(
3141             'Available %s for %s:' % (name, video_id))
3142
3143         def _row(lang, formats):
3144             exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats)))
3145             if len(set(names)) == 1:
3146                 names = [] if names[0] == 'unknown' else names[:1]
3147             return [lang, ', '.join(names), ', '.join(exts)]
3148
3149         self.to_stdout(render_table(
3150             ['Language', 'Name', 'Formats'],
3151             [_row(lang, formats) for lang, formats in subtitles.items()],
3152             hideEmpty=True))
3153
3154     def urlopen(self, req):
3155         """ Start an HTTP download """
3156         if isinstance(req, compat_basestring):
3157             req = sanitized_Request(req)
3158         return self._opener.open(req, timeout=self._socket_timeout)
3159
3160     def print_debug_header(self):
3161         if not self.params.get('verbose'):
3162             return
3163
3164         if type('') is not compat_str:
3165             # Python 2.6 on SLES11 SP1 (https://github.com/ytdl-org/youtube-dl/issues/3326)
3166             self.report_warning(
3167                 'Your Python is broken! Update to a newer and supported version')
3168
3169         stdout_encoding = getattr(
3170             sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
3171         encoding_str = (
3172             '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
3173                 locale.getpreferredencoding(),
3174                 sys.getfilesystemencoding(),
3175                 stdout_encoding,
3176                 self.get_encoding()))
3177         write_string(encoding_str, encoding=None)
3178
3179         source = (
3180             '(exe)' if hasattr(sys, 'frozen')
3181             else '(zip)' if isinstance(globals().get('__loader__'), zipimporter)
3182             else '(source)' if os.path.basename(sys.argv[0]) == '__main__.py'
3183             else '')
3184         self._write_string('[debug] yt-dlp version %s %s\n' % (__version__, source))
3185         if _LAZY_LOADER:
3186             self._write_string('[debug] Lazy loading extractors enabled\n')
3187         if _PLUGIN_CLASSES:
3188             self._write_string(
3189                 '[debug] Plugin Extractors: %s\n' % [ie.ie_key() for ie in _PLUGIN_CLASSES])
3190         if self.params.get('compat_opts'):
3191             self._write_string(
3192                 '[debug] Compatibility options: %s\n' % ', '.join(self.params.get('compat_opts')))
3193         try:
3194             sp = subprocess.Popen(
3195                 ['git', 'rev-parse', '--short', 'HEAD'],
3196                 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
3197                 cwd=os.path.dirname(os.path.abspath(__file__)))
3198             out, err = process_communicate_or_kill(sp)
3199             out = out.decode().strip()
3200             if re.match('[0-9a-f]+', out):
3201                 self._write_string('[debug] Git HEAD: %s\n' % out)
3202         except Exception:
3203             try:
3204                 sys.exc_clear()
3205             except Exception:
3206                 pass
3207
3208         def python_implementation():
3209             impl_name = platform.python_implementation()
3210             if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'):
3211                 return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3]
3212             return impl_name
3213
3214         self._write_string('[debug] Python version %s (%s %s) - %s\n' % (
3215             platform.python_version(),
3216             python_implementation(),
3217             platform.architecture()[0],
3218             platform_name()))
3219
3220         exe_versions = FFmpegPostProcessor.get_versions(self)
3221         exe_versions['rtmpdump'] = rtmpdump_version()
3222         exe_versions['phantomjs'] = PhantomJSwrapper._version()
3223         exe_str = ', '.join(
3224             '%s %s' % (exe, v)
3225             for exe, v in sorted(exe_versions.items())
3226             if v
3227         )
3228         if not exe_str:
3229             exe_str = 'none'
3230         self._write_string('[debug] exe versions: %s\n' % exe_str)
3231
3232         proxy_map = {}
3233         for handler in self._opener.handlers:
3234             if hasattr(handler, 'proxies'):
3235                 proxy_map.update(handler.proxies)
3236         self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
3237
3238         if self.params.get('call_home', False):
3239             ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
3240             self._write_string('[debug] Public IP address: %s\n' % ipaddr)
3241             return
3242             latest_version = self.urlopen(
3243                 'https://yt-dl.org/latest/version').read().decode('utf-8')
3244             if version_tuple(latest_version) > version_tuple(__version__):
3245                 self.report_warning(
3246                     'You are using an outdated version (newest version: %s)! '
3247                     'See https://yt-dl.org/update if you need help updating.' %
3248                     latest_version)
3249
3250     def _setup_opener(self):
3251         timeout_val = self.params.get('socket_timeout')
3252         self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
3253
3254         opts_cookiesfrombrowser = self.params.get('cookiesfrombrowser')
3255         opts_cookiefile = self.params.get('cookiefile')
3256         opts_proxy = self.params.get('proxy')
3257
3258         self.cookiejar = load_cookies(opts_cookiefile, opts_cookiesfrombrowser, self)
3259
3260         cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
3261         if opts_proxy is not None:
3262             if opts_proxy == '':
3263                 proxies = {}
3264             else:
3265                 proxies = {'http': opts_proxy, 'https': opts_proxy}
3266         else:
3267             proxies = compat_urllib_request.getproxies()
3268             # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805)
3269             if 'http' in proxies and 'https' not in proxies:
3270                 proxies['https'] = proxies['http']
3271         proxy_handler = PerRequestProxyHandler(proxies)
3272
3273         debuglevel = 1 if self.params.get('debug_printtraffic') else 0
3274         https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
3275         ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
3276         redirect_handler = YoutubeDLRedirectHandler()
3277         data_handler = compat_urllib_request_DataHandler()
3278
3279         # When passing our own FileHandler instance, build_opener won't add the
3280         # default FileHandler and allows us to disable the file protocol, which
3281         # can be used for malicious purposes (see
3282         # https://github.com/ytdl-org/youtube-dl/issues/8227)
3283         file_handler = compat_urllib_request.FileHandler()
3284
3285         def file_open(*args, **kwargs):
3286             raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in yt-dlp for security reasons')
3287         file_handler.file_open = file_open
3288
3289         opener = compat_urllib_request.build_opener(
3290             proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)
3291
3292         # Delete the default user-agent header, which would otherwise apply in
3293         # cases where our custom HTTP handler doesn't come into play
3294         # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
3295         opener.addheaders = []
3296         self._opener = opener
3297
3298     def encode(self, s):
3299         if isinstance(s, bytes):
3300             return s  # Already encoded
3301
3302         try:
3303             return s.encode(self.get_encoding())
3304         except UnicodeEncodeError as err:
3305             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
3306             raise
3307
3308     def get_encoding(self):
3309         encoding = self.params.get('encoding')
3310         if encoding is None:
3311             encoding = preferredencoding()
3312         return encoding
3313
3314     def _write_thumbnails(self, info_dict, filename):  # return the extensions
3315         write_all = self.params.get('write_all_thumbnails', False)
3316         thumbnails = []
3317         if write_all or self.params.get('writethumbnail', False):
3318             thumbnails = info_dict.get('thumbnails') or []
3319         multiple = write_all and len(thumbnails) > 1
3320
3321         ret = []
3322         for t in thumbnails[::-1]:
3323             thumb_ext = determine_ext(t['url'], 'jpg')
3324             suffix = '%s.' % t['id'] if multiple else ''
3325             thumb_display_id = '%s ' % t['id'] if multiple else ''
3326             thumb_filename = replace_extension(filename, suffix + thumb_ext, info_dict.get('ext'))
3327
3328             if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(thumb_filename)):
3329                 ret.append(suffix + thumb_ext)
3330                 t['filepath'] = thumb_filename
3331                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
3332                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
3333             else:
3334                 self.to_screen('[%s] %s: Downloading thumbnail %s ...' %
3335                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
3336                 try:
3337                     uf = self.urlopen(t['url'])
3338                     with open(encodeFilename(thumb_filename), 'wb') as thumbf:
3339                         shutil.copyfileobj(uf, thumbf)
3340                     ret.append(suffix + thumb_ext)
3341                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
3342                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
3343                     t['filepath'] = thumb_filename
3344                 except network_exceptions as err:
3345                     self.report_warning('Unable to download thumbnail "%s": %s' %
3346                                         (t['url'], error_to_compat_str(err)))
3347             if ret and not write_all:
3348                 break
3349         return ret