X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/b0249bcaf0f2ac1fafecbf5d44f7403c6f0d5850..3ae5e7977439193519c0ea62eba3aa3111c5571b:/yt_dlp/YoutubeDL.py

diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
index b1bc05a80..873c22ad6 100644
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -25,23 +25,25 @@
 import tokenize
 import traceback
 import random
+import unicodedata

 from string import ascii_letters
-from zipimport import zipimporter

 from .compat import (
     compat_basestring,
-    compat_cookiejar,
     compat_get_terminal_size,
     compat_kwargs,
     compat_numeric_types,
     compat_os_name,
+    compat_pycrypto_AES,
+    compat_shlex_quote,
     compat_str,
     compat_tokenize_tokenize,
     compat_urllib_error,
     compat_urllib_request,
     compat_urllib_request_DataHandler,
 )
+from .cookies import load_cookies
 from .utils import (
     age_restricted,
     args_to_str,
@@ -65,7 +67,8 @@
     float_or_none,
     format_bytes,
     format_field,
-    STR_FORMAT_RE,
+    STR_FORMAT_RE_TMPL,
+    STR_FORMAT_TYPES,
     formatSeconds,
     GeoRestrictedError,
     HEADRequest,
@@ -101,14 +104,16 @@
     str_or_none,
     strftime_or_none,
     subtitles_filename,
+    ThrottledDownload,
     to_high_limit_path,
     traverse_obj,
+    try_get,
     UnavailableVideoError,
     url_basename,
+    variadic,
     version_tuple,
     write_json_file,
     write_string,
-    YoutubeDLCookieJar,
     YoutubeDLCookieProcessor,
     YoutubeDLHandler,
     YoutubeDLRedirectHandler,
@@ -118,24 +123,28 @@
     gen_extractor_classes,
     get_info_extractor,
     _LAZY_LOADER,
-    _PLUGIN_CLASSES
+    _PLUGIN_CLASSES as plugin_extractors
 )
 from .extractor.openload import PhantomJSwrapper
 from .downloader import (
+    FFmpegFD,
     get_suitable_downloader,
     shorten_protocol_name
 )
 from .downloader.rtmp import rtmpdump_version
 from .postprocessor import (
+    get_postprocessor,
+    FFmpegFixupDurationPP,
     FFmpegFixupM3u8PP,
     FFmpegFixupM4aPP,
     FFmpegFixupStretchedPP,
+    FFmpegFixupTimestampPP,
     FFmpegMergerPP,
     FFmpegPostProcessor,
-    # FFmpegSubtitlesConvertorPP,
-    get_postprocessor,
     MoveFilesAfterDownloadPP,
+    _PLUGIN_CLASSES as plugin_postprocessors
 )
+from .update import detect_variant
 from .version import __version__

 if compat_os_name == 'nt':
@@ -192,7 +201,8 @@ class YoutubeDL(object):
                        (or video) as a single JSON line.
     force_write_download_archive: Force writing download archive regardless
                        of 'skip_download' or 'simulate'.
-    simulate:          Do not download the video files.
+    simulate:          Do not download the video files. If unset (or None),
+                       simulate only if listsubtitles, listformats or list_thumbnails is used
     format:            Video format code. see "FORMAT SELECTION" for more details.
     allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded.
     ignore_no_formats_error: Ignore "No video formats" error. Usefull for
@@ -206,24 +216,29 @@ class YoutubeDL(object):
                        into a single file
     allow_multiple_audio_streams:   Allow multiple audio streams to be merged
                        into a single file
+    check_formats      Whether to test if the formats are downloadable.
+                       Can be True (check all), False (check none)
+                       or None (check only if requested by extractor)
     paths:             Dictionary of output paths. The allowed keys are 'home'
                        'temp' and the keys of OUTTMPL_TYPES (in utils.py)
     outtmpl:           Dictionary of templates for output names. Allowed keys
                        are 'default' and the keys of OUTTMPL_TYPES (in utils.py).
-                       A string a also accepted for backward compatibility
+                       For compatibility with youtube-dl, a single string can also be used
     outtmpl_na_placeholder: Placeholder for unavailable meta fields.
     restrictfilenames: Do not allow "&" and spaces in file names
     trim_file_name:    Limit length of filename (extension excluded)
     windowsfilenames:  Force the filenames to be windows compatible
-    ignoreerrors:      Do not stop on download errors
-                       (Default True when running yt-dlp,
-                       but False when directly accessing YoutubeDL class)
+    ignoreerrors:      Do not stop on download/postprocessing errors.
+                       Can be 'only_download' to ignore only download errors.
+                       Default is 'only_download' for CLI, but False for API
     skip_playlist_after_errors: Number of allowed failures until the rest of
                        the playlist is skipped
     force_generic_extractor: Force downloader to use the generic extractor
     overwrites:        Overwrite all video and metadata files if True,
                        overwrite only non-video files if None
                        and don't overwrite any file if False
+                       For compatibility with youtube-dl,
+                       "nooverwrites" may also be used instead
     playliststart:     Playlist item to start at.
     playlistend:       Playlist item to end at.
     playlist_items:    Specific indices of playlist to download.
@@ -236,7 +251,7 @@ class YoutubeDL(object):
     writedescription: Write the video description to a .description file
     writeinfojson:     Write the video description to a .info.json file
     clean_infojson:    Remove private fields from the infojson
-    writecomments:     Extract video comments. This will not be written to disk
+    getcomments:       Extract video comments. This will not be written to disk
                        unless writeinfojson is also given
     writeannotations:  Write the video annotations to a .annotations.xml file
     writethumbnail:    Write the thumbnail image to a file
@@ -250,7 +265,7 @@ class YoutubeDL(object):
     writedesktoplink:  Write a Linux internet shortcut file (.desktop)
     writesubtitles:    Write the video subtitles to a file
     writeautomaticsub: Write the automatically generated subtitles to a file
-    allsubtitles:      Deprecated - Use subtitlelangs = ['all']
+    allsubtitles:      Deprecated - Use subtitleslangs = ['all']
                        Downloads all the subtitles of the video
                        (requires writesubtitles or writeautomaticsub)
     listsubtitles:     Lists all available subtitles for the video
@@ -284,6 +299,9 @@ class YoutubeDL(object):
     break_on_reject:   Stop the download process when encountering a video that
                        has been filtered out.
     cookiefile:        File name where cookies should be read from and dumped to
+    cookiesfrombrowser: A tuple containing the name of the browser and the profile
+                       name/path from where cookies are loaded.
+                       Eg: ('chrome', ) or (vivaldi, 'default')
     nocheckcertificate:Do not verify SSL certificates
     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
                        At the moment, this is only supported by YouTube.
@@ -314,6 +332,7 @@ class YoutubeDL(object):
                        progress, with a dictionary with the entries
                        * status: One of "downloading", "error", or "finished".
                                  Check this first and ignore unknown values.
+                       * info_dict: The extracted info_dict

                        If status is one of "downloading", or "finished", the
                        following properties may also be present:
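
For orientation, a progress hook as documented above now also receives the extracted info_dict. A minimal, illustrative sketch (the hook body is a hypothetical example, not part of the patch):

    def my_hook(d):
        if d['status'] == 'finished':
            # d['info_dict'] is the extracted metadata of the current video
            print('Done downloading', d['info_dict'].get('title'))

    # ydl = yt_dlp.YoutubeDL({'progress_hooks': [my_hook]})
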
@@ -389,17 +408,16 @@ class YoutubeDL(object):
                        if True, otherwise use ffmpeg/avconv if False, otherwise
                        use downloader suggested by extractor if None.
     compat_opts:       Compatibility options. See "Differences in default behavior".
-                       Note that only format-sort, format-spec, no-live-chat,
-                       no-attach-info-json, playlist-index, list-formats,
-                       no-direct-merge, no-youtube-channel-redirect,
-                       and no-youtube-unavailable-videos works when used via the API
+                       The following options do not work when used through the API:
+                       filename, abort-on-error, multistreams, no-live-chat,
+                       no-clean-infojson, no-playlist-metafiles, no-keep-subs.
+                       Refer __init__.py for their implementation

     The following parameters are not used by YoutubeDL itself, they are used by
     the downloader (see yt_dlp/downloader/common.py):
-    nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
-    noresizebuffer, retries, continuedl, noprogress, consoletitle,
-    xattr_set_filesize, external_downloader_args, hls_use_mpegts,
-    http_chunk_size.
+    nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize,
+    max_filesize, test, noresizebuffer, retries, continuedl, noprogress, consoletitle,
+    xattr_set_filesize, external_downloader_args, hls_use_mpegts, http_chunk_size.

     The following options are used by the post processors:
     prefer_ffmpeg:     If False, use avconv instead of ffmpeg if both are available,
@@ -407,21 +425,28 @@ class YoutubeDL(object):
     ffmpeg_location:   Location of the ffmpeg/avconv binary; either the path
                        to the binary or its containing directory.
     postprocessor_args: A dictionary of postprocessor/executable keys (in lower case)
-                        and a list of additional command-line arguments for the
-                        postprocessor/executable. The dict can also have "PP+EXE" keys
-                        which are used when the given exe is used by the given PP.
-                        Use 'default' as the name for arguments to passed to all PP
+                       and a list of additional command-line arguments for the
+                       postprocessor/executable. The dict can also have "PP+EXE" keys
+                       which are used when the given exe is used by the given PP.
+                       Use 'default' as the name for arguments to passed to all PP
+                       For compatibility with youtube-dl, a single list of args
+                       can also be used

     The following options are used by the extractors:
     extractor_retries: Number of times to retry for known errors
     dynamic_mpd:       Whether to process dynamic DASH manifests (default: True)
     hls_split_discontinuity: Split HLS playlists to different formats at
                        discontinuities such as ad breaks (default: False)
-    youtube_include_dash_manifest: If True (default), DASH manifests and related
+    extractor_args:    A dictionary of arguments to be passed to the extractors.
+                       See "EXTRACTOR ARGUMENTS" for details.
+                       Eg: {'youtube': {'skip': ['dash', 'hls']}}
+    youtube_include_dash_manifest: Deprecated - Use extractor_args instead.
+                       If True (default), DASH manifests and related
                        data will be downloaded and processed by extractor.
                        You can reduce network I/O by disabling it if you don't
                        care about DASH. (only for youtube)
-    youtube_include_hls_manifest: If True (default), HLS manifests and related
+    youtube_include_hls_manifest: Deprecated - Use extractor_args instead.
+                       If True (default), HLS manifests and related
                        data will be downloaded and processed by extractor.
                        You can reduce network I/O by disabling it if you don't
                        care about HLS. (only for youtube)
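
Everything documented above is an ordinary key of the params dict passed to the YoutubeDL constructor. A minimal usage sketch (the option values are arbitrary illustrations, not defaults):

    import yt_dlp

    params = {
        'simulate': None,              # None: simulate only when a list* option is used
        'ignoreerrors': 'only_download',
        'check_formats': None,         # test downloadability only if the extractor asks
        'cookiesfrombrowser': ('chrome', ),
        'extractor_args': {'youtube': {'skip': ['dash', 'hls']}},
        'outtmpl': {'default': '%(title)s.%(ext)s'},
    }
    with yt_dlp.YoutubeDL(params) as ydl:
        ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])
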
@@ -439,9 +464,9 @@ class YoutubeDL(object):
     ))

     params = None
-    _ies = []
+    _ies = {}
     _pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
-    __prepare_filename_warned = False
+    _printed_messages = set()
     _first_webpage_request = True
     _download_retcode = None
     _num_downloads = None
@@ -453,10 +478,10 @@ def __init__(self, params=None, auto_init=True):
         """Create a FileDownloader object with the given options."""
         if params is None:
             params = {}
-        self._ies = []
+        self._ies = {}
         self._ies_instances = {}
         self._pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
-        self.__prepare_filename_warned = False
+        self._printed_messages = set()
         self._first_webpage_request = True
         self._post_hooks = []
         self._progress_hooks = []
@@ -475,6 +500,12 @@ def __init__(self, params=None, auto_init=True):
             self.report_warning(
                 'Python version %d.%d is not supported! Please update to Python 3.6 or above' % sys.version_info[:2])

+        if self.params.get('allow_unplayable_formats'):
+            self.report_warning(
+                'You have asked for unplayable formats to be listed/downloaded. '
+                'This is a developer option intended for debugging. '
+                'If you experience any issues while using this option, DO NOT open a bug report')
+
         def check_deprecated(param, option, suggestion):
             if self.params.get(param) is not None:
                 self.report_warning('%s is deprecated. Use %s instead' % (option, suggestion))
@@ -492,13 +523,15 @@ def check_deprecated(param, option, suggestion):
         for msg in self.params.get('warnings', []):
             self.report_warning(msg)

-        if self.params.get('final_ext'):
-            if self.params.get('merge_output_format'):
-                self.report_warning('--merge-output-format will be ignored since --remux-video or --recode-video is given')
-            self.params['merge_output_format'] = self.params['final_ext']
-
-        if 'overwrites' in self.params and self.params['overwrites'] is None:
-            del self.params['overwrites']
+        if self.params.get('overwrites') is None:
+            self.params.pop('overwrites', None)
+        elif self.params.get('nooverwrites') is not None:
+            # nooverwrites was unnecessarily changed to overwrites
+            # in 0c3d0f51778b153f65c21906031c2e091fcfb641
+            # This ensures compatibility with both keys
+            self.params['overwrites'] = not self.params['nooverwrites']
+        else:
+            self.params['nooverwrites'] = not self.params['overwrites']

         if params.get('bidi_workaround', False):
             try:
@@ -569,14 +602,9 @@ def preload_download_archive(fn):
             self.add_default_info_extractors()

         for pp_def_raw in self.params.get('postprocessors', []):
-            pp_class = get_postprocessor(pp_def_raw['key'])
             pp_def = dict(pp_def_raw)
-            del pp_def['key']
-            if 'when' in pp_def:
-                when = pp_def['when']
-                del pp_def['when']
-            else:
-                when = 'post_process'
+            when = pp_def.pop('when', 'post_process')
+            pp_class = get_postprocessor(pp_def.pop('key'))
             pp = pp_class(self, **compat_kwargs(pp_def))
             self.add_post_processor(pp, when=when)

@@ -606,11 +634,19 @@ def warn_if_short_id(self, argv):

     def add_info_extractor(self, ie):
         """Add an InfoExtractor object to the end of the list."""
-        self._ies.append(ie)
+        ie_key = ie.ie_key()
+        self._ies[ie_key] = ie
         if not isinstance(ie, type):
-            self._ies_instances[ie.ie_key()] = ie
+            self._ies_instances[ie_key] = ie
             ie.set_downloader(self)

+    def _get_info_extractor_class(self, ie_key):
+        ie = self._ies.get(ie_key)
+        if ie is None:
+            ie = get_info_extractor(ie_key)
+            self.add_info_extractor(ie)
+        return ie
+
     def get_info_extractor(self, ie_key):
         """
         Get an instance of an IE with name ie_key, it will try to get one from
@@ -656,8 +692,12 @@ def _bidi_workaround(self, message):
             for _ in range(line_count))
         return res[:-len('\n')]

-    def _write_string(self, s, out=None):
-        write_string(s, out=out, encoding=self.params.get('encoding'))
+    def _write_string(self, message, out=None, only_once=False):
+        if only_once:
+            if message in self._printed_messages:
+                return
+            self._printed_messages.add(message)
+        write_string(message, out=out, encoding=self.params.get('encoding'))

     def to_stdout(self, message, skip_eol=False, quiet=False):
         """Print message to stdout"""
@@ -668,13 +708,13 @@ def to_stdout(self, message, skip_eol=False, quiet=False):
             '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')),
             self._err_file if quiet else self._screen_file)

-    def to_stderr(self, message):
+    def to_stderr(self, message, only_once=False):
         """Print message to stderr"""
         assert isinstance(message, compat_str)
         if self.params.get('logger'):
             self.params['logger'].error(message)
         else:
-            self._write_string('%s\n' % self._bidi_workaround(message), self._err_file)
+            self._write_string('%s\n' % self._bidi_workaround(message), self._err_file, only_once=only_once)

     def to_console_title(self, message):
         if not self.params.get('consoletitle', False):
@@ -690,7 +730,7 @@ def to_console_title(self, message):
     def save_console_title(self):
         if not self.params.get('consoletitle', False):
             return
-        if self.params.get('simulate', False):
+        if self.params.get('simulate'):
             return
         if compat_os_name != 'nt' and 'TERM' in os.environ:
             # Save the title on stack
@@ -699,7 +739,7 @@ def save_console_title(self):
     def restore_console_title(self):
         if not self.params.get('consoletitle', False):
             return
-        if self.params.get('simulate', False):
+        if self.params.get('simulate'):
             return
         if compat_os_name != 'nt' and 'TERM' in os.environ:
             # Restore the title from stack
@@ -738,7 +778,7 @@ def trouble(self, message=None, tb=None):
                 tb = ''.join(tb_data)
             if tb:
                 self.to_stderr(tb)
-        if not self.params.get('ignoreerrors', False):
+        if not self.params.get('ignoreerrors'):
             if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                 exc_info = sys.exc_info()[1].exc_info
             else:
@@ -751,7 +791,7 @@ def to_screen(self, message, skip_eol=False):
         self.to_stdout(
             message, skip_eol, quiet=self.params.get('quiet', False))

-    def report_warning(self, message):
+    def report_warning(self, message, only_once=False):
         '''
         Print the message to stderr, it will be prefixed with 'WARNING:'
         If stderr is a tty file the 'WARNING:' will be colored
@@ -766,7 +806,7 @@ def report_warning(self, message):
         else:
             _msg_header = 'WARNING:'
         warning_message = '%s %s' % (_msg_header, message)
-        self.to_stderr(warning_message)
+        self.to_stderr(warning_message, only_once)

     def report_error(self, message, tb=None):
         '''
@@ -780,7 +820,7 @@ def report_error(self, message, tb=None):
         error_message = '%s %s' % (_msg_header, message)
         self.trouble(error_message, tb)

-    def write_debug(self, message):
+    def write_debug(self, message, only_once=False):
         '''Log debug message or Print message to stderr'''
         if not self.params.get('verbose', False):
             return
@@ -788,7 +828,7 @@ def write_debug(self, message):
         if self.params.get('logger'):
             self.params['logger'].debug(message)
         else:
-            self._write_string('%s\n' % message)
+            self.to_stderr(message, only_once)

     def report_file_already_downloaded(self, file_name):
         """Report file has already been fully downloaded."""
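
A note on the postprocessor instantiation loop rewritten above (@@ -569): each entry of the 'postprocessors' parameter is a dict whose reserved keys are popped before the rest is forwarded to the PP constructor. A conforming entry might look like this (the FFmpegExtractAudio values are just an example):

    params = {
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',  # looked up via get_postprocessor(...)
            'when': 'post_process',       # popped with pp_def.pop('when', 'post_process')
            'preferredcodec': 'mp3',      # every remaining key becomes a constructor kwarg
        }],
    }
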
@@ -804,13 +844,23 @@ def report_file_delete(self, file_name):
         except UnicodeEncodeError:
             self.to_screen('Deleting existing file')

+    def raise_no_formats(self, info, forced=False):
+        has_drm = info.get('__has_drm')
+        msg = 'This video is DRM protected' if has_drm else 'No video formats found!'
+        expected = self.params.get('ignore_no_formats_error')
+        if forced or not expected:
+            raise ExtractorError(msg, video_id=info['id'], ie=info['extractor'],
+                                 expected=has_drm or expected)
+        else:
+            self.report_warning(msg)
+
     def parse_outtmpl(self):
         outtmpl_dict = self.params.get('outtmpl', {})
         if not isinstance(outtmpl_dict, dict):
             outtmpl_dict = {'default': outtmpl_dict}
         outtmpl_dict.update({
             k: v for k, v in DEFAULT_OUTTMPL.items()
-            if not outtmpl_dict.get(k)})
+            if outtmpl_dict.get(k) is None})
         for key, val in outtmpl_dict.items():
             if isinstance(val, bytes):
                 self.report_warning(
@@ -834,28 +884,52 @@ def get_output_path(self, dir_type='', filename=None):
         return sanitize_path(path, force=self.params.get('windowsfilenames'))

     @staticmethod
-    def validate_outtmpl(tmpl):
+    def _outtmpl_expandpath(outtmpl):
+        # expand_path translates '%%' into '%' and '$$' into '$'
+        # correspondingly that is not what we want since we need to keep
+        # '%%' intact for template dict substitution step. Working around
+        # with boundary-alike separator hack.
+        sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
+        outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
+
+        # outtmpl should be expand_path'ed before template dict substitution
+        # because meta fields may contain env variables we don't want to
+        # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
+        # title "Hello $PATH", we don't want `$PATH` to be expanded.
+        return expand_path(outtmpl).replace(sep, '')
+
+    @staticmethod
+    def escape_outtmpl(outtmpl):
+        ''' Escape any remaining strings like %s, %abc% etc. '''
+        return re.sub(
+            STR_FORMAT_RE_TMPL.format('', '(?![%(\0])'),
+            lambda mobj: ('' if mobj.group('has_key') else '%') + mobj.group(0),
+            outtmpl)
+
+    @classmethod
+    def validate_outtmpl(cls, outtmpl):
         ''' @return None or Exception object '''
+        outtmpl = re.sub(
+            STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBU]'),
+            lambda mobj: f'{mobj.group(0)[:-1]}s',
+            cls._outtmpl_expandpath(outtmpl))
         try:
-            re.sub(
-                STR_FORMAT_RE.format(''),
-                lambda mobj: ('%' if not mobj.group('has_key') else '') + mobj.group(0),
-                tmpl
-            ) % collections.defaultdict(int)
+            cls.escape_outtmpl(outtmpl) % collections.defaultdict(int)
             return None
         except ValueError as err:
             return err

     def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None):
-        """ Make the template and info_dict suitable for substitution (outtmpl % info_dict)"""
-        info_dict = dict(info_dict)
-        na = self.params.get('outtmpl_na_placeholder', 'NA')
+        """ Make the template and info_dict suitable for substitution : ydl.outtmpl_escape(outtmpl) % info_dict """
+        info_dict.setdefault('epoch', int(time.time()))  # keep epoch consistent once set
+        info_dict = dict(info_dict)  # Do not sanitize so as not to consume LazyList
+        for key in ('__original_infodict', '__postprocessors'):
+            info_dict.pop(key, None)
         info_dict['duration_string'] = (  # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
             formatSeconds(info_dict['duration'], '-' if sanitize else ':')
             if info_dict.get('duration', None) is not None
             else None)
-        info_dict['epoch'] = int(time.time())
         info_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
         if info_dict.get('resolution') is None:
             info_dict['resolution'] = self.format_resolution(info_dict, default=None)
@@ -868,14 +942,14 @@ def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None):
         }

         TMPL_DICT = {}
-        EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE.format('[^)]*'))
+        EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBU]'))
         MATH_FUNCTIONS = {
             '+': float.__add__,
             '-': float.__sub__,
         }
         # Field is of the form key1.key2...
         # where keys (except first) can be string, int or slice
-        FIELD_RE = r'\w+(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)')
+        FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)')
         MATH_FIELD_RE = r'''{field}|{num}'''.format(field=FIELD_RE, num=r'-?\d+(?:.\d+)?')
         MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys()))
         INTERNAL_FORMAT_RE = re.compile(r'''(?x)
@@ -883,15 +957,19 @@ def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None):
             (?P<fields>{field})
             (?P<maths>(?:{math_op}{math_field})*)
             (?:>(?P<strf_format>.+?))?
+            (?P<alternate>(?<!\\),[^)]*)?
             (?:\|(?P<default>.*?))?
             $'''.format(field=FIELD_RE, math_op=MATH_OPERATORS_RE, math_field=MATH_FIELD_RE))

-        get_key = lambda k: traverse_obj(
-            info_dict, k.split('.'), is_user_input=True, traverse_string=True)
+        def _traverse_infodict(k):
+            k = k.split('.')
+            if k[0] == '':
+                k.pop(0)
+            return traverse_obj(info_dict, k, is_user_input=True, traverse_string=True)

         def get_value(mdict):
             # Object traversal
-            value = get_key(mdict['fields'])
+            value = _traverse_infodict(mdict['fields'])
             # Negative
             if mdict['negate']:
                 value = float_or_none(value)
@@ -913,7 +991,7 @@ def get_value(mdict):
                     item, multiplier = (item[1:], -1) if item[0] == '-' else (item, 1)
                     offset = float_or_none(item)
                     if offset is None:
-                        offset = float_or_none(get_key(item))
+                        offset = float_or_none(_traverse_infodict(item))
                     try:
                         value = operator(value, multiplier * offset)
                     except (TypeError, ZeroDivisionError):
@@ -921,49 +999,77 @@ def get_value(mdict):
                 operator = None
             # Datetime formatting
             if mdict['strf_format']:
-                value = strftime_or_none(value, mdict['strf_format'])
+                value = strftime_or_none(value, mdict['strf_format'].replace('\\,', ','))

             return value

+        na = self.params.get('outtmpl_na_placeholder', 'NA')
+
+        def _dumpjson_default(obj):
+            if isinstance(obj, (set, LazyList)):
+                return list(obj)
+            raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')
+
         def create_key(outer_mobj):
             if not outer_mobj.group('has_key'):
-                return '%{}'.format(outer_mobj.group(0))
-
+                return f'%{outer_mobj.group(0)}'
             key = outer_mobj.group('key')
-            fmt = outer_mobj.group('format')
             mobj = re.match(INTERNAL_FORMAT_RE, key)
-            if mobj is None:
-                value, default = None, na
-            else:
+            initial_field = mobj.group('fields').split('.')[-1] if mobj else ''
+            value, default = None, na
+            while mobj:
                 mobj = mobj.groupdict()
-                default = mobj['default'] if mobj['default'] is not None else na
+                default = mobj['default'] if mobj['default'] is not None else default
                 value = get_value(mobj)
+                if value is None and mobj['alternate']:
+                    mobj = re.match(INTERNAL_FORMAT_RE, mobj['alternate'][1:])
+                else:
+                    break

+            fmt = outer_mobj.group('format')
             if fmt == 's' and value is not None and key in field_size_compat_map.keys():
                 fmt = '0{:d}d'.format(field_size_compat_map[key])

             value = default if value is None else value
-            key += '\0%s' % fmt

-            if fmt == 'c':
-                value = compat_str(value)
-                if value is None:
-                    value, fmt = default, 's'
+            str_fmt = f'{fmt[:-1]}s'
+            if fmt[-1] == 'l':  # list
+                delim = '\n' if '#' in (outer_mobj.group('conversion') or '') else ', '
+                value, fmt = delim.join(variadic(value)), str_fmt
+            elif fmt[-1] == 'j':  # json
+                value, fmt = json.dumps(value, default=_dumpjson_default), str_fmt
+            elif fmt[-1] == 'q':  # quoted
+                value, fmt = compat_shlex_quote(str(value)), str_fmt
+            elif fmt[-1] == 'B':  # bytes
+                value = f'%{str_fmt}'.encode('utf-8') % str(value).encode('utf-8')
+                value, fmt = value.decode('utf-8', 'ignore'), 's'
+            elif fmt[-1] == 'U':  # unicode normalized
+                opts = outer_mobj.group('conversion') or ''
+                value, fmt = unicodedata.normalize(
+                    # "+" = compatibility equivalence, "#" = NFD
+                    'NF%s%s' % ('K' if '+' in opts else '', 'D' if '#' in opts else 'C'),
+                    value), str_fmt
+            elif fmt[-1] == 'c':
+                if value:
+                    value = str(value)[0]
                 else:
-                    value = value[0]
+                    fmt = str_fmt
             elif fmt[-1] not in 'rs':  # numeric
                 value = float_or_none(value)
                 if value is None:
                     value, fmt = default, 's'
+
             if sanitize:
                 if fmt[-1] == 'r':
                     # If value is an object, sanitize might convert it to a string
                     # So we convert it to repr first
-                    value, fmt = repr(value), '%ss' % fmt[:-1]
+                    value, fmt = repr(value), str_fmt
                 if fmt[-1] in 'csr':
-                    value = sanitize(key, value)
+                    value = sanitize(initial_field, value)
+
+            key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format'))
             TMPL_DICT[key] = value
-            return '%({key}){fmt}'.format(key=key, fmt=fmt)
+            return '{prefix}%({key}){fmt}'.format(key=key, fmt=fmt, prefix=outer_mobj.group('prefix'))

         return EXTERNAL_FORMAT_RE.sub(create_key, outtmpl), TMPL_DICT
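
The conversions handled in create_key above (l, j, q, B, U), together with alternate fields and defaults, are reachable from ordinary output templates. A few illustrative specimens (field names assumed present in the info dict):

    '%(title)s.%(ext)s'            # plain field
    '%(tags)l'                     # join a list: 'a, b, c' (%(tags)#l joins with newlines)
    '%(formats)j'                  # serialize any object as JSON
    '%(title)q'                    # shell-quote via compat_shlex_quote
    '%(title)+U'                   # NFKC-normalize ('#' selects the decomposed forms)
    '%(artist,creator,uploader)s'  # alternate fields: first non-empty one wins
    '%(uploader|Unknown)s'         # literal default when the field is missing
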
@@ -975,22 +1081,11 @@ def _prepare_filename(self, info_dict, tmpl_type='default'):
                 is_id=(k == 'id' or k.endswith('_id')))
             outtmpl = self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default'])
             outtmpl, template_dict = self.prepare_outtmpl(outtmpl, info_dict, sanitize)
-
-            # expand_path translates '%%' into '%' and '$$' into '$'
-            # correspondingly that is not what we want since we need to keep
-            # '%%' intact for template dict substitution step. Working around
-            # with boundary-alike separator hack.
-            sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
-            outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
-
-            # outtmpl should be expand_path'ed before template dict substitution
-            # because meta fields may contain env variables we don't want to
-            # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
-            # title "Hello $PATH", we don't want `$PATH` to be expanded.
-            filename = expand_path(outtmpl).replace(sep, '') % template_dict
+            outtmpl = self.escape_outtmpl(self._outtmpl_expandpath(outtmpl))
+            filename = outtmpl % template_dict

             force_ext = OUTTMPL_TYPES.get(tmpl_type)
-            if force_ext is not None:
+            if filename and force_ext is not None:
                 filename = replace_extension(filename, force_ext, info_dict.get('ext'))

             # https://github.com/blackjack4494/youtube-dlc/issues/85
@@ -1012,15 +1107,16 @@ def prepare_filename(self, info_dict, dir_type='', warn=False):
         """Generate the output filename."""

         filename = self._prepare_filename(info_dict, dir_type or 'default')
+        if not filename and dir_type not in ('', 'temp'):
+            return ''

-        if warn and not self.__prepare_filename_warned:
+        if warn:
             if not self.params.get('paths'):
                 pass
             elif filename == '-':
-                self.report_warning('--paths is ignored when an outputting to stdout')
+                self.report_warning('--paths is ignored when an outputting to stdout', only_once=True)
             elif os.path.isabs(filename):
-                self.report_warning('--paths is ignored since an absolute path is given in output template')
-            self.__prepare_filename_warned = True
+                self.report_warning('--paths is ignored since an absolute path is given in output template', only_once=True)

         if filename == '-' or not filename:
             return filename
@@ -1059,12 +1155,15 @@ def check_filter():
             if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
                 return 'Skipping "%s" because it is age restricted' % video_title

-            if not incomplete:
-                match_filter = self.params.get('match_filter')
-                if match_filter is not None:
-                    ret = match_filter(info_dict)
-                    if ret is not None:
-                        return ret
+            match_filter = self.params.get('match_filter')
+            if match_filter is not None:
+                try:
+                    ret = match_filter(info_dict, incomplete=incomplete)
+                except TypeError:
+                    # For backward compatibility
+                    ret = None if incomplete else match_filter(info_dict)
+                if ret is not None:
+                    return ret
             return None

         if self.in_download_archive(info_dict):
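
As the match_filter hunk above shows, user-supplied filters are now called with an incomplete= keyword, with a TypeError fallback for old one-argument callables. A compatible filter might look like this (the duration cutoff is an arbitrary example):

    def my_match_filter(info_dict, incomplete=False):
        # Return None to accept the video, or a string giving the skip reason
        if incomplete:
            return None  # metadata is partial; don't reject yet
        if (info_dict.get('duration') or 0) > 3600:
            return 'Skipping videos longer than an hour'
        return None

    # ydl = yt_dlp.YoutubeDL({'match_filter': my_match_filter})
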
@@ -1086,7 +1185,7 @@ def add_extra_info(info_dict, extra_info):
         for key, value in extra_info.items():
             info_dict.setdefault(key, value)

-    def extract_info(self, url, download=True, ie_key=None, extra_info={},
+    def extract_info(self, url, download=True, ie_key=None, extra_info=None,
                      process=True, force_generic_extractor=False):
         """
         Return a list with a dictionary for each video extracted.
@@ -1103,39 +1202,36 @@ def extract_info(self, url, download=True, ie_key=None, extra_info={},
         force_generic_extractor -- force using the generic extractor
         """

+        if extra_info is None:
+            extra_info = {}
+
         if not ie_key and force_generic_extractor:
             ie_key = 'Generic'

         if ie_key:
-            ies = [self.get_info_extractor(ie_key)]
+            ies = {ie_key: self._get_info_extractor_class(ie_key)}
         else:
             ies = self._ies

-        for ie in ies:
+        for ie_key, ie in ies.items():
             if not ie.suitable(url):
                 continue

-            ie_key = ie.ie_key()
-            ie = self.get_info_extractor(ie_key)
             if not ie.working():
                 self.report_warning('The program functionality for this site has been marked as broken, '
                                     'and will probably not work.')

-            try:
-                temp_id = str_or_none(
-                    ie.extract_id(url) if callable(getattr(ie, 'extract_id', None))
-                    else ie._match_id(url))
-            except (AssertionError, IndexError, AttributeError):
-                temp_id = None
+            temp_id = ie.get_temp_id(url)
             if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
                 self.to_screen("[%s] %s: has already been recorded in archive" % (
                     ie_key, temp_id))
                 break
-            return self.__extract_info(url, ie, download, extra_info, process)
+            return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process)
         else:
             self.report_error('no suitable InfoExtractor for URL %s' % url)

     def __handle_extraction_exceptions(func):
+
         def wrapper(self, *args, **kwargs):
             try:
                 return func(self, *args, **kwargs)
@@ -1148,10 +1244,14 @@ def wrapper(self, *args, **kwargs):
                 self.report_error(msg)
             except ExtractorError as e:  # An error we somewhat expected
                 self.report_error(compat_str(e), e.format_traceback())
-            except (MaxDownloadsReached, ExistingVideoReached, RejectedVideoReached):
+            except ThrottledDownload:
+                self.to_stderr('\r')
+                self.report_warning('The download speed is below throttle limit. Re-extracting data')
+                return wrapper(self, *args, **kwargs)
+            except (MaxDownloadsReached, ExistingVideoReached, RejectedVideoReached, LazyList.IndexError):
                 raise
             except Exception as e:
-                if self.params.get('ignoreerrors', False):
+                if self.params.get('ignoreerrors'):
                     self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
                 else:
                     raise
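
The new ThrottledDownload branch above retries the whole extraction by calling the wrapper recursively. Stripped of the other handlers, the decorator pattern is roughly (simplified sketch, not the exact method above):

    def handle_extraction_exceptions(func):
        def wrapper(self, *args, **kwargs):
            try:
                return func(self, *args, **kwargs)
            except ThrottledDownload:
                self.report_warning('The download speed is below throttle limit. Re-extracting data')
                return wrapper(self, *args, **kwargs)  # unbounded retry, as in the original
        return wrapper
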
@@ -1168,6 +1268,8 @@ def __extract_info(self, url, ie, download, extra_info, process):
                 '_type': 'compat_list',
                 'entries': ie_result,
             }
+        if extra_info.get('original_url'):
+            ie_result.setdefault('original_url', extra_info['original_url'])
         self.add_default_extra_info(ie_result, ie, url)
         if process:
             return self.process_ie_result(ie_result, download, extra_info)
@@ -1175,15 +1277,19 @@ def __extract_info(self, url, ie, download, extra_info, process):
             return ie_result

     def add_default_extra_info(self, ie_result, ie, url):
-        self.add_extra_info(ie_result, {
-            'extractor': ie.IE_NAME,
-            'webpage_url': url,
-            'original_url': url,
-            'webpage_url_basename': url_basename(url),
-            'extractor_key': ie.ie_key(),
-        })
-
-    def process_ie_result(self, ie_result, download=True, extra_info={}):
+        if url is not None:
+            self.add_extra_info(ie_result, {
+                'webpage_url': url,
+                'original_url': url,
+                'webpage_url_basename': url_basename(url),
+            })
+        if ie is not None:
+            self.add_extra_info(ie_result, {
+                'extractor': ie.IE_NAME,
+                'extractor_key': ie.ie_key(),
+            })
+
+    def process_ie_result(self, ie_result, download=True, extra_info=None):
         """
         Take the result of the ie(may be modified) and resolve all unresolved
         references (URLs, playlist items).
@@ -1191,18 +1297,27 @@ def process_ie_result(self, ie_result, download=True, extra_info={}):
         It will also download the videos if 'download'.
         Returns the resolved ie_result.
         """
+        if extra_info is None:
+            extra_info = {}
+
         result_type = ie_result.get('_type', 'video')

         if result_type in ('url', 'url_transparent'):
             ie_result['url'] = sanitize_url(ie_result['url'])
+            if ie_result.get('original_url'):
+                extra_info.setdefault('original_url', ie_result['original_url'])
+
             extract_flat = self.params.get('extract_flat', False)
             if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
                     or extract_flat is True):
                 info_copy = ie_result.copy()
+                ie = try_get(ie_result.get('ie_key'), self.get_info_extractor)
+                if ie and not ie_result.get('id'):
+                    info_copy['id'] = ie.get_temp_id(ie_result['url'])
+                self.add_default_extra_info(info_copy, ie, ie_result['url'])
                 self.add_extra_info(info_copy, extra_info)
-                self.add_default_extra_info(
-                    info_copy, self.get_info_extractor(ie_result.get('ie_key')), ie_result['url'])
                 self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True)
+                if self.params.get('force_write_download_archive', False):
+                    self.record_download_archive(info_copy)
                 return ie_result

         if result_type == 'video':
@@ -1210,7 +1325,7 @@ def process_ie_result(self, ie_result, download=True, extra_info={}):
             ie_result = self.process_video_result(ie_result, download=download)
             additional_urls = (ie_result or {}).get('additional_urls')
             if additional_urls:
-                # TODO: Improve MetadataFromFieldPP to allow setting a list
+                # TODO: Improve MetadataParserPP to allow setting a list
                 if isinstance(additional_urls, compat_str):
                     additional_urls = [additional_urls]
                 self.to_screen(
@@ -1286,15 +1401,12 @@ def process_ie_result(self, ie_result, download=True, extra_info={}):
                                     'It needs to be updated.' % ie_result.get('extractor'))

                 def _fixup(r):
-                    self.add_extra_info(
-                        r,
-                        {
-                            'extractor': ie_result['extractor'],
-                            'webpage_url': ie_result['webpage_url'],
-                            'webpage_url_basename': url_basename(ie_result['webpage_url']),
-                            'extractor_key': ie_result['extractor_key'],
-                        }
-                    )
+                    self.add_extra_info(r, {
+                        'extractor': ie_result['extractor'],
+                        'webpage_url': ie_result['webpage_url'],
+                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
+                        'extractor_key': ie_result['extractor_key'],
+                    })
                     return r

                 ie_result['entries'] = [
                     self.process_ie_result(_fixup(r), download, extra_info)
@@ -1348,16 +1460,29 @@ def iter_playlistitems(format):
             msg = (
                 'Downloading %d videos' if not isinstance(ie_entries, list)
                 else 'Collected %d videos; downloading %%d of them' % len(ie_entries))
-            if not isinstance(ie_entries, (list, PagedList)):
-                ie_entries = LazyList(ie_entries)
+
+            if isinstance(ie_entries, list):
+                def get_entry(i):
+                    return ie_entries[i - 1]
+            else:
+                if not isinstance(ie_entries, PagedList):
+                    ie_entries = LazyList(ie_entries)
+
+                def get_entry(i):
+                    return YoutubeDL.__handle_extraction_exceptions(
+                        lambda self, i: ie_entries[i - 1]
+                    )(self, i)

             entries = []
-            for i in playlistitems or itertools.count(playliststart):
+            items = playlistitems if playlistitems is not None else itertools.count(playliststart)
+            for i in items:
+                if i == 0:
+                    continue
                 if playlistitems is None and playlistend is not None and playlistend < i:
                     break
                 entry = None
                 try:
-                    entry = ie_entries[i - 1]
+                    entry = get_entry(i)
                     if entry is None:
                         raise EntryNotInPlaylist()
                 except (IndexError, EntryNotInPlaylist):
@@ -1375,7 +1500,7 @@ def iter_playlistitems(format):

             # Save playlist_index before re-ordering
             entries = [
-                ((playlistitems[i - 1] if playlistitems else i), entry)
+                ((playlistitems[i - 1] if playlistitems else i + playliststart - 1), entry)
                 for i, entry in enumerate(entries, 1)
                 if entry is not None]
             n_entries = len(entries)
@@ -1395,38 +1520,14 @@ def iter_playlistitems(format):
             }
             ie_copy.update(dict(ie_result))

-            if self.params.get('writeinfojson', False):
-                infofn = self.prepare_filename(ie_copy, 'pl_infojson')
-                if not self._ensure_dir_exists(encodeFilename(infofn)):
-                    return
-                if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(infofn)):
-                    self.to_screen('[info] Playlist metadata is already present')
-                else:
-                    self.to_screen('[info] Writing playlist metadata as JSON to: ' + infofn)
-                    try:
-                        write_json_file(self.filter_requested_info(ie_result, self.params.get('clean_infojson', True)), infofn)
-                    except (OSError, IOError):
-                        self.report_error('Cannot write playlist metadata to JSON file ' + infofn)
-
+            if self._write_info_json('playlist', ie_result,
+                                     self.prepare_filename(ie_copy, 'pl_infojson')) is None:
+                return
+            if self._write_description('playlist', ie_result,
+                                       self.prepare_filename(ie_copy, 'pl_description')) is None:
+                return
             # TODO: This should be passed to ThumbnailsConvertor if necessary
-            self._write_thumbnails(ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail'))
-
-            if self.params.get('writedescription', False):
-                descfn = self.prepare_filename(ie_copy, 'pl_description')
-                if not self._ensure_dir_exists(encodeFilename(descfn)):
-                    return
-                if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(descfn)):
-                    self.to_screen('[info] Playlist description is already present')
-                elif ie_result.get('description') is None:
-                    self.report_warning('There\'s no playlist description to write.')
-                else:
-                    try:
-                        self.to_screen('[info] Writing playlist description to: ' + descfn)
-                        with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
-                            descfile.write(ie_result['description'])
-                    except (OSError, IOError):
-                        self.report_error('Cannot write playlist description file ' + descfn)
-                        return
+            self._write_thumbnails('playlist', ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail'))

             if self.params.get('playlistreverse', False):
                 entries = entries[::-1]
@@ -1440,8 +1541,8 @@ def iter_playlistitems(format):
             max_failures = self.params.get('skip_playlist_after_errors') or float('inf')
             for i, entry_tuple in enumerate(entries, 1):
                 playlist_index, entry = entry_tuple
-                if 'playlist_index' in self.params.get('compat_options', []):
-                    playlist_index = playlistitems[i - 1] if playlistitems else i
+                if 'playlist-index' in self.params.get('compat_opts', []):
+                    playlist_index = playlistitems[i - 1] if playlistitems else i + playliststart - 1
                 self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                 # This __x_forwarded_for_ip thing is a bit ugly but requires
                 # minimal changes
@@ -1552,7 +1653,7 @@ def can_merge():
             return merger.available and merger.can_merge()

         prefer_best = (
-            not self.params.get('simulate', False)
+            not self.params.get('simulate')
            and download
            and (
                not can_merge()
@@ -1691,12 +1792,16 @@ def _merge(formats_pair):
            formats_info.extend(format_2.get('requested_formats', (format_2,)))

        if not allow_multiple_streams['video'] or not allow_multiple_streams['audio']:
-            get_no_more = {"video": False, "audio": False}
+            get_no_more = {'video': False, 'audio': False}
            for (i, fmt_info) in enumerate(formats_info):
-                for aud_vid in ["audio", "video"]:
+                if fmt_info.get('acodec') == fmt_info.get('vcodec') == 'none':
+                    formats_info.pop(i)
+                    continue
+                for aud_vid in ['audio', 'video']:
                    if not allow_multiple_streams[aud_vid] and fmt_info.get(aud_vid[0] + 'codec') != 'none':
                        if get_no_more[aud_vid]:
                            formats_info.pop(i)
+                            break
                        get_no_more[aud_vid] = True

        if len(formats_info) == 1:
@@ -1744,6 +1849,9 @@ def _merge(formats_pair):
            return new_dict

        def _check_formats(formats):
+            if not check_formats:
+                yield from formats
+                return
            for f in formats:
                self.to_screen('[info] Testing format %s' % f['format_id'])
                temp_file = tempfile.NamedTemporaryFile(
@@ -1751,16 +1859,16 @@ def _check_formats(formats):
                    dir=self.get_output_path('temp') or None)
                temp_file.close()
                try:
-                    dl, _ = self.dl(temp_file.name, f, test=True)
-                except (ExtractorError, IOError, OSError, ValueError) + network_exceptions:
-                    dl = False
+                    success, _ = self.dl(temp_file.name, f, test=True)
+                except (DownloadError, IOError, OSError, ValueError) + network_exceptions:
+                    success = False
                finally:
                    if os.path.exists(temp_file.name):
                        try:
                            os.remove(temp_file.name)
                        except OSError:
                            self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
-                if dl:
+                if success:
                    yield f
                else:
                    self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])
@@ -1771,8 +1879,7 @@ def _build_selector_function(selector):

            def selector_function(ctx):
                for f in fs:
-                    for format in f(ctx):
-                        yield format
+                    yield from f(ctx)
            return selector_function

        elif selector.type == GROUP:  # ()
@@ -1788,17 +1895,21 @@ def selector_function(ctx):
                        return picked_formats
                return []

+        elif selector.type == MERGE:  # +
+            selector_1, selector_2 = map(_build_selector_function, selector.selector)
+
+            def selector_function(ctx):
+                for pair in itertools.product(
+                        selector_1(copy.deepcopy(ctx)), selector_2(copy.deepcopy(ctx))):
+                    yield _merge(pair)
+
        elif selector.type == SINGLE:  # atom
            format_spec = selector.selector or 'best'

            # TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector
            if format_spec == 'all':
                def selector_function(ctx):
-                    formats = list(ctx['formats'])
-                    if check_formats:
-                        formats = _check_formats(formats)
-                    for f in formats:
-                        yield f
+                    yield from _check_formats(ctx['formats'])
            elif format_spec == 'mergeall':
                def selector_function(ctx):
                    formats = list(_check_formats(ctx['formats']))
@@ -1822,14 +1933,16 @@ def selector_function(ctx):
                format_modified = mobj.group('mod') is not None

                format_fallback = not format_type and not format_modified  # for b, w
-                filter_f = (
+                _filter_f = (
                    (lambda f: f.get('%scodec' % format_type) != 'none')
                    if format_type and format_modified  # bv*, ba*, wv*, wa*
                    else (lambda f: f.get('%scodec' % not_format_type) == 'none')
                    if format_type  # bv, ba, wv, wa
                    else (lambda f: f.get('vcodec') != 'none' and f.get('acodec') != 'none')
                    if not format_modified  # b, w
-                    else None)  # b*, w*
+                    else lambda f: True)  # b*, w*
+                filter_f = lambda f: _filter_f(f) and (
+                    f.get('vcodec') != 'none' or f.get('acodec') != 'none')
            else:
                filter_f = ((lambda f: f.get('ext') == format_spec)
                            if format_spec in ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']  # extension
@@ -1837,29 +1950,17 @@ def selector_function(ctx):

            def selector_function(ctx):
                formats = list(ctx['formats'])
-                if not formats:
-                    return
                matches = list(filter(filter_f, formats)) if filter_f is not None else formats
                if format_fallback and ctx['incomplete_formats'] and not matches:
                    # for extractors with incomplete formats (audio only (soundcloud)
                    # or video only (imgur)) best/worst will fallback to
                    # best/worst {video,audio}-only format
                    matches = formats
-                if format_reverse:
-                    matches = matches[::-1]
-                if check_formats:
-                    matches = list(itertools.islice(_check_formats(matches), format_idx))
-                n = len(matches)
-                if -n <= format_idx - 1 < n:
+                matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1]))
+                try:
                    yield matches[format_idx - 1]
-
-        elif selector.type == MERGE:  # +
-            selector_1, selector_2 = map(_build_selector_function, selector.selector)
-
-            def selector_function(ctx):
-                for pair in itertools.product(
-                        selector_1(copy.deepcopy(ctx)), selector_2(copy.deepcopy(ctx))):
-                    yield _merge(pair)
+                except IndexError:
+                    return

        filters = [self._build_format_filter(f) for f in selector.filters]
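
Taken together, the selector changes above mean that for a spec like 'bv*+ba/b' the MERGE selector pairs each best-video* candidate with each best-audio candidate on deep copies of the context, '/' falls back to 'b' only when the left side yields nothing, and with check_formats enabled the candidates are probed lazily through LazyList, so only as many formats are tested as are needed. From the API this is simply (illustrative):

    ydl = yt_dlp.YoutubeDL({
        'format': 'bv*+ba/b',   # merge best video* with best audio, else best combined
        'check_formats': True,  # probe candidates with a small test download first
    })
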
@@ -1936,15 +2037,27 @@ def _sanitize_thumbnails(self, info_dict):
                 t.get('id') if t.get('id') is not None else '',
                 t.get('url')))

-        def test_thumbnail(t):
-            self.to_screen('[info] Testing thumbnail %s' % t['id'])
-            try:
-                self.urlopen(HEADRequest(t['url']))
-            except network_exceptions as err:
-                self.to_screen('[info] Unable to connect to thumbnail %s URL "%s" - %s. Skipping...' % (
-                    t['id'], t['url'], error_to_compat_str(err)))
-                return False
-            return True
+        def thumbnail_tester():
+            if self.params.get('check_formats'):
+                test_all = True
+                to_screen = lambda msg: self.to_screen(f'[info] {msg}')
+            else:
+                test_all = False
+                to_screen = self.write_debug
+
+            def test_thumbnail(t):
+                if not test_all and not t.get('_test_url'):
+                    return True
+                to_screen('Testing thumbnail %s' % t['id'])
+                try:
+                    self.urlopen(HEADRequest(t['url']))
+                except network_exceptions as err:
+                    to_screen('Unable to connect to thumbnail %s URL "%s" - %s. Skipping...' % (
+                        t['id'], t['url'], error_to_compat_str(err)))
+                    return False
+                return True
+
+            return test_thumbnail

         for i, t in enumerate(thumbnails):
             if t.get('id') is None:
@@ -1952,8 +2065,11 @@ def test_thumbnail(t):
             if t.get('width') and t.get('height'):
                 t['resolution'] = '%dx%d' % (t['width'], t['height'])
             t['url'] = sanitize_url(t['url'])
-        if self.params.get('check_formats'):
-            info_dict['thumbnails'] = reversed(LazyList(filter(test_thumbnail, thumbnails[::-1])))
+
+        if self.params.get('check_formats') is not False:
+            info_dict['thumbnails'] = LazyList(filter(thumbnail_tester(), thumbnails[::-1])).reverse()
+        else:
+            info_dict['thumbnails'] = thumbnails

     def process_video_result(self, info_dict, download=True):
         assert info_dict.get('_type', 'video') == 'video'
@@ -1961,7 +2077,8 @@ def process_video_result(self, info_dict, download=True):
         if 'id' not in info_dict:
             raise ExtractorError('Missing "id" field in extractor result')
         if 'title' not in info_dict:
-            raise ExtractorError('Missing "title" field in extractor result')
+            raise ExtractorError('Missing "title" field in extractor result',
+                                 video_id=info_dict['id'], ie=info_dict['extractor'])

         def report_force_conversion(field, field_not, conversion):
             self.report_warning(
@@ -1993,10 +2110,6 @@ def sanitize_numeric_fields(info):

         self._sanitize_thumbnails(info_dict)

-        if self.params.get('list_thumbnails'):
-            self.list_thumbnails(info_dict)
-            return
-
         thumbnail = info_dict.get('thumbnail')
         thumbnails = info_dict.get('thumbnails')
         if thumbnail:
@@ -2004,7 +2117,7 @@ def sanitize_numeric_fields(info):
         elif thumbnails:
             info_dict['thumbnail'] = thumbnails[-1]['url']

-        if 'display_id' not in info_dict and 'id' in info_dict:
+        if info_dict.get('display_id') is None and 'id' in info_dict:
             info_dict['display_id'] = info_dict['id']

         for ts_key, date_key in (
@@ -2020,6 +2133,23 @@ def sanitize_numeric_fields(info):
             except (ValueError, OverflowError, OSError):
                 pass

+        live_keys = ('is_live', 'was_live')
+        live_status = info_dict.get('live_status')
+        if live_status is None:
+            for key in live_keys:
+                if info_dict.get(key) is False:
+                    continue
+                if info_dict.get(key):
+                    live_status = key
+                    break
+            if all(info_dict.get(key) is False for key in live_keys):
+                live_status = 'not_live'
+        if live_status:
+            info_dict['live_status'] = live_status
+        for key in live_keys:
+            if info_dict.get(key) is None:
+                info_dict[key] = (live_status == key)
+
         # Auto generate title fields corresponding to the *_number fields when missing
         # in order to always have clean titles. This is very common for TV series.
         for field in ('chapter', 'season', 'episode'):
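
The live_status normalization added above (hunk @@ -2020) keeps the boolean is_live/was_live fields and the string live_status mutually consistent. As a standalone sketch of the derivation (simplified model, not the method itself):

    def derive_live_status(info):
        live_keys = ('is_live', 'was_live')
        status = info.get('live_status')
        if status is None:
            status = next((k for k in live_keys if info.get(k)), None)
            if status is None and all(info.get(k) is False for k in live_keys):
                status = 'not_live'
        return status

    assert derive_live_status({'is_live': True}) == 'is_live'
    assert derive_live_status({'is_live': False, 'was_live': False}) == 'not_live'
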
@@ -2039,13 +2169,6 @@ def sanitize_numeric_fields(info):

         automatic_captions = info_dict.get('automatic_captions')
         subtitles = info_dict.get('subtitles')
-        if self.params.get('listsubtitles', False):
-            if 'automatic_captions' in info_dict:
-                self.list_subtitles(
-                    info_dict['id'], automatic_captions, 'automatic captions')
-            self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
-            return
-
         info_dict['requested_subtitles'] = self.process_subtitles(
             info_dict['id'], subtitles, automatic_captions)

@@ -2056,11 +2179,12 @@ def sanitize_numeric_fields(info):
         else:
             formats = info_dict['formats']

+        info_dict['__has_drm'] = any(f.get('has_drm') for f in formats)
+        if not self.params.get('allow_unplayable_formats'):
+            formats = [f for f in formats if not f.get('has_drm')]
+
         if not formats:
-            if not self.params.get('ignore_no_formats_error'):
-                raise ExtractorError('No video formats found!')
-            else:
-                self.report_warning('No video formats found!')
+            self.raise_no_formats(info_dict)

         def is_wellformed(f):
             url = f.get('url')
@@ -2104,7 +2228,7 @@ def is_wellformed(f):
             format['format'] = '{id} - {res}{note}'.format(
                 id=format['format_id'],
                 res=self.format_resolution(format),
-                note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
+                note=format_field(format, 'format_note', ' (%s)'),
             )
             # Automatically determine file extension if missing
             if format.get('ext') is None:
@@ -2124,7 +2248,7 @@ def is_wellformed(f):

         # TODO Central sorting goes here

-        if formats and formats[0] is not info_dict:
+        if not formats or formats[0] is not info_dict:
             # only set the 'formats' fields if the original info_dict list them
             # otherwise we end up with a circular reference, the first (and unique)
             # element in the 'formats' field in info_dict is info_dict itself,
@@ -2133,10 +2257,23 @@ def is_wellformed(f):

         info_dict, _ = self.pre_process(info_dict)

+        if self.params.get('list_thumbnails'):
+            self.list_thumbnails(info_dict)
         if self.params.get('listformats'):
-            if not info_dict.get('formats'):
-                raise ExtractorError('No video formats found', expected=True)
-            self.list_formats(info_dict)
+            if not info_dict.get('formats') and not info_dict.get('url'):
+                self.to_screen('%s has no formats' % info_dict['id'])
+            else:
+                self.list_formats(info_dict)
+        if self.params.get('listsubtitles'):
+            if 'automatic_captions' in info_dict:
+                self.list_subtitles(
+                    info_dict['id'], automatic_captions, 'automatic captions')
+            self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
+        list_only = self.params.get('simulate') is None and (
+            self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles'))
+        if list_only:
+            # Without this printing, -F --print-json will not work
+            self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True)
             return

         format_selector = self.format_selector
@@ -2174,9 +2311,12 @@ def is_wellformed(f):
             formats_to_download = list(format_selector(ctx))
         if not formats_to_download:
             if not self.params.get('ignore_no_formats_error'):
-                raise ExtractorError('Requested format is not available', expected=True)
+                raise ExtractorError('Requested format is not available', expected=True,
+                                     video_id=info_dict['id'], ie=info_dict['extractor'])
             else:
                 self.report_warning('Requested format is not available')
+                # Process what we can, even without any available formats.
+                self.process_info(dict(info_dict))
         elif download:
             self.to_screen(
                 '[info] %s: Downloading %d format(s): %s' % (
@@ -2212,25 +2352,30 @@ def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
         if self.params.get('allsubtitles', False):
             requested_langs = all_sub_langs
         elif self.params.get('subtitleslangs', False):
-            requested_langs = set()
-            for lang in self.params.get('subtitleslangs'):
-                if lang == 'all':
-                    requested_langs.update(all_sub_langs)
+            # A list is used so that the order of languages will be the same as
+            # given in subtitleslangs. See https://github.com/yt-dlp/yt-dlp/issues/1041
+            requested_langs = []
+            for lang_re in self.params.get('subtitleslangs'):
+                if lang_re == 'all':
+                    requested_langs.extend(all_sub_langs)
                     continue
-                discard = lang[0] == '-'
+                discard = lang_re[0] == '-'
                 if discard:
-                    lang = lang[1:]
-                current_langs = filter(re.compile(lang + '$').match, all_sub_langs)
+                    lang_re = lang_re[1:]
+                current_langs = filter(re.compile(lang_re + '$').match, all_sub_langs)
                 if discard:
                     for lang in current_langs:
-                        requested_langs.discard(lang)
+                        while lang in requested_langs:
+                            requested_langs.remove(lang)
                 else:
-                    requested_langs.update(current_langs)
+                    requested_langs.extend(current_langs)
+            requested_langs = orderedSet(requested_langs)
         elif 'en' in available_subs:
             requested_langs = ['en']
         else:
             requested_langs = [list(all_sub_langs)[0]]
-        self.write_debug('Downloading subtitles: %s' % ', '.join(requested_langs))
+        if requested_langs:
+            self.write_debug('Downloading subtitles: %s' % ', '.join(requested_langs))

         formats_query = self.params.get('subtitlesformat', 'best')
         formats_preference = formats_query.split('/') if formats_query else []
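
Under the rules implemented above, subtitleslangs entries are regular expressions applied in order, a leading '-' discards earlier matches, and orderedSet() de-duplicates while preserving the given order. For example (language codes assumed to exist for the video):

    params = {
        'writesubtitles': True,
        'subtitleslangs': ['en.*', '-en-GB', 'ja'],  # every en* except en-GB, then ja
    }
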
@@ -2278,11 +2423,13 @@ def print_optional(field):
         elif 'url' in info_dict:
             info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '')

+        if self.params.get('forceprint') or self.params.get('forcejson'):
+            self.post_extract(info_dict)
         for tmpl in self.params.get('forceprint', []):
             if re.match(r'\w+$', tmpl):
                 tmpl = '%({})s'.format(tmpl)
             tmpl, info_copy = self.prepare_outtmpl(tmpl, info_dict)
-            self.to_stdout(tmpl % info_copy)
+            self.to_stdout(self.escape_outtmpl(tmpl) % info_copy)

         print_mandatory('title')
         print_mandatory('id')
@@ -2290,15 +2437,16 @@ def print_optional(field):
         print_optional('thumbnail')
         print_optional('description')
         print_optional('filename')
-        if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
+        if self.params.get('forceduration') and info_dict.get('duration') is not None:
             self.to_stdout(formatSeconds(info_dict['duration']))
         print_mandatory('format')

-        if self.params.get('forcejson', False):
-            self.post_extract(info_dict)
-            self.to_stdout(json.dumps(info_dict, default=repr))
+        if self.params.get('forcejson'):
+            self.to_stdout(json.dumps(self.sanitize_info(info_dict)))

     def dl(self, name, info, subtitle=False, test=False):
+        if not info.get('url'):
+            self.raise_no_formats(info, True)

         if test:
             verbose = self.params.get('verbose')
@@ -2315,7 +2463,7 @@ def dl(self, name, info, subtitle=False, test=False):
             }
         else:
             params = self.params
-        fd = get_suitable_downloader(info, params)(self, params)
+        fd = get_suitable_downloader(info, params, to_stdout=(name == '-'))(self, params)
         if not test:
             for ph in self._progress_hooks:
                 fd.add_progress_hook(ph)
@@ -2331,8 +2479,6 @@ def process_info(self, info_dict):

         assert info_dict.get('_type', 'video') == 'video'

-        info_dict.setdefault('__postprocessors', [])
-
         max_downloads = self.params.get('max_downloads')
         if max_downloads is not None:
             if self._num_downloads >= int(max_downloads):
@@ -2341,7 +2487,7 @@ def process_info(self, info_dict):
         # TODO: backward compatibility, to be removed
         info_dict['fulltitle'] = info_dict['title']

-        if 'format' not in info_dict:
+        if 'format' not in info_dict and 'ext' in info_dict:
             info_dict['format'] = info_dict['ext']

         if self._match_entry(info_dict) is not None:
@@ -2356,42 +2502,48 @@ def process_info(self, info_dict):
         files_to_move = {}

         # Forced printings
-        self.__forced_printings(info_dict, full_filename, incomplete=False)
+        self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))

-        if self.params.get('simulate', False):
+        if self.params.get('simulate'):
             if self.params.get('force_write_download_archive', False):
                 self.record_download_archive(info_dict)
-
-            # Do nothing else if in simulate mode
             return

         if full_filename is None:
             return
-
         if not self._ensure_dir_exists(encodeFilename(full_filename)):
             return
         if not self._ensure_dir_exists(encodeFilename(temp_filename)):
             return

-        if self.params.get('writedescription', False):
-            descfn = self.prepare_filename(info_dict, 'description')
-            if not self._ensure_dir_exists(encodeFilename(descfn)):
-                return
-            if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(descfn)):
-                self.to_screen('[info] Video description is already present')
-            elif info_dict.get('description') is None:
-                self.report_warning('There\'s no description to write.')
-            else:
-                try:
-                    self.to_screen('[info] Writing video description to: ' + descfn)
-                    with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
-                        descfile.write(info_dict['description'])
-                except (OSError, IOError):
-                    self.report_error('Cannot write description file ' + descfn)
-                    return
+        if self._write_description('video', info_dict,
+                                   self.prepare_filename(info_dict, 'description')) is None:
+            return

+        sub_files = self._write_subtitles(info_dict, temp_filename)
+        if sub_files is None:
+            return
+        files_to_move.update(dict(sub_files))
+
+        thumb_files = self._write_thumbnails(
+            'video', info_dict, temp_filename, self.prepare_filename(info_dict, 'thumbnail'))
+        if thumb_files is None:
+            return
+        files_to_move.update(dict(thumb_files))
+
+        infofn = self.prepare_filename(info_dict, 'infojson')
+        _infojson_written = self._write_info_json('video', info_dict, infofn)
+        if _infojson_written:
+            info_dict['__infojson_filename'] = infofn
+        elif _infojson_written is None:
+            return
+
+        # Note: Annotations are deprecated
+        annofn = None
         if self.params.get('writeannotations', False):
             annofn = self.prepare_filename(info_dict, 'annotation')
+        if annofn:
             if not self._ensure_dir_exists(encodeFilename(annofn)):
                 return
             if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(annofn)):
@@ -2409,67 +2561,6 @@ def process_info(self, info_dict):
                     self.report_error('Cannot write annotations file: ' + annofn)
                     return

-        subtitles_are_requested = any([self.params.get('writesubtitles', False),
-                                       self.params.get('writeautomaticsub')])
-
-        if subtitles_are_requested and info_dict.get('requested_subtitles'):
-            # subtitles download errors are already managed as troubles in relevant IE
-            # that way it will silently go on when used with unsupporting IE
-            subtitles = info_dict['requested_subtitles']
-            # ie = self.get_info_extractor(info_dict['extractor_key'])
-            for sub_lang, sub_info in subtitles.items():
-                sub_format = sub_info['ext']
-                sub_filename = subtitles_filename(temp_filename, sub_lang, sub_format, info_dict.get('ext'))
-                sub_filename_final = subtitles_filename(
-                    self.prepare_filename(info_dict, 'subtitle'), sub_lang, sub_format, info_dict.get('ext'))
-                if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(sub_filename)):
-                    self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
-                    sub_info['filepath'] = sub_filename
-                    files_to_move[sub_filename] = sub_filename_final
-                else:
-                    self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
-                    if sub_info.get('data') is not None:
-                        try:
-                            # Use newline='' to prevent conversion of newline characters
-                            # See https://github.com/ytdl-org/youtube-dl/issues/10268
-                            with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
-                                subfile.write(sub_info['data'])
-                            sub_info['filepath'] = sub_filename
-                            files_to_move[sub_filename] = sub_filename_final
-                        except (OSError, IOError):
-                            self.report_error('Cannot write subtitles file ' + sub_filename)
-                            return
-                    else:
-                        try:
-                            self.dl(sub_filename, sub_info.copy(), subtitle=True)
-                            sub_info['filepath'] = sub_filename
-                            files_to_move[sub_filename] = sub_filename_final
-                        except (ExtractorError, IOError, OSError, ValueError) + network_exceptions as err:
-                            self.report_warning('Unable to download subtitle for "%s": %s' %
-                                                (sub_lang, error_to_compat_str(err)))
-                            continue
-
-        if self.params.get('writeinfojson', False):
-            infofn = self.prepare_filename(info_dict, 'infojson')
-            if not self._ensure_dir_exists(encodeFilename(infofn)):
-                return
-            if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(infofn)):
-                self.to_screen('[info] Video metadata is already present')
-            else:
-                self.to_screen('[info] Writing video metadata as JSON to: ' + infofn)
-                try:
-                    write_json_file(self.filter_requested_info(info_dict, self.params.get('clean_infojson', True)), infofn)
-                except (OSError, IOError):
-                    self.report_error('Cannot write video metadata to JSON file ' + infofn)
-                    return
-            info_dict['__infojson_filename'] = infofn
-
-        for thumb_ext in self._write_thumbnails(info_dict, temp_filename):
-            thumb_filename_temp = replace_extension(temp_filename, thumb_ext, info_dict.get('ext'))
-            thumb_filename = replace_extension(
-                self.prepare_filename(info_dict, 'thumbnail'), thumb_ext, info_dict.get('ext'))
-            files_to_move[thumb_filename_temp] = thumb_filename
-
         # Write internet shortcut files
         url_link = webloc_link = desktop_link = False
         if self.params.get('writelink', False):
@@ -2533,6 +2624,7 @@ def _write_link_file(extension, template, newline, embed_filename):
             info_dict = self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict)
         else:
             # Download
+            info_dict.setdefault('__postprocessors', [])
             try:

                 def existing_file(*filepaths):
@@ -2553,7 +2645,6 @@ def existing_file(*filepaths):
                             os.remove(encodeFilename(file))
                         return None

-                    self.report_file_already_downloaded(existing_files[0])
                     info_dict['ext'] = os.path.splitext(existing_files[0])[1][1:]
                     return existing_files[0]

@@ -2581,25 +2672,21 @@ def compatible_formats(formats):
                     requested_formats = info_dict['requested_formats']
                     old_ext = info_dict['ext']
-                    if self.params.get('merge_output_format') is None:
-                        if not compatible_formats(requested_formats):
-                            info_dict['ext'] = 'mkv'
-                            self.report_warning(
-                                'Requested formats are incompatible for merge and will be merged into mkv.')
-                        if (info_dict['ext'] == 'webm'
-                                and self.params.get('writethumbnail', False)
-                                and info_dict.get('thumbnails')):
-                            info_dict['ext'] = 'mkv'
-                            self.report_warning(
-                                'webm doesn\'t support embedding a thumbnail, mkv will be used.')
+                    if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
+                        info_dict['ext'] = 'mkv'
+                        self.report_warning(
+                            'Requested formats are incompatible for merge and will be merged into mkv.')
+                    new_ext = info_dict['ext']

-                    def correct_ext(filename):
+                    def correct_ext(filename, ext=new_ext):
+                        if filename == '-':
+                            return filename
                         filename_real_ext = os.path.splitext(filename)[1][1:]
                         filename_wo_ext = (
                             os.path.splitext(filename)[0]
-                            if filename_real_ext == old_ext
+                            if filename_real_ext in (old_ext, new_ext)
                             else filename)
-                        return '%s.%s' % (filename_wo_ext, info_dict['ext'])
+                        return '%s.%s' % (filename_wo_ext, ext)

                     # Ensure filename always has a correct extension for successful merge
                     full_filename = correct_ext(full_filename)
@@ -2608,20 +2695,16 @@ def correct_ext(filename):
                         info_dict['__real_download'] = False

                         _protocols = set(determine_protocol(f) for f in requested_formats)
-                        if len(_protocols) == 1:
+                        if len(_protocols) == 1:  # All requested formats have same protocol
                             info_dict['protocol'] = _protocols.pop()
-                        directly_mergable = (
-                            'no-direct-merge' not in self.params.get('compat_opts', [])
-                            and info_dict.get('protocol') is not None  # All requested formats have same protocol
-                            and not self.params.get('allow_unplayable_formats')
-                            and get_suitable_downloader(info_dict, self.params).__name__ == 'FFmpegFD')
-                        if directly_mergable:
-                            info_dict['url'] = requested_formats[0]['url']
-                            # Treat it as a single download
-                            dl_filename = existing_file(full_filename, temp_filename)
-                            if dl_filename is None:
-                                success, real_download = self.dl(temp_filename, info_dict)
-                                info_dict['__real_download'] = real_download
+                        directly_mergable = FFmpegFD.can_merge_formats(info_dict, self.params)
+                        if dl_filename is not None:
+                            self.report_file_already_downloaded(dl_filename)
+                        elif (directly_mergable and get_suitable_downloader(
+                                info_dict, self.params, to_stdout=(temp_filename == '-')) == FFmpegFD):
+                            info_dict['url'] = '\n'.join(f['url'] for f in requested_formats)
+                            success, real_download = self.dl(temp_filename, info_dict)
+                            info_dict['__real_download'] = real_download
                         else:
                             downloaded = []
                             merger = FFmpegMergerPP(self)
@@ -2635,34 +2718,47 @@ def correct_ext(filename):
                                     'You have requested merging of multiple formats but ffmpeg is not installed. '
                                     'The formats won\'t be merged.')

-                            if dl_filename is None:
-                                for f in requested_formats:
-                                    new_info = dict(info_dict)
-                                    del new_info['requested_formats']
-                                    new_info.update(f)
+                            if temp_filename == '-':
+                                reason = ('using a downloader other than ffmpeg' if directly_mergable
+                                          else 'but the formats are incompatible for simultaneous download' if merger.available
+                                          else 'but ffmpeg is not installed')
+                                self.report_warning(
+                                    f'You have requested downloading multiple formats to stdout {reason}. 
' + 'The formats will be streamed one after the other') + fname = temp_filename + for f in requested_formats: + new_info = dict(info_dict) + del new_info['requested_formats'] + new_info.update(f) + if temp_filename != '-': fname = prepend_extension( - self.prepare_filename(new_info, 'temp'), + correct_ext(temp_filename, new_info['ext']), 'f%s' % f['format_id'], new_info['ext']) if not self._ensure_dir_exists(fname): return + f['filepath'] = fname downloaded.append(fname) - partial_success, real_download = self.dl(fname, new_info) - info_dict['__real_download'] = info_dict['__real_download'] or real_download - success = success and partial_success - if merger.available and not self.params.get('allow_unplayable_formats'): - info_dict['__postprocessors'].append(merger) - info_dict['__files_to_merge'] = downloaded - # Even if there were no downloads, it is being merged only now - info_dict['__real_download'] = True - else: - for file in downloaded: - files_to_move[file] = None + partial_success, real_download = self.dl(fname, new_info) + info_dict['__real_download'] = info_dict['__real_download'] or real_download + success = success and partial_success + if merger.available and not self.params.get('allow_unplayable_formats'): + info_dict['__postprocessors'].append(merger) + info_dict['__files_to_merge'] = downloaded + # Even if there were no downloads, it is being merged only now + info_dict['__real_download'] = True + else: + for file in downloaded: + files_to_move[file] = None else: # Just a single file dl_filename = existing_file(full_filename, temp_filename) - if dl_filename is None: + if dl_filename is None or dl_filename == temp_filename: + # dl_filename == temp_filename could mean that the file was partially downloaded with --no-part. + # So we should try to resume the download success, real_download = self.dl(temp_filename, info_dict) info_dict['__real_download'] = real_download + else: + self.report_file_already_downloaded(dl_filename) dl_filename = dl_filename or temp_filename info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename))) @@ -2677,65 +2773,53 @@ def correct_ext(filename): return if success and full_filename != '-': - # Fixup content - fixup_policy = self.params.get('fixup') - if fixup_policy is None: - fixup_policy = 'detect_or_warn' - - INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg to fix this automatically.' - - stretched_ratio = info_dict.get('stretched_ratio') - if stretched_ratio is not None and stretched_ratio != 1: - if fixup_policy == 'warn': - self.report_warning('%s: Non-uniform pixel ratio (%s)' % ( - info_dict['id'], stretched_ratio)) - elif fixup_policy == 'detect_or_warn': - stretched_pp = FFmpegFixupStretchedPP(self) - if stretched_pp.available: - info_dict['__postprocessors'].append(stretched_pp) - else: - self.report_warning( - '%s: Non-uniform pixel ratio (%s). %s' - % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE)) - else: - assert fixup_policy in ('ignore', 'never') - if (info_dict.get('requested_formats') is None - and info_dict.get('container') == 'm4a_dash' - and info_dict.get('ext') == 'm4a'): - if fixup_policy == 'warn': - self.report_warning( - '%s: writing DASH m4a. ' - 'Only some players support this container.' - % info_dict['id']) - elif fixup_policy == 'detect_or_warn': - fixup_pp = FFmpegFixupM4aPP(self) - if fixup_pp.available: - info_dict['__postprocessors'].append(fixup_pp) - else: - self.report_warning( - '%s: writing DASH m4a. ' - 'Only some players support this container. 
%s' - % (info_dict['id'], INSTALL_FFMPEG_MESSAGE)) - else: - assert fixup_policy in ('ignore', 'never') - - if ('protocol' in info_dict - and get_suitable_downloader(info_dict, self.params).__name__ == 'HlsFD'): - if fixup_policy == 'warn': - self.report_warning('%s: malformed AAC bitstream detected.' % ( - info_dict['id'])) - elif fixup_policy == 'detect_or_warn': - fixup_pp = FFmpegFixupM3u8PP(self) - if fixup_pp.available: - info_dict['__postprocessors'].append(fixup_pp) - else: - self.report_warning( - '%s: malformed AAC bitstream detected. %s' - % (info_dict['id'], INSTALL_FFMPEG_MESSAGE)) - else: - assert fixup_policy in ('ignore', 'never') + def fixup(): + do_fixup = True + fixup_policy = self.params.get('fixup') + vid = info_dict['id'] + if fixup_policy in ('ignore', 'never'): + return + elif fixup_policy == 'warn': + do_fixup = False + elif fixup_policy != 'force': + assert fixup_policy in ('detect_or_warn', None) + if not info_dict.get('__real_download'): + do_fixup = False + + def ffmpeg_fixup(cndn, msg, cls): + if not cndn: + return + if not do_fixup: + self.report_warning(f'{vid}: {msg}') + return + pp = cls(self) + if pp.available: + info_dict['__postprocessors'].append(pp) + else: + self.report_warning(f'{vid}: {msg}. Install ffmpeg to fix this automatically') + + stretched_ratio = info_dict.get('stretched_ratio') + ffmpeg_fixup( + stretched_ratio not in (1, None), + f'Non-uniform pixel ratio {stretched_ratio}', + FFmpegFixupStretchedPP) + + ffmpeg_fixup( + (info_dict.get('requested_formats') is None + and info_dict.get('container') == 'm4a_dash' + and info_dict.get('ext') == 'm4a'), + 'writing DASH m4a. Only some players support this container', + FFmpegFixupM4aPP) + + downloader = (get_suitable_downloader(info_dict, self.params).__name__ + if 'protocol' in info_dict else None) + ffmpeg_fixup(downloader == 'HlsFD', 'malformed AAC bitstream detected', FFmpegFixupM3u8PP) + ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed timestamps detected', FFmpegFixupTimestampPP) + ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed duration detected', FFmpegFixupDurationPP) + + fixup() try: info_dict = self.post_process(dl_filename, info_dict, files_to_move) except PostProcessingError as err: @@ -2772,18 +2856,18 @@ def download(self, url_list): except UnavailableVideoError: self.report_error('unable to download video') except MaxDownloadsReached: - self.to_screen('[info] Maximum number of downloaded files reached') + self.to_screen('[info] Maximum number of downloads reached') raise except ExistingVideoReached: - self.to_screen('[info] Encountered a file that is already in the archive, stopping due to --break-on-existing') + self.to_screen('[info] Encountered a video that is already in the archive, stopping due to --break-on-existing') raise except RejectedVideoReached: - self.to_screen('[info] Encountered a file that did not match filter, stopping due to --break-on-reject') + self.to_screen('[info] Encountered a video that did not match filter, stopping due to --break-on-reject') raise else: if self.params.get('dump_single_json', False): self.post_extract(res) - self.to_stdout(json.dumps(res, default=repr)) + self.to_stdout(json.dumps(self.sanitize_info(res))) return self._download_retcode @@ -2792,10 +2876,10 @@ def download_with_info_file(self, info_filename): [info_filename], mode='r', openhook=fileinput.hook_encoded('utf-8'))) as f: # FileInput doesn't have a read method, we can't call json.load - info = self.filter_requested_info(json.loads('\n'.join(f)), 
self.params.get('clean_infojson', True)) + info = self.sanitize_info(json.loads('\n'.join(f)), self.params.get('clean_infojson', True)) try: self.process_ie_result(info, download=True) - except (DownloadError, EntryNotInPlaylist): + except (DownloadError, EntryNotInPlaylist, ThrottledDownload): webpage_url = info.get('webpage_url') if webpage_url is not None: self.report_warning('The info failed to download, trying with "%s"' % webpage_url) @@ -2805,16 +2889,22 @@ def download_with_info_file(self, info_filename): return self._download_retcode @staticmethod - def filter_requested_info(info_dict, actually_filter=True): - remove_keys = ['__original_infodict'] # Always remove this since this may contain a copy of the entire dict + def sanitize_info(info_dict, remove_private_keys=False): + ''' Sanitize the infodict for converting to json ''' + if info_dict is None: + return info_dict + info_dict.setdefault('epoch', int(time.time())) + remove_keys = {'__original_infodict'} # Always remove this since this may contain a copy of the entire dict keep_keys = ['_type'], # Always keep this to facilitate load-info-json - if actually_filter: - remove_keys += ('requested_formats', 'requested_subtitles', 'requested_entries', 'filepath', 'entries', 'original_url') + if remove_private_keys: + remove_keys |= { + 'requested_formats', 'requested_subtitles', 'requested_entries', + 'filepath', 'entries', 'original_url', 'playlist_autonumber', + } empty_values = (None, {}, [], set(), tuple()) reject = lambda k, v: k not in keep_keys and ( k.startswith('_') or k in remove_keys or v in empty_values) else: - info_dict['epoch'] = int(time.time()) reject = lambda k, v: k in remove_keys filter_fn = lambda obj: ( list(map(filter_fn, obj)) if isinstance(obj, (LazyList, list, tuple, set)) @@ -2822,14 +2912,26 @@ def filter_requested_info(info_dict, actually_filter=True): else dict((k, filter_fn(v)) for k, v in obj.items() if not reject(k, v))) return filter_fn(info_dict) + @staticmethod + def filter_requested_info(info_dict, actually_filter=True): + ''' Alias of sanitize_info for backward compatibility ''' + return YoutubeDL.sanitize_info(info_dict, actually_filter) + def run_pp(self, pp, infodict): files_to_delete = [] if '__files_to_move' not in infodict: infodict['__files_to_move'] = {} - files_to_delete, infodict = pp.run(infodict) + try: + files_to_delete, infodict = pp.run(infodict) + except PostProcessingError as e: + # Must be True and not 'only_download' + if self.params.get('ignoreerrors') is True: + self.report_error(e) + return infodict + raise + if not files_to_delete: return infodict - if self.params.get('keepvideo', False): for f in files_to_delete: infodict['__files_to_move'].setdefault(f, '') @@ -2896,9 +2998,9 @@ def _make_archive_id(self, info_dict): if not url: return # Try to find matching extractor for the URL and take its ie_key - for ie in self._ies: + for ie_key, ie in self._ies.items(): if ie.suitable(url): - extractor = ie.ie_key() + extractor = ie_key break else: return @@ -2928,6 +3030,8 @@ def record_download_archive(self, info_dict): @staticmethod def format_resolution(format, default='unknown'): if format.get('vcodec') == 'none': + if format.get('acodec') == 'none': + return 'images' return 'audio only' if format.get('resolution') is not None: return format['resolution'] @@ -2997,22 +3101,11 @@ def _format_note(self, fdict): res += '~' + format_bytes(fdict['filesize_approx']) return res - def _format_note_table(self, f): - def join_fields(*vargs): - return ', '.join((val for val in vargs if 
val != '')) - - return join_fields( - 'UNSUPPORTED' if f.get('ext') in ('f4f', 'f4m') else '', - format_field(f, 'language', '[%s]'), - format_field(f, 'format_note'), - format_field(f, 'container', ignore=(None, f.get('ext'))), - format_field(f, 'asr', '%5dHz')) - def list_formats(self, info_dict): formats = info_dict.get('formats', [info_dict]) new_format = ( 'list-formats' not in self.params.get('compat_opts', []) - and self.params.get('list_formats_as_table', True) is not False) + and self.params.get('listformats_table', True) is not False) if new_format: table = [ [ @@ -3030,11 +3123,15 @@ def list_formats(self, info_dict): format_field(f, 'acodec', default='unknown').replace('none', ''), format_field(f, 'abr', '%3dk'), format_field(f, 'asr', '%5dHz'), - self._format_note_table(f)] - for f in formats - if f.get('preference') is None or f['preference'] >= -1000] + ', '.join(filter(None, ( + 'UNSUPPORTED' if f.get('ext') in ('f4f', 'f4m') else '', + format_field(f, 'language', '[%s]'), + format_field(f, 'format_note'), + format_field(f, 'container', ignore=(None, f.get('ext'))), + ))), + ] for f in formats if f.get('preference') is None or f['preference'] >= -1000] header_line = ['ID', 'EXT', 'RESOLUTION', 'FPS', '|', ' FILESIZE', ' TBR', 'PROTO', - '|', 'VCODEC', ' VBR', 'ACODEC', ' ABR', ' ASR', 'NOTE'] + '|', 'VCODEC', ' VBR', 'ACODEC', ' ABR', ' ASR', 'MORE INFO'] else: table = [ [ @@ -3047,12 +3144,9 @@ def list_formats(self, info_dict): header_line = ['format code', 'extension', 'resolution', 'note'] self.to_screen( - '[info] Available formats for %s:\n%s' % (info_dict['id'], render_table( - header_line, - table, - delim=new_format, - extraGap=(0 if new_format else 1), - hideEmpty=new_format))) + '[info] Available formats for %s:' % info_dict['id']) + self.to_stdout(render_table( + header_line, table, delim=new_format, extraGap=(0 if new_format else 1), hideEmpty=new_format)) def list_thumbnails(self, info_dict): thumbnails = list(info_dict.get('thumbnails')) @@ -3062,7 +3156,7 @@ def list_thumbnails(self, info_dict): self.to_screen( '[info] Thumbnails for %s:' % info_dict['id']) - self.to_screen(render_table( + self.to_stdout(render_table( ['ID', 'width', 'height', 'URL'], [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) @@ -3074,12 +3168,12 @@ def list_subtitles(self, video_id, subtitles, name='subtitles'): 'Available %s for %s:' % (name, video_id)) def _row(lang, formats): - exts, names = zip(*((f['ext'], f.get('name', 'unknown')) for f in reversed(formats))) + exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats))) if len(set(names)) == 1: names = [] if names[0] == 'unknown' else names[:1] return [lang, ', '.join(names), ', '.join(exts)] - self.to_screen(render_table( + self.to_stdout(render_table( ['Language', 'Name', 'Formats'], [_row(lang, formats) for lang, formats in subtitles.items()], hideEmpty=True)) @@ -3094,11 +3188,6 @@ def print_debug_header(self): if not self.params.get('verbose'): return - if type('') is not compat_str: - # Python 2.6 on SLES11 SP1 (https://github.com/ytdl-org/youtube-dl/issues/3326) - self.report_warning( - 'Your Python is broken! 
Update to a newer and supported version') - stdout_encoding = getattr( sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__) encoding_str = ( @@ -3109,17 +3198,14 @@ def print_debug_header(self): self.get_encoding())) write_string(encoding_str, encoding=None) - source = ( - '(exe)' if hasattr(sys, 'frozen') - else '(zip)' if isinstance(globals().get('__loader__'), zipimporter) - else '(source)' if os.path.basename(sys.argv[0]) == '__main__.py' - else '') - self._write_string('[debug] yt-dlp version %s %s\n' % (__version__, source)) + source = detect_variant() + self._write_string('[debug] yt-dlp version %s%s\n' % (__version__, '' if source == 'unknown' else f' ({source})')) if _LAZY_LOADER: self._write_string('[debug] Lazy loading extractors enabled\n') - if _PLUGIN_CLASSES: - self._write_string( - '[debug] Plugin Extractors: %s\n' % [ie.ie_key() for ie in _PLUGIN_CLASSES]) + if plugin_extractors or plugin_postprocessors: + self._write_string('[debug] Plugins: %s\n' % [ + '%s%s' % (klass.__name__, '' if klass.__name__ == name else f' as {name}') + for name, klass in itertools.chain(plugin_extractors.items(), plugin_postprocessors.items())]) if self.params.get('compat_opts'): self._write_string( '[debug] Compatibility options: %s\n' % ', '.join(self.params.get('compat_opts'))) @@ -3154,14 +3240,23 @@ def python_implementation(): exe_versions['rtmpdump'] = rtmpdump_version() exe_versions['phantomjs'] = PhantomJSwrapper._version() exe_str = ', '.join( - '%s %s' % (exe, v) - for exe, v in sorted(exe_versions.items()) - if v - ) - if not exe_str: - exe_str = 'none' + f'{exe} {v}' for exe, v in sorted(exe_versions.items()) if v + ) or 'none' self._write_string('[debug] exe versions: %s\n' % exe_str) + from .downloader.websocket import has_websockets + from .postprocessor.embedthumbnail import has_mutagen + from .cookies import SQLITE_AVAILABLE, KEYRING_AVAILABLE + + lib_str = ', '.join(sorted(filter(None, ( + compat_pycrypto_AES and compat_pycrypto_AES.__name__.split('.')[0], + has_websockets and 'websockets', + has_mutagen and 'mutagen', + SQLITE_AVAILABLE and 'sqlite', + KEYRING_AVAILABLE and 'keyring', + )))) or 'none' + self._write_string('[debug] Optional libraries: %s\n' % lib_str) + proxy_map = {} for handler in self._opener.handlers: if hasattr(handler, 'proxies'): @@ -3184,16 +3279,11 @@ def _setup_opener(self): timeout_val = self.params.get('socket_timeout') self._socket_timeout = 600 if timeout_val is None else float(timeout_val) + opts_cookiesfrombrowser = self.params.get('cookiesfrombrowser') opts_cookiefile = self.params.get('cookiefile') opts_proxy = self.params.get('proxy') - if opts_cookiefile is None: - self.cookiejar = compat_cookiejar.CookieJar() - else: - opts_cookiefile = expand_path(opts_cookiefile) - self.cookiejar = YoutubeDLCookieJar(opts_cookiefile) - if os.access(opts_cookiefile, os.R_OK): - self.cookiejar.load(ignore_discard=True, ignore_expires=True) + self.cookiejar = load_cookies(opts_cookiefile, opts_cookiesfrombrowser, self) cookie_processor = YoutubeDLCookieProcessor(self.cookiejar) if opts_proxy is not None: @@ -3249,39 +3339,133 @@ def get_encoding(self): encoding = preferredencoding() return encoding - def _write_thumbnails(self, info_dict, filename): # return the extensions + def _write_info_json(self, label, ie_result, infofn): + ''' Write infojson and returns True = written, False = skip, None = error ''' + if not self.params.get('writeinfojson'): + return False + elif not infofn: + self.write_debug(f'Skipping writing {label} infojson') + 
return False + elif not self._ensure_dir_exists(infofn): + return None + elif not self.params.get('overwrites', True) and os.path.exists(infofn): + self.to_screen(f'[info] {label.title()} metadata is already present') + else: + self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}') + try: + write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn) + except (OSError, IOError): + self.report_error(f'Cannot write {label} metadata to JSON file {infofn}') + return None + return True + + def _write_description(self, label, ie_result, descfn): + ''' Write description and returns True = written, False = skip, None = error ''' + if not self.params.get('writedescription'): + return False + elif not descfn: + self.write_debug(f'Skipping writing {label} description') + return False + elif not self._ensure_dir_exists(descfn): + return None + elif not self.params.get('overwrites', True) and os.path.exists(descfn): + self.to_screen(f'[info] {label.title()} description is already present') + elif ie_result.get('description') is None: + self.report_warning(f'There\'s no {label} description to write') + return False + else: + try: + self.to_screen(f'[info] Writing {label} description to: {descfn}') + with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: + descfile.write(ie_result['description']) + except (OSError, IOError): + self.report_error(f'Cannot write {label} description file {descfn}') + return None + return True + + def _write_subtitles(self, info_dict, filename): + ''' Write subtitles to file and return list of (sub_filename, final_sub_filename); or None if error''' + ret = [] + subtitles = info_dict.get('requested_subtitles') + if not subtitles or not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')): + # subtitles download errors are already managed as troubles in relevant IE + # that way it will silently go on when used with unsupporting IE + return ret + + sub_filename_base = self.prepare_filename(info_dict, 'subtitle') + if not sub_filename_base: + self.to_screen('[info] Skipping writing video subtitles') + return ret + for sub_lang, sub_info in subtitles.items(): + sub_format = sub_info['ext'] + sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext')) + sub_filename_final = subtitles_filename(sub_filename_base, sub_lang, sub_format, info_dict.get('ext')) + if not self.params.get('overwrites', True) and os.path.exists(sub_filename): + self.to_screen(f'[info] Video subtitle {sub_lang}.{sub_format} is already present') + sub_info['filepath'] = sub_filename + ret.append((sub_filename, sub_filename_final)) + continue + + self.to_screen(f'[info] Writing video subtitles to: {sub_filename}') + if sub_info.get('data') is not None: + try: + # Use newline='' to prevent conversion of newline characters + # See https://github.com/ytdl-org/youtube-dl/issues/10268 + with io.open(sub_filename, 'w', encoding='utf-8', newline='') as subfile: + subfile.write(sub_info['data']) + sub_info['filepath'] = sub_filename + ret.append((sub_filename, sub_filename_final)) + continue + except (OSError, IOError): + self.report_error(f'Cannot write video subtitles file {sub_filename}') + return None + + try: + sub_copy = sub_info.copy() + sub_copy.setdefault('http_headers', info_dict.get('http_headers')) + self.dl(sub_filename, sub_copy, subtitle=True) + sub_info['filepath'] = sub_filename + ret.append((sub_filename, sub_filename_final)) + except (ExtractorError, IOError, OSError, ValueError) + 
network_exceptions as err: + self.report_warning(f'Unable to download video subtitles for {sub_lang!r}: {err}') + continue + return ret + + def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None): + ''' Write thumbnails to file and return list of (thumb_filename, final_thumb_filename) ''' write_all = self.params.get('write_all_thumbnails', False) - thumbnails = [] + thumbnails, ret = [], [] if write_all or self.params.get('writethumbnail', False): thumbnails = info_dict.get('thumbnails') or [] multiple = write_all and len(thumbnails) > 1 - ret = [] - for t in thumbnails[::1 if write_all else -1]: - thumb_ext = determine_ext(t['url'], 'jpg') - suffix = '%s.' % t['id'] if multiple else '' - thumb_display_id = '%s ' % t['id'] if multiple else '' - thumb_filename = replace_extension(filename, suffix + thumb_ext, info_dict.get('ext')) - - if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(thumb_filename)): - ret.append(suffix + thumb_ext) + if thumb_filename_base is None: + thumb_filename_base = filename + if thumbnails and not thumb_filename_base: + self.write_debug(f'Skipping writing {label} thumbnail') + return ret + + for t in thumbnails[::-1]: + thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg') + thumb_display_id = f'{label} thumbnail' + (f' {t["id"]}' if multiple else '') + thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext')) + thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext')) + + if not self.params.get('overwrites', True) and os.path.exists(thumb_filename): + ret.append((thumb_filename, thumb_filename_final)) t['filepath'] = thumb_filename - self.to_screen('[%s] %s: Thumbnail %sis already present' % - (info_dict['extractor'], info_dict['id'], thumb_display_id)) + self.to_screen(f'[info] {thumb_display_id.title()} is already present') else: - self.to_screen('[%s] %s: Downloading thumbnail %s ...' % - (info_dict['extractor'], info_dict['id'], thumb_display_id)) + self.to_screen(f'[info] Downloading {thumb_display_id} ...') try: uf = self.urlopen(t['url']) + self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}') with open(encodeFilename(thumb_filename), 'wb') as thumbf: shutil.copyfileobj(uf, thumbf) - ret.append(suffix + thumb_ext) - self.to_screen('[%s] %s: Writing thumbnail %sto: %s' % - (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename)) + ret.append((thumb_filename, thumb_filename_final)) t['filepath'] = thumb_filename except network_exceptions as err: - self.report_warning('Unable to download thumbnail "%s": %s' % - (t['url'], error_to_compat_str(err))) + self.report_warning(f'Unable to download {thumb_display_id}: {err}') if ret and not write_all: break return ret
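
The subtitleslangs rework above treats each requested entry as an anchored regular expression: 'all' expands to every available language, a leading '-' discards languages matched so far, and the user-given order is preserved before de-duplication with orderedSet. A minimal standalone sketch of that selection logic follows; select_sub_langs and the sample language lists are illustrative, not part of yt-dlp:

import re

def select_sub_langs(requested, all_sub_langs):
    # Preserve the order of `requested`; 'all' expands, '-pattern' discards
    selected = []
    for lang_re in requested:
        if lang_re == 'all':
            selected.extend(all_sub_langs)
            continue
        discard = lang_re.startswith('-')
        if discard:
            lang_re = lang_re[1:]
        matches = [lang for lang in all_sub_langs if re.fullmatch(lang_re, lang)]
        if discard:
            selected = [lang for lang in selected if lang not in matches]
        else:
            selected.extend(matches)
    seen = set()  # de-duplicate, first occurrence wins (what orderedSet does)
    return [lang for lang in selected if not (lang in seen or seen.add(lang))]

print(select_sub_langs(['all', '-live_chat'], ['en', 'de', 'live_chat']))  # ['en', 'de']
print(select_sub_langs(['en.*'], ['en', 'en-US', 'de']))                   # ['en', 'en-US']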
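
The fixup() rewrite in process_info replaces three near-identical if/elif ladders with a single ffmpeg_fixup helper: the 'fixup' policy ('never'/'ignore' disables fixups, 'warn' only reports, 'detect_or_warn' (the default) fixes only files that were actually downloaded, 'force' always fixes) decides what happens to each detected problem. A condensed standalone sketch of that dispatch, with warn/attach callbacks standing in for the YoutubeDL methods:

def run_fixups(policy, real_download, problems, ffmpeg_available, warn, attach):
    # problems: iterable of (condition, message, postprocessor_factory),
    # e.g. (downloader == 'HlsFD', 'malformed AAC bitstream detected', FFmpegFixupM3u8PP)
    if policy in ('ignore', 'never'):
        return
    # 'force' always fixes; the default (None/'detect_or_warn') fixes only
    # freshly downloaded files; 'warn' never fixes
    do_fixup = policy == 'force' or (policy in (None, 'detect_or_warn') and real_download)
    for condition, msg, make_pp in problems:
        if not condition:
            continue
        if not do_fixup:
            warn(msg)
        elif ffmpeg_available:
            attach(make_pp())  # queue the FFmpegFixup*PP post-processor
        else:
            warn(f'{msg}. Install ffmpeg to fix this automatically')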
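
sanitize_info now backs --dump-json, --dump-single-json and the .info.json writer, and download_with_info_file runs loaded JSON back through it, so a dump/load round trip is stable. A small usage sketch against the public API; the URL and output filename are placeholders:

import json
from yt_dlp import YoutubeDL

with YoutubeDL({'skip_download': True}) as ydl:
    info = ydl.extract_info('https://www.youtube.com/watch?v=BaW_jenozKc', download=False)

# Equivalent of --dump-json: only internal keys such as __original_infodict are dropped
print(json.dumps(YoutubeDL.sanitize_info(info))[:120])

# Equivalent of what clean_infojson writes to disk: requested_* and other
# private keys are stripped as well
with open('video.info.json', 'w', encoding='utf-8') as f:
    json.dump(YoutubeDL.sanitize_info(info, remove_private_keys=True), f)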
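
_setup_opener now builds the cookie jar through cookies.load_cookies, which accepts a cookie file, a (browser, profile) tuple from the new cookiesfrombrowser option, or both at once. A minimal embedding sketch; the browser/profile values and URL are illustrative:

from yt_dlp import YoutubeDL

params = {
    'cookiesfrombrowser': ('firefox', ),  # e.g. ('chrome', 'Profile 1')
    # 'cookiefile': 'cookies.txt',        # may be combined with the browser cookies
}
with YoutubeDL(params) as ydl:
    ydl.download(['https://example.com/watch/123'])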
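
The _write_description/_write_info_json/_write_subtitles/_write_thumbnails helpers introduced above share a return convention that process_info depends on: a truthy value means the file was written (or listed for moving), False or an empty list means it was deliberately skipped, and None signals a hard error that aborts processing of the video. A hedged sketch of a caller honoring that contract; write_side_files is an illustrative name, not a yt-dlp function:

def write_side_files(ydl, info_dict, temp_filename):
    # None => error already reported, abort this video; False/[] => skipped
    descfn = ydl.prepare_filename(info_dict, 'description')
    if ydl._write_description('video', info_dict, descfn) is None:
        return False

    sub_files = ydl._write_subtitles(info_dict, temp_filename)
    if sub_files is None:
        return False

    infofn = ydl.prepare_filename(info_dict, 'infojson')
    written = ydl._write_info_json('video', info_dict, infofn)
    if written is None:
        return False
    if written:
        info_dict['__infojson_filename'] = infofn
    return True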