]> jfr.im git - yt-dlp.git/blob - yt_dlp/YoutubeDL.py
Allow using a custom format selector through API
[yt-dlp.git] / yt_dlp / YoutubeDL.py
1 #!/usr/bin/env python3
2 # coding: utf-8
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import copy
9 import datetime
10 import errno
11 import fileinput
12 import functools
13 import io
14 import itertools
15 import json
16 import locale
17 import operator
18 import os
19 import platform
20 import re
21 import shutil
22 import subprocess
23 import sys
24 import tempfile
25 import time
26 import tokenize
27 import traceback
28 import random
29 import unicodedata
30
31 from enum import Enum
32 from string import ascii_letters
33
34 from .compat import (
35 compat_basestring,
36 compat_get_terminal_size,
37 compat_kwargs,
38 compat_numeric_types,
39 compat_os_name,
40 compat_pycrypto_AES,
41 compat_shlex_quote,
42 compat_str,
43 compat_tokenize_tokenize,
44 compat_urllib_error,
45 compat_urllib_request,
46 compat_urllib_request_DataHandler,
47 windows_enable_vt_mode,
48 )
49 from .cookies import load_cookies
50 from .utils import (
51 age_restricted,
52 args_to_str,
53 ContentTooShortError,
54 date_from_str,
55 DateRange,
56 DEFAULT_OUTTMPL,
57 determine_ext,
58 determine_protocol,
59 DownloadCancelled,
60 DownloadError,
61 encode_compat_str,
62 encodeFilename,
63 EntryNotInPlaylist,
64 error_to_compat_str,
65 ExistingVideoReached,
66 expand_path,
67 ExtractorError,
68 float_or_none,
69 format_bytes,
70 format_field,
71 formatSeconds,
72 GeoRestrictedError,
73 HEADRequest,
74 int_or_none,
75 iri_to_uri,
76 ISO3166Utils,
77 join_nonempty,
78 LazyList,
79 LINK_TEMPLATES,
80 locked_file,
81 make_dir,
82 make_HTTPS_handler,
83 MaxDownloadsReached,
84 network_exceptions,
85 number_of_digits,
86 orderedSet,
87 OUTTMPL_TYPES,
88 PagedList,
89 parse_filesize,
90 PerRequestProxyHandler,
91 platform_name,
92 Popen,
93 PostProcessingError,
94 preferredencoding,
95 prepend_extension,
96 register_socks_protocols,
97 RejectedVideoReached,
98 render_table,
99 replace_extension,
100 SameFileError,
101 sanitize_filename,
102 sanitize_path,
103 sanitize_url,
104 sanitized_Request,
105 std_headers,
106 STR_FORMAT_RE_TMPL,
107 STR_FORMAT_TYPES,
108 str_or_none,
109 strftime_or_none,
110 subtitles_filename,
111 supports_terminal_sequences,
112 ThrottledDownload,
113 to_high_limit_path,
114 traverse_obj,
115 try_get,
116 UnavailableVideoError,
117 url_basename,
118 variadic,
119 version_tuple,
120 write_json_file,
121 write_string,
122 YoutubeDLCookieProcessor,
123 YoutubeDLHandler,
124 YoutubeDLRedirectHandler,
125 )
126 from .cache import Cache
127 from .minicurses import format_text
128 from .extractor import (
129 gen_extractor_classes,
130 get_info_extractor,
131 _LAZY_LOADER,
132 _PLUGIN_CLASSES as plugin_extractors
133 )
134 from .extractor.openload import PhantomJSwrapper
135 from .downloader import (
136 FFmpegFD,
137 get_suitable_downloader,
138 shorten_protocol_name
139 )
140 from .downloader.rtmp import rtmpdump_version
141 from .postprocessor import (
142 get_postprocessor,
143 EmbedThumbnailPP,
144 FFmpegFixupDurationPP,
145 FFmpegFixupM3u8PP,
146 FFmpegFixupM4aPP,
147 FFmpegFixupStretchedPP,
148 FFmpegFixupTimestampPP,
149 FFmpegMergerPP,
150 FFmpegPostProcessor,
151 MoveFilesAfterDownloadPP,
152 _PLUGIN_CLASSES as plugin_postprocessors
153 )
154 from .update import detect_variant
155 from .version import __version__
156
157 if compat_os_name == 'nt':
158 import ctypes
159
160
161 class YoutubeDL(object):
162 """YoutubeDL class.
163
164 YoutubeDL objects are the ones responsible for downloading the
165 actual video file and writing it to disk if the user has requested
166 it, among some other tasks. In most cases there should be one per
167 program. As, given a video URL, the downloader doesn't know how to
168 extract all the needed information, task that InfoExtractors do, it
169 has to pass the URL to one of them.
170
171 For this, YoutubeDL objects have a method that allows
172 InfoExtractors to be registered in a given order. When it is passed
173 a URL, the YoutubeDL object handles it to the first InfoExtractor it
174 finds that reports being able to handle it. The InfoExtractor extracts
175 all the information about the video or videos the URL refers to, and
176 YoutubeDL process the extracted information, possibly using a File
177 Downloader to download the video.
178
179 YoutubeDL objects accept a lot of parameters. In order not to saturate
180 the object constructor with arguments, it receives a dictionary of
181 options instead. These options are available through the params
182 attribute for the InfoExtractors to use. The YoutubeDL also
183 registers itself as the downloader in charge for the InfoExtractors
184 that are added to it, so this is a "mutual registration".
185
186 Available options:
187
188 username: Username for authentication purposes.
189 password: Password for authentication purposes.
190 videopassword: Password for accessing a video.
191 ap_mso: Adobe Pass multiple-system operator identifier.
192 ap_username: Multiple-system operator account username.
193 ap_password: Multiple-system operator account password.
194 usenetrc: Use netrc for authentication instead.
195 verbose: Print additional info to stdout.
196 quiet: Do not print messages to stdout.
197 no_warnings: Do not print out anything for warnings.
198 forceprint: A list of templates to force print
199 forceurl: Force printing final URL. (Deprecated)
200 forcetitle: Force printing title. (Deprecated)
201 forceid: Force printing ID. (Deprecated)
202 forcethumbnail: Force printing thumbnail URL. (Deprecated)
203 forcedescription: Force printing description. (Deprecated)
204 forcefilename: Force printing final filename. (Deprecated)
205 forceduration: Force printing duration. (Deprecated)
206 forcejson: Force printing info_dict as JSON.
207 dump_single_json: Force printing the info_dict of the whole playlist
208 (or video) as a single JSON line.
209 force_write_download_archive: Force writing download archive regardless
210 of 'skip_download' or 'simulate'.
211 simulate: Do not download the video files. If unset (or None),
212 simulate only if listsubtitles, listformats or list_thumbnails is used
213 format: Video format code. see "FORMAT SELECTION" for more details.
214 You can also pass a function. The function takes 'ctx' as
215 argument and returns the formats to download.
216 See "build_format_selector" for an implementation
217 allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded.
218 ignore_no_formats_error: Ignore "No video formats" error. Useful for
219 extracting metadata even if the video is not actually
220 available for download (experimental)
221 format_sort: A list of fields by which to sort the video formats.
222 See "Sorting Formats" for more details.
223 format_sort_force: Force the given format_sort. see "Sorting Formats"
224 for more details.
225 allow_multiple_video_streams: Allow multiple video streams to be merged
226 into a single file
227 allow_multiple_audio_streams: Allow multiple audio streams to be merged
228 into a single file
229 check_formats Whether to test if the formats are downloadable.
230 Can be True (check all), False (check none),
231 'selected' (check selected formats),
232 or None (check only if requested by extractor)
233 paths: Dictionary of output paths. The allowed keys are 'home'
234 'temp' and the keys of OUTTMPL_TYPES (in utils.py)
235 outtmpl: Dictionary of templates for output names. Allowed keys
236 are 'default' and the keys of OUTTMPL_TYPES (in utils.py).
237 For compatibility with youtube-dl, a single string can also be used
238 outtmpl_na_placeholder: Placeholder for unavailable meta fields.
239 restrictfilenames: Do not allow "&" and spaces in file names
240 trim_file_name: Limit length of filename (extension excluded)
241 windowsfilenames: Force the filenames to be windows compatible
242 ignoreerrors: Do not stop on download/postprocessing errors.
243 Can be 'only_download' to ignore only download errors.
244 Default is 'only_download' for CLI, but False for API
245 skip_playlist_after_errors: Number of allowed failures until the rest of
246 the playlist is skipped
247 force_generic_extractor: Force downloader to use the generic extractor
248 overwrites: Overwrite all video and metadata files if True,
249 overwrite only non-video files if None
250 and don't overwrite any file if False
251 For compatibility with youtube-dl,
252 "nooverwrites" may also be used instead
253 playliststart: Playlist item to start at.
254 playlistend: Playlist item to end at.
255 playlist_items: Specific indices of playlist to download.
256 playlistreverse: Download playlist items in reverse order.
257 playlistrandom: Download playlist items in random order.
258 matchtitle: Download only matching titles.
259 rejecttitle: Reject downloads for matching titles.
260 logger: Log messages to a logging.Logger instance.
261 logtostderr: Log messages to stderr instead of stdout.
262 consoletitle: Display progress in console window's titlebar.
263 writedescription: Write the video description to a .description file
264 writeinfojson: Write the video description to a .info.json file
265 clean_infojson: Remove private fields from the infojson
266 getcomments: Extract video comments. This will not be written to disk
267 unless writeinfojson is also given
268 writeannotations: Write the video annotations to a .annotations.xml file
269 writethumbnail: Write the thumbnail image to a file
270 allow_playlist_files: Whether to write playlists' description, infojson etc
271 also to disk when using the 'write*' options
272 write_all_thumbnails: Write all thumbnail formats to files
273 writelink: Write an internet shortcut file, depending on the
274 current platform (.url/.webloc/.desktop)
275 writeurllink: Write a Windows internet shortcut file (.url)
276 writewebloclink: Write a macOS internet shortcut file (.webloc)
277 writedesktoplink: Write a Linux internet shortcut file (.desktop)
278 writesubtitles: Write the video subtitles to a file
279 writeautomaticsub: Write the automatically generated subtitles to a file
280 allsubtitles: Deprecated - Use subtitleslangs = ['all']
281 Downloads all the subtitles of the video
282 (requires writesubtitles or writeautomaticsub)
283 listsubtitles: Lists all available subtitles for the video
284 subtitlesformat: The format code for subtitles
285 subtitleslangs: List of languages of the subtitles to download (can be regex).
286 The list may contain "all" to refer to all the available
287 subtitles. The language can be prefixed with a "-" to
288 exclude it from the requested languages. Eg: ['all', '-live_chat']
289 keepvideo: Keep the video file after post-processing
290 daterange: A DateRange object, download only if the upload_date is in the range.
291 skip_download: Skip the actual download of the video file
292 cachedir: Location of the cache files in the filesystem.
293 False to disable filesystem cache.
294 noplaylist: Download single video instead of a playlist if in doubt.
295 age_limit: An integer representing the user's age in years.
296 Unsuitable videos for the given age are skipped.
297 min_views: An integer representing the minimum view count the video
298 must have in order to not be skipped.
299 Videos without view count information are always
300 downloaded. None for no limit.
301 max_views: An integer representing the maximum view count.
302 Videos that are more popular than that are not
303 downloaded.
304 Videos without view count information are always
305 downloaded. None for no limit.
306 download_archive: File name of a file where all downloads are recorded.
307 Videos already present in the file are not downloaded
308 again.
309 break_on_existing: Stop the download process after attempting to download a
310 file that is in the archive.
311 break_on_reject: Stop the download process when encountering a video that
312 has been filtered out.
313 cookiefile: File name where cookies should be read from and dumped to
314 cookiesfrombrowser: A tuple containing the name of the browser and the profile
315 name/path from where cookies are loaded.
316 Eg: ('chrome', ) or ('vivaldi', 'default')
317 nocheckcertificate:Do not verify SSL certificates
318 prefer_insecure: Use HTTP instead of HTTPS to retrieve information.
319 At the moment, this is only supported by YouTube.
320 proxy: URL of the proxy server to use
321 geo_verification_proxy: URL of the proxy to use for IP address verification
322 on geo-restricted sites.
323 socket_timeout: Time to wait for unresponsive hosts, in seconds
324 bidi_workaround: Work around buggy terminals without bidirectional text
325 support, using fribidi
326 debug_printtraffic:Print out sent and received HTTP traffic
327 include_ads: Download ads as well
328 default_search: Prepend this string if an input url is not valid.
329 'auto' for elaborate guessing
330 encoding: Use this encoding instead of the system-specified.
331 extract_flat: Do not resolve URLs, return the immediate result.
332 Pass in 'in_playlist' to only show this behavior for
333 playlist items.
334 postprocessors: A list of dictionaries, each with an entry
335 * key: The name of the postprocessor. See
336 yt_dlp/postprocessor/__init__.py for a list.
337 * when: When to run the postprocessor. Can be one of
338 pre_process|before_dl|post_process|after_move.
339 Assumed to be 'post_process' if not given
340 post_hooks: Deprecated - Register a custom postprocessor instead
341 A list of functions that get called as the final step
342 for each video file, after all postprocessors have been
343 called. The filename will be passed as the only argument.
344 progress_hooks: A list of functions that get called on download
345 progress, with a dictionary with the entries
346 * status: One of "downloading", "error", or "finished".
347 Check this first and ignore unknown values.
348 * info_dict: The extracted info_dict
349
350 If status is one of "downloading", or "finished", the
351 following properties may also be present:
352 * filename: The final filename (always present)
353 * tmpfilename: The filename we're currently writing to
354 * downloaded_bytes: Bytes on disk
355 * total_bytes: Size of the whole file, None if unknown
356 * total_bytes_estimate: Guess of the eventual file size,
357 None if unavailable.
358 * elapsed: The number of seconds since download started.
359 * eta: The estimated time in seconds, None if unknown
360 * speed: The download speed in bytes/second, None if
361 unknown
362 * fragment_index: The counter of the currently
363 downloaded video fragment.
364 * fragment_count: The number of fragments (= individual
365 files that will be merged)
366
367 Progress hooks are guaranteed to be called at least once
368 (with status "finished") if the download is successful.
369 postprocessor_hooks: A list of functions that get called on postprocessing
370 progress, with a dictionary with the entries
371 * status: One of "started", "processing", or "finished".
372 Check this first and ignore unknown values.
373 * postprocessor: Name of the postprocessor
374 * info_dict: The extracted info_dict
375
376 Progress hooks are guaranteed to be called at least twice
377 (with status "started" and "finished") if the processing is successful.
378 merge_output_format: Extension to use when merging formats.
379 final_ext: Expected final extension; used to detect when the file was
380 already downloaded and converted
381 fixup: Automatically correct known faults of the file.
382 One of:
383 - "never": do nothing
384 - "warn": only emit a warning
385 - "detect_or_warn": check whether we can do anything
386 about it, warn otherwise (default)
387 source_address: Client-side IP address to bind to.
388 call_home: Boolean, true iff we are allowed to contact the
389 yt-dlp servers for debugging. (BROKEN)
390 sleep_interval_requests: Number of seconds to sleep between requests
391 during extraction
392 sleep_interval: Number of seconds to sleep before each download when
393 used alone or a lower bound of a range for randomized
394 sleep before each download (minimum possible number
395 of seconds to sleep) when used along with
396 max_sleep_interval.
397 max_sleep_interval:Upper bound of a range for randomized sleep before each
398 download (maximum possible number of seconds to sleep).
399 Must only be used along with sleep_interval.
400 Actual sleep time will be a random float from range
401 [sleep_interval; max_sleep_interval].
402 sleep_interval_subtitles: Number of seconds to sleep before each subtitle download
403 listformats: Print an overview of available video formats and exit.
404 list_thumbnails: Print a table of all thumbnails and exit.
405 match_filter: A function that gets called with the info_dict of
406 every video.
407 If it returns a message, the video is ignored.
408 If it returns None, the video is downloaded.
409 match_filter_func in utils.py is one example for this.
410 no_color: Do not emit color codes in output.
411 geo_bypass: Bypass geographic restriction via faking X-Forwarded-For
412 HTTP header
413 geo_bypass_country:
414 Two-letter ISO 3166-2 country code that will be used for
415 explicit geographic restriction bypassing via faking
416 X-Forwarded-For HTTP header
417 geo_bypass_ip_block:
418 IP range in CIDR notation that will be used similarly to
419 geo_bypass_country
420
421 The following options determine which downloader is picked:
422 external_downloader: A dictionary of protocol keys and the executable of the
423 external downloader to use for it. The allowed protocols
424 are default|http|ftp|m3u8|dash|rtsp|rtmp|mms.
425 Set the value to 'native' to use the native downloader
426 hls_prefer_native: Deprecated - Use external_downloader = {'m3u8': 'native'}
427 or {'m3u8': 'ffmpeg'} instead.
428 Use the native HLS downloader instead of ffmpeg/avconv
429 if True, otherwise use ffmpeg/avconv if False, otherwise
430 use downloader suggested by extractor if None.
431 compat_opts: Compatibility options. See "Differences in default behavior".
432 The following options do not work when used through the API:
433 filename, abort-on-error, multistreams, no-live-chat, format-sort
434 no-clean-infojson, no-playlist-metafiles, no-keep-subs.
435 Refer __init__.py for their implementation
436 progress_template: Dictionary of templates for progress outputs.
437 Allowed keys are 'download', 'postprocess',
438 'download-title' (console title) and 'postprocess-title'.
439 The template is mapped on a dictionary with keys 'progress' and 'info'
440
441 The following parameters are not used by YoutubeDL itself, they are used by
442 the downloader (see yt_dlp/downloader/common.py):
443 nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize,
444 max_filesize, test, noresizebuffer, retries, fragment_retries, continuedl,
445 noprogress, xattr_set_filesize, hls_use_mpegts, http_chunk_size,
446 external_downloader_args, concurrent_fragment_downloads.
447
448 The following options are used by the post processors:
449 prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available,
450 otherwise prefer ffmpeg. (avconv support is deprecated)
451 ffmpeg_location: Location of the ffmpeg/avconv binary; either the path
452 to the binary or its containing directory.
453 postprocessor_args: A dictionary of postprocessor/executable keys (in lower case)
454 and a list of additional command-line arguments for the
455 postprocessor/executable. The dict can also have "PP+EXE" keys
456 which are used when the given exe is used by the given PP.
457 Use 'default' as the name for arguments to be passed to all PP
458 For compatibility with youtube-dl, a single list of args
459 can also be used
460
461 The following options are used by the extractors:
462 extractor_retries: Number of times to retry for known errors
463 dynamic_mpd: Whether to process dynamic DASH manifests (default: True)
464 hls_split_discontinuity: Split HLS playlists to different formats at
465 discontinuities such as ad breaks (default: False)
466 extractor_args: A dictionary of arguments to be passed to the extractors.
467 See "EXTRACTOR ARGUMENTS" for details.
468 Eg: {'youtube': {'skip': ['dash', 'hls']}}
469 youtube_include_dash_manifest: Deprecated - Use extractor_args instead.
470 If True (default), DASH manifests and related
471 data will be downloaded and processed by extractor.
472 You can reduce network I/O by disabling it if you don't
473 care about DASH. (only for youtube)
474 youtube_include_hls_manifest: Deprecated - Use extractor_args instead.
475 If True (default), HLS manifests and related
476 data will be downloaded and processed by extractor.
477 You can reduce network I/O by disabling it if you don't
478 care about HLS. (only for youtube)
479 """
480
    # info_dict fields whose values are numeric (used when formatting/comparing
    # metadata fields)
    _NUMERIC_FIELDS = set((
        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
        'timestamp', 'release_timestamp',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
    ))

    # File extensions grouped by media kind
    # NOTE(review): presumably consulted by the format selector — confirm at use sites
    _format_selection_exts = {
        'audio': {'m4a', 'mp3', 'ogg', 'aac'},
        'video': {'mp4', 'flv', 'webm', '3gp'},
        'storyboards': {'mhtml'},
    }

    # Class-level defaults only; the real per-instance values are assigned in
    # __init__ (e.g. _ies, _pps and _printed_messages are re-created there so
    # instances do not share these mutable class attributes)
    params = None
    _ies = {}
    _pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
    _printed_messages = set()
    _first_webpage_request = True
    _download_retcode = None
    _num_downloads = None
    _playlist_level = 0
    _playlist_urls = set()
    _screen_file = None
507
    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options.

        @param params       Dictionary of options (see the class docstring).
                            None is treated as an empty dictionary.
        @param auto_init    Whether to load the default extractors and print header (if verbose).
                            Set to 'no_verbose_header' to not print the header
        """
        if params is None:
            params = {}
        # Per-instance state; re-created here so the mutable class-level
        # defaults are never shared between instances
        self._ies = {}
        self._ies_instances = {}
        self._pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
        self._printed_messages = set()
        self._first_webpage_request = True
        self._post_hooks = []
        self._progress_hooks = []
        self._postprocessor_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # 'logtostderr' picks stderr as the "screen"; note the bool indexes the list
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = params
        self.cache = Cache(self)

        windows_enable_vt_mode()
        # FIXME: This will break if we ever print color to stdout
        self._allow_colors = {
            'screen': not self.params.get('no_color') and supports_terminal_sequences(self._screen_file),
            'err': not self.params.get('no_color') and supports_terminal_sequences(self._err_file),
        }

        if sys.version_info < (3, 6):
            self.report_warning(
                'Python version %d.%d is not supported! Please update to Python 3.6 or above' % sys.version_info[:2])

        if self.params.get('allow_unplayable_formats'):
            self.report_warning(
                f'You have asked for {self._format_err("UNPLAYABLE", self.Styles.EMPHASIS)} formats to be listed/downloaded. '
                'This is a developer option intended for debugging. \n'
                '         If you experience any issues while using this option, '
                f'{self._format_err("DO NOT", self.Styles.ERROR)} open a bug report')

        def check_deprecated(param, option, suggestion):
            # Warn (once) if a deprecated option is set; returns whether it was set
            if self.params.get(param) is not None:
                self.report_warning('%s is deprecated. Use %s instead' % (option, suggestion))
                return True
            return False

        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
        check_deprecated('useid', '--id', '-o "%(id)s.%(ext)s"')

        for msg in self.params.get('_warnings', []):
            self.report_warning(msg)

        if 'list-formats' in self.params.get('compat_opts', []):
            self.params['listformats_table'] = False

        # Keep 'overwrites' and the legacy 'nooverwrites' key in sync
        if 'overwrites' not in self.params and self.params.get('nooverwrites') is not None:
            # nooverwrites was unnecessarily changed to overwrites
            # in 0c3d0f51778b153f65c21906031c2e091fcfb641
            # This ensures compatibility with both keys
            self.params['overwrites'] = not self.params['nooverwrites']
        elif self.params.get('overwrites') is None:
            self.params.pop('overwrites', None)
        else:
            self.params['nooverwrites'] = not self.params['overwrites']

        if params.get('bidi_workaround', False):
            try:
                import pty
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    # Prefer 'bidiv'; fall back to 'fribidi' if it is not installed
                    self._output_process = Popen(['bidiv'] + width_args, **sp_kwargs)
                except OSError:
                    self._output_process = Popen(['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning(
                        'Could not find fribidi executable, ignoring --bidi-workaround. '
                        'Make sure that fribidi is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.platform != 'win32'
                and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
                and not params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        self.outtmpl_dict = self.parse_outtmpl()

        # Creating format selector here allows us to catch syntax errors before the extraction
        self.format_selector = (
            None if self.params.get('format') is None
            else self.params['format'] if callable(self.params['format'])
            else self.build_format_selector(self.params['format']))

        self._setup_opener()

        if auto_init:
            if auto_init != 'no_verbose_header':
                self.print_debug_header()
            self.add_default_info_extractors()

        # Instantiate and register postprocessors declared via params
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_def = dict(pp_def_raw)
            when = pp_def.pop('when', 'post_process')
            pp_class = get_postprocessor(pp_def.pop('key'))
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp, when=when)

        for ph in self.params.get('post_hooks', []):
            self.add_post_hook(ph)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)

        register_socks_protocols()

        def preload_download_archive(fn):
            """Preload the archive, if any is specified"""
            if fn is None:
                return False
            self.write_debug(f'Loading archive file {fn!r}')
            try:
                with locked_file(fn, 'r', encoding='utf-8') as archive_file:
                    for line in archive_file:
                        self.archive.add(line.strip())
            except IOError as ioe:
                # A missing archive file is fine; anything else is re-raised
                if ioe.errno != errno.ENOENT:
                    raise
                return False
            return True

        self.archive = set()
        preload_download_archive(self.params.get('download_archive'))
661
662 def warn_if_short_id(self, argv):
663 # short YouTube ID starting with dash?
664 idxs = [
665 i for i, a in enumerate(argv)
666 if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
667 if idxs:
668 correct_argv = (
669 ['yt-dlp']
670 + [a for i, a in enumerate(argv) if i not in idxs]
671 + ['--'] + [argv[i] for i in idxs]
672 )
673 self.report_warning(
674 'Long argument string detected. '
675 'Use -- to separate parameters and URLs, like this:\n%s' %
676 args_to_str(correct_argv))
677
678 def add_info_extractor(self, ie):
679 """Add an InfoExtractor object to the end of the list."""
680 ie_key = ie.ie_key()
681 self._ies[ie_key] = ie
682 if not isinstance(ie, type):
683 self._ies_instances[ie_key] = ie
684 ie.set_downloader(self)
685
686 def _get_info_extractor_class(self, ie_key):
687 ie = self._ies.get(ie_key)
688 if ie is None:
689 ie = get_info_extractor(ie_key)
690 self.add_info_extractor(ie)
691 return ie
692
693 def get_info_extractor(self, ie_key):
694 """
695 Get an instance of an IE with name ie_key, it will try to get one from
696 the _ies list, if there's no instance it will create a new one and add
697 it to the extractor list.
698 """
699 ie = self._ies_instances.get(ie_key)
700 if ie is None:
701 ie = get_info_extractor(ie_key)()
702 self.add_info_extractor(ie)
703 return ie
704
705 def add_default_info_extractors(self):
706 """
707 Add the InfoExtractors returned by gen_extractors to the end of the list
708 """
709 for ie in gen_extractor_classes():
710 self.add_info_extractor(ie)
711
712 def add_post_processor(self, pp, when='post_process'):
713 """Add a PostProcessor object to the end of the chain."""
714 self._pps[when].append(pp)
715 pp.set_downloader(self)
716
717 def add_post_hook(self, ph):
718 """Add the post hook"""
719 self._post_hooks.append(ph)
720
721 def add_progress_hook(self, ph):
722 """Add the download progress hook"""
723 self._progress_hooks.append(ph)
724
725 def add_postprocessor_hook(self, ph):
726 """Add the postprocessing progress hook"""
727 self._postprocessor_hooks.append(ph)
728
729 def _bidi_workaround(self, message):
730 if not hasattr(self, '_output_channel'):
731 return message
732
733 assert hasattr(self, '_output_process')
734 assert isinstance(message, compat_str)
735 line_count = message.count('\n') + 1
736 self._output_process.stdin.write((message + '\n').encode('utf-8'))
737 self._output_process.stdin.flush()
738 res = ''.join(self._output_channel.readline().decode('utf-8')
739 for _ in range(line_count))
740 return res[:-len('\n')]
741
742 def _write_string(self, message, out=None, only_once=False):
743 if only_once:
744 if message in self._printed_messages:
745 return
746 self._printed_messages.add(message)
747 write_string(message, out=out, encoding=self.params.get('encoding'))
748
749 def to_stdout(self, message, skip_eol=False, quiet=False):
750 """Print message to stdout"""
751 if self.params.get('logger'):
752 self.params['logger'].debug(message)
753 elif not quiet or self.params.get('verbose'):
754 self._write_string(
755 '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')),
756 self._err_file if quiet else self._screen_file)
757
758 def to_stderr(self, message, only_once=False):
759 """Print message to stderr"""
760 assert isinstance(message, compat_str)
761 if self.params.get('logger'):
762 self.params['logger'].error(message)
763 else:
764 self._write_string('%s\n' % self._bidi_workaround(message), self._err_file, only_once=only_once)
765
766 def to_console_title(self, message):
767 if not self.params.get('consoletitle', False):
768 return
769 if compat_os_name == 'nt':
770 if ctypes.windll.kernel32.GetConsoleWindow():
771 # c_wchar_p() might not be necessary if `message` is
772 # already of type unicode()
773 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
774 elif 'TERM' in os.environ:
775 self._write_string('\033]0;%s\007' % message, self._screen_file)
776
777 def save_console_title(self):
778 if not self.params.get('consoletitle', False):
779 return
780 if self.params.get('simulate'):
781 return
782 if compat_os_name != 'nt' and 'TERM' in os.environ:
783 # Save the title on stack
784 self._write_string('\033[22;0t', self._screen_file)
785
786 def restore_console_title(self):
787 if not self.params.get('consoletitle', False):
788 return
789 if self.params.get('simulate'):
790 return
791 if compat_os_name != 'nt' and 'TERM' in os.environ:
792 # Restore the title from stack
793 self._write_string('\033[23;0t', self._screen_file)
794
795 def __enter__(self):
796 self.save_console_title()
797 return self
798
799 def __exit__(self, *args):
800 self.restore_console_title()
801
802 if self.params.get('cookiefile') is not None:
803 self.cookiejar.save(ignore_discard=True, ignore_expires=True)
804
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        # the active exception wraps an inner exc_info
                        # (e.g. a DownloadError); show the inner traceback first
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    # not inside an except block: show the current call stack instead
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            if tb:
                self.to_stderr(tb)
        if not self.params.get('ignoreerrors'):
            # prefer the wrapped exception's exc_info when available so the
            # original cause is preserved in the raised DownloadError
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        # errors are being ignored: record the failure in the process exit code
        self._download_retcode = 1
835
836 def to_screen(self, message, skip_eol=False):
837 """Print message to stdout if not in quiet mode"""
838 self.to_stdout(
839 message, skip_eol, quiet=self.params.get('quiet', False))
840
    class Styles(Enum):
        """Semantic terminal-color roles; values are color names consumed by __format_text."""
        HEADERS = 'yellow'
        EMPHASIS = 'blue'
        ID = 'green'
        DELIM = 'blue'
        ERROR = 'red'
        WARNING = 'yellow'
848
849 def __format_text(self, out, text, f, fallback=None, *, test_encoding=False):
850 assert out in ('screen', 'err')
851 if test_encoding:
852 original_text = text
853 handle = self._screen_file if out == 'screen' else self._err_file
854 encoding = self.params.get('encoding') or getattr(handle, 'encoding', 'ascii')
855 text = text.encode(encoding, 'ignore').decode(encoding)
856 if fallback is not None and text != original_text:
857 text = fallback
858 if isinstance(f, self.Styles):
859 f = f._value_
860 return format_text(text, f) if self._allow_colors[out] else text if fallback is None else fallback
861
862 def _format_screen(self, *args, **kwargs):
863 return self.__format_text('screen', *args, **kwargs)
864
865 def _format_err(self, *args, **kwargs):
866 return self.__format_text('err', *args, **kwargs)
867
868 def report_warning(self, message, only_once=False):
869 '''
870 Print the message to stderr, it will be prefixed with 'WARNING:'
871 If stderr is a tty file the 'WARNING:' will be colored
872 '''
873 if self.params.get('logger') is not None:
874 self.params['logger'].warning(message)
875 else:
876 if self.params.get('no_warnings'):
877 return
878 self.to_stderr(f'{self._format_err("WARNING:", self.Styles.WARNING)} {message}', only_once)
879
880 def report_error(self, message, tb=None):
881 '''
882 Do the same as trouble, but prefixes the message with 'ERROR:', colored
883 in red if stderr is a tty file.
884 '''
885 self.trouble(f'{self._format_err("ERROR:", self.Styles.ERROR)} {message}', tb)
886
887 def write_debug(self, message, only_once=False):
888 '''Log debug message or Print message to stderr'''
889 if not self.params.get('verbose', False):
890 return
891 message = '[debug] %s' % message
892 if self.params.get('logger'):
893 self.params['logger'].debug(message)
894 else:
895 self.to_stderr(message, only_once)
896
897 def report_file_already_downloaded(self, file_name):
898 """Report file has already been fully downloaded."""
899 try:
900 self.to_screen('[download] %s has already been downloaded' % file_name)
901 except UnicodeEncodeError:
902 self.to_screen('[download] The file has already been downloaded')
903
904 def report_file_delete(self, file_name):
905 """Report that existing file will be deleted."""
906 try:
907 self.to_screen('Deleting existing file %s' % file_name)
908 except UnicodeEncodeError:
909 self.to_screen('Deleting existing file')
910
911 def raise_no_formats(self, info, forced=False):
912 has_drm = info.get('__has_drm')
913 msg = 'This video is DRM protected' if has_drm else 'No video formats found!'
914 expected = self.params.get('ignore_no_formats_error')
915 if forced or not expected:
916 raise ExtractorError(msg, video_id=info['id'], ie=info['extractor'],
917 expected=has_drm or expected)
918 else:
919 self.report_warning(msg)
920
921 def parse_outtmpl(self):
922 outtmpl_dict = self.params.get('outtmpl', {})
923 if not isinstance(outtmpl_dict, dict):
924 outtmpl_dict = {'default': outtmpl_dict}
925 # Remove spaces in the default template
926 if self.params.get('restrictfilenames'):
927 sanitize = lambda x: x.replace(' - ', ' ').replace(' ', '-')
928 else:
929 sanitize = lambda x: x
930 outtmpl_dict.update({
931 k: sanitize(v) for k, v in DEFAULT_OUTTMPL.items()
932 if outtmpl_dict.get(k) is None})
933 for key, val in outtmpl_dict.items():
934 if isinstance(val, bytes):
935 self.report_warning(
936 'Parameter outtmpl is bytes, but should be a unicode string. '
937 'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.')
938 return outtmpl_dict
939
940 def get_output_path(self, dir_type='', filename=None):
941 paths = self.params.get('paths', {})
942 assert isinstance(paths, dict)
943 path = os.path.join(
944 expand_path(paths.get('home', '').strip()),
945 expand_path(paths.get(dir_type, '').strip()) if dir_type else '',
946 filename or '')
947
948 # Temporary fix for #4787
949 # 'Treat' all problem characters by passing filename through preferredencoding
950 # to workaround encoding issues with subprocess on python2 @ Windows
951 if sys.version_info < (3, 0) and sys.platform == 'win32':
952 path = encodeFilename(path, True).decode(preferredencoding())
953 return sanitize_path(path, force=self.params.get('windowsfilenames'))
954
955 @staticmethod
956 def _outtmpl_expandpath(outtmpl):
957 # expand_path translates '%%' into '%' and '$$' into '$'
958 # correspondingly that is not what we want since we need to keep
959 # '%%' intact for template dict substitution step. Working around
960 # with boundary-alike separator hack.
961 sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
962 outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
963
964 # outtmpl should be expand_path'ed before template dict substitution
965 # because meta fields may contain env variables we don't want to
966 # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
967 # title "Hello $PATH", we don't want `$PATH` to be expanded.
968 return expand_path(outtmpl).replace(sep, '')
969
970 @staticmethod
971 def escape_outtmpl(outtmpl):
972 ''' Escape any remaining strings like %s, %abc% etc. '''
973 return re.sub(
974 STR_FORMAT_RE_TMPL.format('', '(?![%(\0])'),
975 lambda mobj: ('' if mobj.group('has_key') else '%') + mobj.group(0),
976 outtmpl)
977
978 @classmethod
979 def validate_outtmpl(cls, outtmpl):
980 ''' @return None or Exception object '''
981 outtmpl = re.sub(
982 STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBU]'),
983 lambda mobj: f'{mobj.group(0)[:-1]}s',
984 cls._outtmpl_expandpath(outtmpl))
985 try:
986 cls.escape_outtmpl(outtmpl) % collections.defaultdict(int)
987 return None
988 except ValueError as err:
989 return err
990
991 @staticmethod
992 def _copy_infodict(info_dict):
993 info_dict = dict(info_dict)
994 for key in ('__original_infodict', '__postprocessors'):
995 info_dict.pop(key, None)
996 return info_dict
997
    def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None):
        """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict

        Each %(...)X field in outtmpl is rewritten to a NUL-delimited key into the
        returned template dict, whose values are the already-evaluated field values.
        sanitize, if given, is a callable (field, value) -> value applied to string-like
        results (it also switches the duration delimiter to '-').
        Returns (rewritten_outtmpl, template_dict).
        """
        info_dict.setdefault('epoch', int(time.time()))  # keep epoch consistent once set

        # work on a copy so callers' dicts don't grow the derived fields below
        info_dict = self._copy_infodict(info_dict)
        info_dict['duration_string'] = (  # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
            formatSeconds(info_dict['duration'], '-' if sanitize else ':')
            if info_dict.get('duration', None) is not None
            else None)
        info_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
        if info_dict.get('resolution') is None:
            info_dict['resolution'] = self.format_resolution(info_dict, default=None)

        # For fields playlist_index, playlist_autonumber and autonumber convert all occurrences
        # of %(field)s to %(field)0Nd for backward compatibility
        field_size_compat_map = {
            'playlist_index': number_of_digits(info_dict.get('_last_playlist_index') or 0),
            'playlist_autonumber': number_of_digits(info_dict.get('n_entries') or 0),
            'autonumber': self.params.get('autonumber_size') or 5,
        }

        TMPL_DICT = {}
        # matches the %(...)X fields the user may write in the template
        EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBU]'))
        MATH_FUNCTIONS = {
            '+': float.__add__,
            '-': float.__sub__,
        }
        # Field is of the form key1.key2...
        # where keys (except first) can be string, int or slice
        FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)')
        # NOTE(review): the `.` in the number pattern below is unescaped, so it
        # matches any single character rather than only a decimal point —
        # presumably r'\.' was intended; confirm before relying on it
        MATH_FIELD_RE = r'''{field}|{num}'''.format(field=FIELD_RE, num=r'-?\d+(?:.\d+)?')
        MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys()))
        # inner syntax of one field: [-]fields[+/-math][>strftime][,alternate][|default]
        INTERNAL_FORMAT_RE = re.compile(r'''(?x)
            (?P<negate>-)?
            (?P<fields>{field})
            (?P<maths>(?:{math_op}{math_field})*)
            (?:>(?P<strf_format>.+?))?
            (?P<alternate>(?<!\\),[^|)]+)?
            (?:\|(?P<default>.*?))?
            $'''.format(field=FIELD_RE, math_op=MATH_OPERATORS_RE, math_field=MATH_FIELD_RE))

        def _traverse_infodict(k):
            # dotted path lookup into info_dict; a leading '.' is ignored
            k = k.split('.')
            if k[0] == '':
                k.pop(0)
            return traverse_obj(info_dict, k, is_user_input=True, traverse_string=True)

        def get_value(mdict):
            """Evaluate one parsed field (groupdict of INTERNAL_FORMAT_RE) to its value."""
            # Object traversal
            value = _traverse_infodict(mdict['fields'])
            # Negative
            if mdict['negate']:
                value = float_or_none(value)
                if value is not None:
                    value *= -1
            # Do maths
            offset_key = mdict['maths']
            if offset_key:
                value = float_or_none(value)
                operator = None
                # alternately consume an operator then an operand until exhausted
                while offset_key:
                    item = re.match(
                        MATH_FIELD_RE if operator else MATH_OPERATORS_RE,
                        offset_key).group(0)
                    offset_key = offset_key[len(item):]
                    if operator is None:
                        operator = MATH_FUNCTIONS[item]
                        continue
                    item, multiplier = (item[1:], -1) if item[0] == '-' else (item, 1)
                    offset = float_or_none(item)
                    if offset is None:
                        # operand is itself a field name; resolve it
                        offset = float_or_none(_traverse_infodict(item))
                    try:
                        value = operator(value, multiplier * offset)
                    except (TypeError, ZeroDivisionError):
                        return None
                    operator = None
            # Datetime formatting
            if mdict['strf_format']:
                value = strftime_or_none(value, mdict['strf_format'].replace('\\,', ','))

            return value

        na = self.params.get('outtmpl_na_placeholder', 'NA')

        def _dumpjson_default(obj):
            # allow sets and LazyLists in %(...)j fields
            if isinstance(obj, (set, LazyList)):
                return list(obj)
            raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')

        def create_key(outer_mobj):
            """re.sub callback: evaluate one template field and register it in TMPL_DICT."""
            if not outer_mobj.group('has_key'):
                return outer_mobj.group(0)
            key = outer_mobj.group('key')
            mobj = re.match(INTERNAL_FORMAT_RE, key)
            initial_field = mobj.group('fields').split('.')[-1] if mobj else ''
            value, default = None, na
            # walk the comma-separated alternates until one yields a value
            while mobj:
                mobj = mobj.groupdict()
                default = mobj['default'] if mobj['default'] is not None else default
                value = get_value(mobj)
                if value is None and mobj['alternate']:
                    mobj = re.match(INTERNAL_FORMAT_RE, mobj['alternate'][1:])
                else:
                    break

            fmt = outer_mobj.group('format')
            # zero-pad the compat fields (see field_size_compat_map above)
            if fmt == 's' and value is not None and key in field_size_compat_map.keys():
                fmt = '0{:d}d'.format(field_size_compat_map[key])

            value = default if value is None else value

            flags = outer_mobj.group('conversion') or ''
            str_fmt = f'{fmt[:-1]}s'
            if fmt[-1] == 'l':  # list
                delim = '\n' if '#' in flags else ', '
                value, fmt = delim.join(variadic(value)), str_fmt
            elif fmt[-1] == 'j':  # json
                value, fmt = json.dumps(value, default=_dumpjson_default, indent=4 if '#' in flags else None), str_fmt
            elif fmt[-1] == 'q':  # quoted
                value = map(str, variadic(value) if '#' in flags else [value])
                value, fmt = ' '.join(map(compat_shlex_quote, value)), str_fmt
            elif fmt[-1] == 'B':  # bytes
                value = f'%{str_fmt}'.encode('utf-8') % str(value).encode('utf-8')
                value, fmt = value.decode('utf-8', 'ignore'), 's'
            elif fmt[-1] == 'U':  # unicode normalized
                value, fmt = unicodedata.normalize(
                    # "+" = compatibility equivalence, "#" = NFD
                    'NF%s%s' % ('K' if '+' in flags else '', 'D' if '#' in flags else 'C'),
                    value), str_fmt
            elif fmt[-1] == 'c':
                if value:
                    value = str(value)[0]
                else:
                    fmt = str_fmt
            elif fmt[-1] not in 'rs':  # numeric
                value = float_or_none(value)
                if value is None:
                    value, fmt = default, 's'

            if sanitize:
                if fmt[-1] == 'r':
                    # If value is an object, sanitize might convert it to a string
                    # So we convert it to repr first
                    value, fmt = repr(value), str_fmt
                if fmt[-1] in 'csr':
                    value = sanitize(initial_field, value)

            # NUL-delimited key avoids collisions with '%' in user field names
            key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format'))
            TMPL_DICT[key] = value
            return '{prefix}%({key}){fmt}'.format(key=key, fmt=fmt, prefix=outer_mobj.group('prefix'))

        return EXTERNAL_FORMAT_RE.sub(create_key, outtmpl), TMPL_DICT
1151
1152 def evaluate_outtmpl(self, outtmpl, info_dict, *args, **kwargs):
1153 outtmpl, info_dict = self.prepare_outtmpl(outtmpl, info_dict, *args, **kwargs)
1154 return self.escape_outtmpl(outtmpl) % info_dict
1155
    def _prepare_filename(self, info_dict, tmpl_type='default'):
        """Evaluate the output template of the given type against info_dict.

        Returns the (not yet path-sanitized) filename, or None when the
        template is invalid.
        """
        try:
            # per-field sanitizer passed down to evaluate_outtmpl; id-like
            # fields get is_id treatment by sanitize_filename
            sanitize = lambda k, v: sanitize_filename(
                compat_str(v),
                restricted=self.params.get('restrictfilenames'),
                is_id=(k == 'id' or k.endswith('_id')))
            outtmpl = self._outtmpl_expandpath(self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default']))
            filename = self.evaluate_outtmpl(outtmpl, info_dict, sanitize)

            # some template types force their own extension (see OUTTMPL_TYPES)
            force_ext = OUTTMPL_TYPES.get(tmpl_type)
            if filename and force_ext is not None:
                filename = replace_extension(filename, force_ext, info_dict.get('ext'))

            # https://github.com/blackjack4494/youtube-dlc/issues/85
            trim_file_name = self.params.get('trim_file_name', False)
            if trim_file_name:
                # NOTE(review): rsplit('.') splits on every dot, so for
                # 'a.b.c.ext' only the first part, the sub-extension and the
                # extension survive trimming — middle parts are dropped;
                # presumably acceptable for --trim-filenames, but verify
                fn_groups = filename.rsplit('.')
                ext = fn_groups[-1]
                sub_ext = ''
                if len(fn_groups) > 2:
                    sub_ext = fn_groups[-2]
                filename = join_nonempty(fn_groups[0][:trim_file_name], sub_ext, ext, delim='.')

            return filename
        except ValueError as err:
            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
            return None
1183
1184 def prepare_filename(self, info_dict, dir_type='', warn=False):
1185 """Generate the output filename."""
1186
1187 filename = self._prepare_filename(info_dict, dir_type or 'default')
1188 if not filename and dir_type not in ('', 'temp'):
1189 return ''
1190
1191 if warn:
1192 if not self.params.get('paths'):
1193 pass
1194 elif filename == '-':
1195 self.report_warning('--paths is ignored when an outputting to stdout', only_once=True)
1196 elif os.path.isabs(filename):
1197 self.report_warning('--paths is ignored since an absolute path is given in output template', only_once=True)
1198 if filename == '-' or not filename:
1199 return filename
1200
1201 return self.get_output_path(dir_type, filename)
1202
    def _match_entry(self, info_dict, incomplete=False, silent=False):
        """ Returns None if the file should be downloaded """

        video_title = info_dict.get('title', info_dict.get('id', 'video'))

        def check_filter():
            # Returns a skip-reason string, or None to accept the entry
            if 'title' in info_dict:
                # This can happen when we're just evaluating the playlist
                title = info_dict['title']
                matchtitle = self.params.get('matchtitle', False)
                if matchtitle:
                    if not re.search(matchtitle, title, re.IGNORECASE):
                        return '"' + title + '" title did not match pattern "' + matchtitle + '"'
                rejecttitle = self.params.get('rejecttitle', False)
                if rejecttitle:
                    if re.search(rejecttitle, title, re.IGNORECASE):
                        return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
            date = info_dict.get('upload_date')
            if date is not None:
                dateRange = self.params.get('daterange', DateRange())
                if date not in dateRange:
                    return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
            view_count = info_dict.get('view_count')
            if view_count is not None:
                min_views = self.params.get('min_views')
                if min_views is not None and view_count < min_views:
                    return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
                max_views = self.params.get('max_views')
                if max_views is not None and view_count > max_views:
                    return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
            if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
                return 'Skipping "%s" because it is age restricted' % video_title

            match_filter = self.params.get('match_filter')
            if match_filter is not None:
                try:
                    # new-style filters accept the incomplete keyword
                    ret = match_filter(info_dict, incomplete=incomplete)
                except TypeError:
                    # For backward compatibility
                    ret = None if incomplete else match_filter(info_dict)
                if ret is not None:
                    return ret
            return None

        if self.in_download_archive(info_dict):
            reason = '%s has already been recorded in the archive' % video_title
            break_opt, break_err = 'break_on_existing', ExistingVideoReached
        else:
            reason = check_filter()
            break_opt, break_err = 'break_on_reject', RejectedVideoReached
        if reason is not None:
            if not silent:
                self.to_screen('[download] ' + reason)
            # --break-on-existing / --break-on-reject abort the whole run
            if self.params.get(break_opt, False):
                raise break_err()
        return reason
1259
1260 @staticmethod
1261 def add_extra_info(info_dict, extra_info):
1262 '''Set the keys from extra_info in info dict if they are missing'''
1263 for key, value in extra_info.items():
1264 info_dict.setdefault(key, value)
1265
    def extract_info(self, url, download=True, ie_key=None, extra_info=None,
                     process=True, force_generic_extractor=False):
        """
        Return a list with a dictionary for each video extracted.

        Arguments:
        url -- URL to extract

        Keyword arguments:
        download -- whether to download videos during extraction
        ie_key -- extractor key hint
        extra_info -- dictionary containing the extra values to add to each result
        process -- whether to resolve all unresolved references (URLs, playlist items),
                   must be True for download to work.
        force_generic_extractor -- force using the generic extractor
        """

        if extra_info is None:
            extra_info = {}

        if not ie_key and force_generic_extractor:
            ie_key = 'Generic'

        if ie_key:
            ies = {ie_key: self._get_info_extractor_class(ie_key)}
        else:
            ies = self._ies

        # first suitable extractor wins; extractors are tried in registration order
        for ie_key, ie in ies.items():
            if not ie.suitable(url):
                continue

            if not ie.working():
                self.report_warning('The program functionality for this site has been marked as broken, '
                                    'and will probably not work.')

            temp_id = ie.get_temp_id(url)
            if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
                self.to_screen("[%s] %s: has already been recorded in archive" % (
                    ie_key, temp_id))
                # break skips the for-else below, so an archived hit returns
                # None without reporting an error
                break
            return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process)
        else:
            # no extractor matched the URL at all
            self.report_error('no suitable InfoExtractor for URL %s' % url)
1310
    def __handle_extraction_exceptions(func):
        """Decorator routing extraction exceptions to uniform error reporting.

        Geo-restriction and extractor errors are reported (not raised);
        throttling triggers a retry; cancellations always propagate; other
        exceptions propagate unless 'ignoreerrors' is set.
        """
        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            try:
                return func(self, *args, **kwargs)
            except GeoRestrictedError as e:
                msg = e.msg
                if e.countries:
                    msg += '\nThis video is available in %s.' % ', '.join(
                        map(ISO3166Utils.short2full, e.countries))
                msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
                self.report_error(msg)
            except ExtractorError as e:  # An error we somewhat expected
                self.report_error(compat_str(e), e.format_traceback())
            except ThrottledDownload as e:
                self.to_stderr('\r')
                self.report_warning(f'{e}; Re-extracting data')
                # retry the whole extraction after being throttled
                return wrapper(self, *args, **kwargs)
            except (DownloadCancelled, LazyList.IndexError):
                raise
            except Exception as e:
                if self.params.get('ignoreerrors'):
                    self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
                else:
                    raise
        return wrapper
1337
    @__handle_extraction_exceptions
    def __extract_info(self, url, ie, download, extra_info, process):
        """Run extractor *ie* on *url* and optionally process/download the result."""
        ie_result = ie.extract(url)
        if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
            return
        if isinstance(ie_result, list):
            # Backwards compatibility: old IE result format
            ie_result = {
                '_type': 'compat_list',
                'entries': ie_result,
            }
        if extra_info.get('original_url'):
            # keep the URL the user originally passed in, even through redirects
            ie_result.setdefault('original_url', extra_info['original_url'])
        self.add_default_extra_info(ie_result, ie, url)
        if process:
            return self.process_ie_result(ie_result, download, extra_info)
        else:
            return ie_result
1356
1357 def add_default_extra_info(self, ie_result, ie, url):
1358 if url is not None:
1359 self.add_extra_info(ie_result, {
1360 'webpage_url': url,
1361 'original_url': url,
1362 'webpage_url_basename': url_basename(url),
1363 })
1364 if ie is not None:
1365 self.add_extra_info(ie_result, {
1366 'extractor': ie.IE_NAME,
1367 'extractor_key': ie.ie_key(),
1368 })
1369
1370 def process_ie_result(self, ie_result, download=True, extra_info=None):
1371 """
1372 Take the result of the ie(may be modified) and resolve all unresolved
1373 references (URLs, playlist items).
1374
1375 It will also download the videos if 'download'.
1376 Returns the resolved ie_result.
1377 """
1378 if extra_info is None:
1379 extra_info = {}
1380 result_type = ie_result.get('_type', 'video')
1381
1382 if result_type in ('url', 'url_transparent'):
1383 ie_result['url'] = sanitize_url(ie_result['url'])
1384 if ie_result.get('original_url'):
1385 extra_info.setdefault('original_url', ie_result['original_url'])
1386
1387 extract_flat = self.params.get('extract_flat', False)
1388 if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
1389 or extract_flat is True):
1390 info_copy = ie_result.copy()
1391 ie = try_get(ie_result.get('ie_key'), self.get_info_extractor)
1392 if ie and not ie_result.get('id'):
1393 info_copy['id'] = ie.get_temp_id(ie_result['url'])
1394 self.add_default_extra_info(info_copy, ie, ie_result['url'])
1395 self.add_extra_info(info_copy, extra_info)
1396 self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True)
1397 if self.params.get('force_write_download_archive', False):
1398 self.record_download_archive(info_copy)
1399 return ie_result
1400
1401 if result_type == 'video':
1402 self.add_extra_info(ie_result, extra_info)
1403 ie_result = self.process_video_result(ie_result, download=download)
1404 additional_urls = (ie_result or {}).get('additional_urls')
1405 if additional_urls:
1406 # TODO: Improve MetadataParserPP to allow setting a list
1407 if isinstance(additional_urls, compat_str):
1408 additional_urls = [additional_urls]
1409 self.to_screen(
1410 '[info] %s: %d additional URL(s) requested' % (ie_result['id'], len(additional_urls)))
1411 self.write_debug('Additional URLs: "%s"' % '", "'.join(additional_urls))
1412 ie_result['additional_entries'] = [
1413 self.extract_info(
1414 url, download, extra_info,
1415 force_generic_extractor=self.params.get('force_generic_extractor'))
1416 for url in additional_urls
1417 ]
1418 return ie_result
1419 elif result_type == 'url':
1420 # We have to add extra_info to the results because it may be
1421 # contained in a playlist
1422 return self.extract_info(
1423 ie_result['url'], download,
1424 ie_key=ie_result.get('ie_key'),
1425 extra_info=extra_info)
1426 elif result_type == 'url_transparent':
1427 # Use the information from the embedding page
1428 info = self.extract_info(
1429 ie_result['url'], ie_key=ie_result.get('ie_key'),
1430 extra_info=extra_info, download=False, process=False)
1431
1432 # extract_info may return None when ignoreerrors is enabled and
1433 # extraction failed with an error, don't crash and return early
1434 # in this case
1435 if not info:
1436 return info
1437
1438 force_properties = dict(
1439 (k, v) for k, v in ie_result.items() if v is not None)
1440 for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
1441 if f in force_properties:
1442 del force_properties[f]
1443 new_result = info.copy()
1444 new_result.update(force_properties)
1445
1446 # Extracted info may not be a video result (i.e.
1447 # info.get('_type', 'video') != video) but rather an url or
1448 # url_transparent. In such cases outer metadata (from ie_result)
1449 # should be propagated to inner one (info). For this to happen
1450 # _type of info should be overridden with url_transparent. This
1451 # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
1452 if new_result.get('_type') == 'url':
1453 new_result['_type'] = 'url_transparent'
1454
1455 return self.process_ie_result(
1456 new_result, download=download, extra_info=extra_info)
1457 elif result_type in ('playlist', 'multi_video'):
1458 # Protect from infinite recursion due to recursively nested playlists
1459 # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
1460 webpage_url = ie_result['webpage_url']
1461 if webpage_url in self._playlist_urls:
1462 self.to_screen(
1463 '[download] Skipping already downloaded playlist: %s'
1464 % ie_result.get('title') or ie_result.get('id'))
1465 return
1466
1467 self._playlist_level += 1
1468 self._playlist_urls.add(webpage_url)
1469 self._sanitize_thumbnails(ie_result)
1470 try:
1471 return self.__process_playlist(ie_result, download)
1472 finally:
1473 self._playlist_level -= 1
1474 if not self._playlist_level:
1475 self._playlist_urls.clear()
1476 elif result_type == 'compat_list':
1477 self.report_warning(
1478 'Extractor %s returned a compat_list result. '
1479 'It needs to be updated.' % ie_result.get('extractor'))
1480
1481 def _fixup(r):
1482 self.add_extra_info(r, {
1483 'extractor': ie_result['extractor'],
1484 'webpage_url': ie_result['webpage_url'],
1485 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1486 'extractor_key': ie_result['extractor_key'],
1487 })
1488 return r
1489 ie_result['entries'] = [
1490 self.process_ie_result(_fixup(r), download, extra_info)
1491 for r in ie_result['entries']
1492 ]
1493 return ie_result
1494 else:
1495 raise Exception('Invalid result type: %s' % result_type)
1496
1497 def _ensure_dir_exists(self, path):
1498 return make_dir(path, self.report_error)
1499
1500 def __process_playlist(self, ie_result, download):
1501 # We process each entry in the playlist
1502 playlist = ie_result.get('title') or ie_result.get('id')
1503 self.to_screen('[download] Downloading playlist: %s' % playlist)
1504
1505 if 'entries' not in ie_result:
1506 raise EntryNotInPlaylist('There are no entries')
1507 incomplete_entries = bool(ie_result.get('requested_entries'))
1508 if incomplete_entries:
1509 def fill_missing_entries(entries, indexes):
1510 ret = [None] * max(*indexes)
1511 for i, entry in zip(indexes, entries):
1512 ret[i - 1] = entry
1513 return ret
1514 ie_result['entries'] = fill_missing_entries(ie_result['entries'], ie_result['requested_entries'])
1515
1516 playlist_results = []
1517
1518 playliststart = self.params.get('playliststart', 1)
1519 playlistend = self.params.get('playlistend')
1520 # For backwards compatibility, interpret -1 as whole list
1521 if playlistend == -1:
1522 playlistend = None
1523
1524 playlistitems_str = self.params.get('playlist_items')
1525 playlistitems = None
1526 if playlistitems_str is not None:
1527 def iter_playlistitems(format):
1528 for string_segment in format.split(','):
1529 if '-' in string_segment:
1530 start, end = string_segment.split('-')
1531 for item in range(int(start), int(end) + 1):
1532 yield int(item)
1533 else:
1534 yield int(string_segment)
1535 playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
1536
1537 ie_entries = ie_result['entries']
1538 msg = (
1539 'Downloading %d videos' if not isinstance(ie_entries, list)
1540 else 'Collected %d videos; downloading %%d of them' % len(ie_entries))
1541
1542 if isinstance(ie_entries, list):
1543 def get_entry(i):
1544 return ie_entries[i - 1]
1545 else:
1546 if not isinstance(ie_entries, (PagedList, LazyList)):
1547 ie_entries = LazyList(ie_entries)
1548
1549 def get_entry(i):
1550 return YoutubeDL.__handle_extraction_exceptions(
1551 lambda self, i: ie_entries[i - 1]
1552 )(self, i)
1553
1554 entries = []
1555 items = playlistitems if playlistitems is not None else itertools.count(playliststart)
1556 for i in items:
1557 if i == 0:
1558 continue
1559 if playlistitems is None and playlistend is not None and playlistend < i:
1560 break
1561 entry = None
1562 try:
1563 entry = get_entry(i)
1564 if entry is None:
1565 raise EntryNotInPlaylist()
1566 except (IndexError, EntryNotInPlaylist):
1567 if incomplete_entries:
1568 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
1569 elif not playlistitems:
1570 break
1571 entries.append(entry)
1572 try:
1573 if entry is not None:
1574 self._match_entry(entry, incomplete=True, silent=True)
1575 except (ExistingVideoReached, RejectedVideoReached):
1576 break
1577 ie_result['entries'] = entries
1578
1579 # Save playlist_index before re-ordering
1580 entries = [
1581 ((playlistitems[i - 1] if playlistitems else i + playliststart - 1), entry)
1582 for i, entry in enumerate(entries, 1)
1583 if entry is not None]
1584 n_entries = len(entries)
1585
1586 if not playlistitems and (playliststart or playlistend):
1587 playlistitems = list(range(playliststart, playliststart + n_entries))
1588 ie_result['requested_entries'] = playlistitems
1589
1590 if not self.params.get('simulate') and self.params.get('allow_playlist_files', True):
1591 ie_copy = {
1592 'playlist': playlist,
1593 'playlist_id': ie_result.get('id'),
1594 'playlist_title': ie_result.get('title'),
1595 'playlist_uploader': ie_result.get('uploader'),
1596 'playlist_uploader_id': ie_result.get('uploader_id'),
1597 'playlist_index': 0,
1598 'n_entries': n_entries,
1599 }
1600 ie_copy.update(dict(ie_result))
1601
1602 if self._write_info_json('playlist', ie_result,
1603 self.prepare_filename(ie_copy, 'pl_infojson')) is None:
1604 return
1605 if self._write_description('playlist', ie_result,
1606 self.prepare_filename(ie_copy, 'pl_description')) is None:
1607 return
1608 # TODO: This should be passed to ThumbnailsConvertor if necessary
1609 self._write_thumbnails('playlist', ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail'))
1610
1611 if self.params.get('playlistreverse', False):
1612 entries = entries[::-1]
1613 if self.params.get('playlistrandom', False):
1614 random.shuffle(entries)
1615
1616 x_forwarded_for = ie_result.get('__x_forwarded_for_ip')
1617
1618 self.to_screen('[%s] playlist %s: %s' % (ie_result['extractor'], playlist, msg % n_entries))
1619 failures = 0
1620 max_failures = self.params.get('skip_playlist_after_errors') or float('inf')
1621 for i, entry_tuple in enumerate(entries, 1):
1622 playlist_index, entry = entry_tuple
1623 if 'playlist-index' in self.params.get('compat_opts', []):
1624 playlist_index = playlistitems[i - 1] if playlistitems else i + playliststart - 1
1625 self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
1626 # This __x_forwarded_for_ip thing is a bit ugly but requires
1627 # minimal changes
1628 if x_forwarded_for:
1629 entry['__x_forwarded_for_ip'] = x_forwarded_for
1630 extra = {
1631 'n_entries': n_entries,
1632 '_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries),
1633 'playlist_index': playlist_index,
1634 'playlist_autonumber': i,
1635 'playlist': playlist,
1636 'playlist_id': ie_result.get('id'),
1637 'playlist_title': ie_result.get('title'),
1638 'playlist_uploader': ie_result.get('uploader'),
1639 'playlist_uploader_id': ie_result.get('uploader_id'),
1640 'extractor': ie_result['extractor'],
1641 'webpage_url': ie_result['webpage_url'],
1642 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1643 'extractor_key': ie_result['extractor_key'],
1644 }
1645
1646 if self._match_entry(entry, incomplete=True) is not None:
1647 continue
1648
1649 entry_result = self.__process_iterable_entry(entry, download, extra)
1650 if not entry_result:
1651 failures += 1
1652 if failures >= max_failures:
1653 self.report_error(
1654 'Skipping the remaining entries in playlist "%s" since %d items failed extraction' % (playlist, failures))
1655 break
1656 # TODO: skip failed (empty) entries?
1657 playlist_results.append(entry_result)
1658 ie_result['entries'] = playlist_results
1659 self.to_screen('[download] Finished downloading playlist: %s' % playlist)
1660 return ie_result
1661
1662 @__handle_extraction_exceptions
1663 def __process_iterable_entry(self, entry, download, extra_info):
1664 return self.process_ie_result(
1665 entry, download=download, extra_info=extra_info)
1666
1667 def _build_format_filter(self, filter_spec):
1668 " Returns a function to filter the formats according to the filter_spec "
1669
1670 OPERATORS = {
1671 '<': operator.lt,
1672 '<=': operator.le,
1673 '>': operator.gt,
1674 '>=': operator.ge,
1675 '=': operator.eq,
1676 '!=': operator.ne,
1677 }
1678 operator_rex = re.compile(r'''(?x)\s*
1679 (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)\s*
1680 (?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1681 (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s*
1682 ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1683 m = operator_rex.fullmatch(filter_spec)
1684 if m:
1685 try:
1686 comparison_value = int(m.group('value'))
1687 except ValueError:
1688 comparison_value = parse_filesize(m.group('value'))
1689 if comparison_value is None:
1690 comparison_value = parse_filesize(m.group('value') + 'B')
1691 if comparison_value is None:
1692 raise ValueError(
1693 'Invalid value %r in format specification %r' % (
1694 m.group('value'), filter_spec))
1695 op = OPERATORS[m.group('op')]
1696
1697 if not m:
1698 STR_OPERATORS = {
1699 '=': operator.eq,
1700 '^=': lambda attr, value: attr.startswith(value),
1701 '$=': lambda attr, value: attr.endswith(value),
1702 '*=': lambda attr, value: value in attr,
1703 }
1704 str_operator_rex = re.compile(r'''(?x)\s*
1705 (?P<key>[a-zA-Z0-9._-]+)\s*
1706 (?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1707 (?P<value>[a-zA-Z0-9._-]+)\s*
1708 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1709 m = str_operator_rex.fullmatch(filter_spec)
1710 if m:
1711 comparison_value = m.group('value')
1712 str_op = STR_OPERATORS[m.group('op')]
1713 if m.group('negation'):
1714 op = lambda attr, value: not str_op(attr, value)
1715 else:
1716 op = str_op
1717
1718 if not m:
1719 raise SyntaxError('Invalid filter specification %r' % filter_spec)
1720
1721 def _filter(f):
1722 actual_value = f.get(m.group('key'))
1723 if actual_value is None:
1724 return m.group('none_inclusive')
1725 return op(actual_value, comparison_value)
1726 return _filter
1727
1728 def _check_formats(self, formats):
1729 for f in formats:
1730 self.to_screen('[info] Testing format %s' % f['format_id'])
1731 temp_file = tempfile.NamedTemporaryFile(
1732 suffix='.tmp', delete=False,
1733 dir=self.get_output_path('temp') or None)
1734 temp_file.close()
1735 try:
1736 success, _ = self.dl(temp_file.name, f, test=True)
1737 except (DownloadError, IOError, OSError, ValueError) + network_exceptions:
1738 success = False
1739 finally:
1740 if os.path.exists(temp_file.name):
1741 try:
1742 os.remove(temp_file.name)
1743 except OSError:
1744 self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
1745 if success:
1746 yield f
1747 else:
1748 self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])
1749
1750 def _default_format_spec(self, info_dict, download=True):
1751
1752 def can_merge():
1753 merger = FFmpegMergerPP(self)
1754 return merger.available and merger.can_merge()
1755
1756 prefer_best = (
1757 not self.params.get('simulate')
1758 and download
1759 and (
1760 not can_merge()
1761 or info_dict.get('is_live', False)
1762 or self.outtmpl_dict['default'] == '-'))
1763 compat = (
1764 prefer_best
1765 or self.params.get('allow_multiple_audio_streams', False)
1766 or 'format-spec' in self.params.get('compat_opts', []))
1767
1768 return (
1769 'best/bestvideo+bestaudio' if prefer_best
1770 else 'bestvideo*+bestaudio/best' if not compat
1771 else 'bestvideo+bestaudio/best')
1772
    def build_format_selector(self, format_spec):
        """Parse *format_spec* (e.g. 'bestvideo*+bestaudio/best') and return a
        selector function.

        The returned function takes a ctx dict with keys 'formats' and
        'incomplete_formats' and returns an iterable of the selected format
        dicts (merged formats carry a 'requested_formats' list).
        Raises SyntaxError on an invalid spec.
        """
        def syntax_error(note, start):
            # start is a tokenizer (row, col) pair; col positions the caret
            message = (
                'Invalid format specification: '
                '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
            return SyntaxError(message)

        # Node types of the parsed selector tree
        PICKFIRST = 'PICKFIRST'
        MERGE = 'MERGE'
        SINGLE = 'SINGLE'
        GROUP = 'GROUP'
        FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])

        allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False),
                                  'video': self.params.get('allow_multiple_video_streams', False)}

        # Only test-download selected formats when check_formats == 'selected'
        check_formats = self.params.get('check_formats') == 'selected'

        def _parse_filter(tokens):
            # Join everything up to the closing ']' into one filter string
            # (returns None implicitly if the ']' is missing)
            filter_parts = []
            for type, string, start, _, _ in tokens:
                if type == tokenize.OP and string == ']':
                    return ''.join(filter_parts)
                else:
                    filter_parts.append(string)

        def _remove_unused_ops(tokens):
            # Remove operators that we don't use and join them with the surrounding strings
            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
            ALLOWED_OPS = ('/', '+', ',', '(', ')')
            last_string, last_start, last_end, last_line = None, None, None, None
            for type, string, start, end, line in tokens:
                if type == tokenize.OP and string == '[':
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                    # everything inside brackets will be handled by _parse_filter
                    for type, string, start, end, line in tokens:
                        yield type, string, start, end, line
                        if type == tokenize.OP and string == ']':
                            break
                elif type == tokenize.OP and string in ALLOWED_OPS:
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
                    # Fuse adjacent names/numbers/unknown operators into one NAME
                    if not last_string:
                        last_string = string
                        last_start = start
                        last_end = end
                    else:
                        last_string += string
            if last_string:
                yield tokenize.NAME, last_string, last_start, last_end, last_line

        def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
            # Recursive-descent parse of the token stream into FormatSelector
            # nodes; the inside_* flags control which operators end the
            # current (sub-)expression
            selectors = []
            current_selector = None
            for type, string, start, _, _ in tokens:
                # ENCODING is only defined in python 3.x
                if type == getattr(tokenize, 'ENCODING', None):
                    continue
                elif type in [tokenize.NAME, tokenize.NUMBER]:
                    current_selector = FormatSelector(SINGLE, string, [])
                elif type == tokenize.OP:
                    if string == ')':
                        if not inside_group:
                            # ')' will be handled by the parentheses group
                            tokens.restore_last_token()
                        break
                    elif inside_merge and string in ['/', ',']:
                        tokens.restore_last_token()
                        break
                    elif inside_choice and string == ',':
                        tokens.restore_last_token()
                        break
                    elif string == ',':
                        if not current_selector:
                            raise syntax_error('"," must follow a format selector', start)
                        selectors.append(current_selector)
                        current_selector = None
                    elif string == '/':
                        if not current_selector:
                            raise syntax_error('"/" must follow a format selector', start)
                        first_choice = current_selector
                        second_choice = _parse_format_selection(tokens, inside_choice=True)
                        current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
                    elif string == '[':
                        if not current_selector:
                            current_selector = FormatSelector(SINGLE, 'best', [])
                        format_filter = _parse_filter(tokens)
                        current_selector.filters.append(format_filter)
                    elif string == '(':
                        if current_selector:
                            raise syntax_error('Unexpected "("', start)
                        group = _parse_format_selection(tokens, inside_group=True)
                        current_selector = FormatSelector(GROUP, group, [])
                    elif string == '+':
                        if not current_selector:
                            raise syntax_error('Unexpected "+"', start)
                        selector_1 = current_selector
                        selector_2 = _parse_format_selection(tokens, inside_merge=True)
                        if not selector_2:
                            raise syntax_error('Expected a selector', start)
                        current_selector = FormatSelector(MERGE, (selector_1, selector_2), [])
                    else:
                        raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
                elif type == tokenize.ENDMARKER:
                    break
            if current_selector:
                selectors.append(current_selector)
            return selectors

        def _merge(formats_pair):
            # Combine two selected formats (e.g. video+audio) into one dict
            # describing the merged download
            format_1, format_2 = formats_pair

            formats_info = []
            formats_info.extend(format_1.get('requested_formats', (format_1,)))
            formats_info.extend(format_2.get('requested_formats', (format_2,)))

            if not allow_multiple_streams['video'] or not allow_multiple_streams['audio']:
                # Drop surplus audio/video streams (and storyboard-like
                # formats with neither codec) when multi-stream is disallowed
                get_no_more = {'video': False, 'audio': False}
                for (i, fmt_info) in enumerate(formats_info):
                    if fmt_info.get('acodec') == fmt_info.get('vcodec') == 'none':
                        formats_info.pop(i)
                        continue
                    for aud_vid in ['audio', 'video']:
                        if not allow_multiple_streams[aud_vid] and fmt_info.get(aud_vid[0] + 'codec') != 'none':
                            if get_no_more[aud_vid]:
                                formats_info.pop(i)
                                break
                            get_no_more[aud_vid] = True

            if len(formats_info) == 1:
                return formats_info[0]

            video_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('vcodec') != 'none']
            audio_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('acodec') != 'none']

            the_only_video = video_fmts[0] if len(video_fmts) == 1 else None
            the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None

            output_ext = self.params.get('merge_output_format')
            if not output_ext:
                if the_only_video:
                    output_ext = the_only_video['ext']
                elif the_only_audio and not video_fmts:
                    output_ext = the_only_audio['ext']
                else:
                    output_ext = 'mkv'

            filtered = lambda *keys: filter(None, (traverse_obj(fmt, *keys) for fmt in formats_info))

            new_dict = {
                'requested_formats': formats_info,
                'format': '+'.join(filtered('format')),
                'format_id': '+'.join(filtered('format_id')),
                'ext': output_ext,
                'protocol': '+'.join(map(determine_protocol, formats_info)),
                'language': '+'.join(orderedSet(filtered('language'))) or None,
                'format_note': '+'.join(orderedSet(filtered('format_note'))) or None,
                'filesize_approx': sum(filtered('filesize', 'filesize_approx')) or None,
                'tbr': sum(filtered('tbr', 'vbr', 'abr')),
            }

            if the_only_video:
                new_dict.update({
                    'width': the_only_video.get('width'),
                    'height': the_only_video.get('height'),
                    'resolution': the_only_video.get('resolution') or self.format_resolution(the_only_video),
                    'fps': the_only_video.get('fps'),
                    'dynamic_range': the_only_video.get('dynamic_range'),
                    'vcodec': the_only_video.get('vcodec'),
                    'vbr': the_only_video.get('vbr'),
                    'stretched_ratio': the_only_video.get('stretched_ratio'),
                })

            if the_only_audio:
                new_dict.update({
                    'acodec': the_only_audio.get('acodec'),
                    'abr': the_only_audio.get('abr'),
                    'asr': the_only_audio.get('asr'),
                })

            return new_dict

        def _check_formats(formats):
            # Pass-through unless check_formats == 'selected' (see above)
            if not check_formats:
                yield from formats
                return
            yield from self._check_formats(formats)

        def _build_selector_function(selector):
            # Compile a FormatSelector tree into a ctx -> formats function
            if isinstance(selector, list):  # ,
                fs = [_build_selector_function(s) for s in selector]

                def selector_function(ctx):
                    for f in fs:
                        yield from f(ctx)
                return selector_function

            elif selector.type == GROUP:  # ()
                selector_function = _build_selector_function(selector.selector)

            elif selector.type == PICKFIRST:  # /
                fs = [_build_selector_function(s) for s in selector.selector]

                def selector_function(ctx):
                    for f in fs:
                        picked_formats = list(f(ctx))
                        if picked_formats:
                            return picked_formats
                    return []

            elif selector.type == MERGE:  # +
                selector_1, selector_2 = map(_build_selector_function, selector.selector)

                def selector_function(ctx):
                    # Deep copies so each side's filtering can't affect the other
                    for pair in itertools.product(
                            selector_1(copy.deepcopy(ctx)), selector_2(copy.deepcopy(ctx))):
                        yield _merge(pair)

            elif selector.type == SINGLE:  # atom
                format_spec = selector.selector or 'best'

                # TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector
                if format_spec == 'all':
                    def selector_function(ctx):
                        yield from _check_formats(ctx['formats'])
                elif format_spec == 'mergeall':
                    def selector_function(ctx):
                        formats = list(_check_formats(ctx['formats']))
                        if not formats:
                            return
                        # Fold all formats into one merged dict, worst first
                        merged_format = formats[-1]
                        for f in formats[-2::-1]:
                            merged_format = _merge((merged_format, f))
                        yield merged_format

                else:
                    # best/worst selectors like b, bv*, ba.2, worstaudio, ...
                    format_fallback, format_reverse, format_idx = False, True, 1
                    mobj = re.match(
                        r'(?P<bw>best|worst|b|w)(?P<type>video|audio|v|a)?(?P<mod>\*)?(?:\.(?P<n>[1-9]\d*))?$',
                        format_spec)
                    if mobj is not None:
                        format_idx = int_or_none(mobj.group('n'), default=1)
                        format_reverse = mobj.group('bw')[0] == 'b'
                        format_type = (mobj.group('type') or [None])[0]
                        not_format_type = {'v': 'a', 'a': 'v'}.get(format_type)
                        format_modified = mobj.group('mod') is not None

                        format_fallback = not format_type and not format_modified  # for b, w
                        _filter_f = (
                            (lambda f: f.get('%scodec' % format_type) != 'none')
                            if format_type and format_modified  # bv*, ba*, wv*, wa*
                            else (lambda f: f.get('%scodec' % not_format_type) == 'none')
                            if format_type  # bv, ba, wv, wa
                            else (lambda f: f.get('vcodec') != 'none' and f.get('acodec') != 'none')
                            if not format_modified  # b, w
                            else lambda f: True)  # b*, w*
                        filter_f = lambda f: _filter_f(f) and (
                            f.get('vcodec') != 'none' or f.get('acodec') != 'none')
                    else:
                        # Not a best/worst selector: match by extension or exact format_id
                        if format_spec in self._format_selection_exts['audio']:
                            filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none'
                        elif format_spec in self._format_selection_exts['video']:
                            filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none' and f.get('vcodec') != 'none'
                        elif format_spec in self._format_selection_exts['storyboards']:
                            filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') == 'none' and f.get('vcodec') == 'none'
                        else:
                            filter_f = lambda f: f.get('format_id') == format_spec  # id

                    def selector_function(ctx):
                        formats = list(ctx['formats'])
                        matches = list(filter(filter_f, formats)) if filter_f is not None else formats
                        if format_fallback and ctx['incomplete_formats'] and not matches:
                            # for extractors with incomplete formats (audio only (soundcloud)
                            # or video only (imgur)) best/worst will fallback to
                            # best/worst {video,audio}-only format
                            matches = formats
                        matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1]))
                        try:
                            yield matches[format_idx - 1]
                        except IndexError:
                            return

            # Apply the [...] filters on a deep copy before running the selector
            filters = [self._build_format_filter(f) for f in selector.filters]

            def final_selector(ctx):
                ctx_copy = copy.deepcopy(ctx)
                for _filter in filters:
                    ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
                return selector_function(ctx_copy)
            return final_selector

        stream = io.BytesIO(format_spec.encode('utf-8'))
        try:
            tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
        except tokenize.TokenError:
            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

        class TokenIterator(object):
            # Iterator over the token list with one-token "push back" support
            # (restore_last_token), needed by the recursive parser
            def __init__(self, tokens):
                self.tokens = tokens
                self.counter = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.counter >= len(self.tokens):
                    raise StopIteration()
                value = self.tokens[self.counter]
                self.counter += 1
                return value

            next = __next__

            def restore_last_token(self):
                self.counter -= 1

        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
        return _build_selector_function(parsed_selector)
2098
2099 def _calc_headers(self, info_dict):
2100 res = std_headers.copy()
2101
2102 add_headers = info_dict.get('http_headers')
2103 if add_headers:
2104 res.update(add_headers)
2105
2106 cookies = self._calc_cookies(info_dict)
2107 if cookies:
2108 res['Cookie'] = cookies
2109
2110 if 'X-Forwarded-For' not in res:
2111 x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
2112 if x_forwarded_for_ip:
2113 res['X-Forwarded-For'] = x_forwarded_for_ip
2114
2115 return res
2116
2117 def _calc_cookies(self, info_dict):
2118 pr = sanitized_Request(info_dict['url'])
2119 self.cookiejar.add_cookie_header(pr)
2120 return pr.get_header('Cookie')
2121
2122 def _sort_thumbnails(self, thumbnails):
2123 thumbnails.sort(key=lambda t: (
2124 t.get('preference') if t.get('preference') is not None else -1,
2125 t.get('width') if t.get('width') is not None else -1,
2126 t.get('height') if t.get('height') is not None else -1,
2127 t.get('id') if t.get('id') is not None else '',
2128 t.get('url')))
2129
2130 def _sanitize_thumbnails(self, info_dict):
2131 thumbnails = info_dict.get('thumbnails')
2132 if thumbnails is None:
2133 thumbnail = info_dict.get('thumbnail')
2134 if thumbnail:
2135 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
2136 if not thumbnails:
2137 return
2138
2139 def check_thumbnails(thumbnails):
2140 for t in thumbnails:
2141 self.to_screen(f'[info] Testing thumbnail {t["id"]}')
2142 try:
2143 self.urlopen(HEADRequest(t['url']))
2144 except network_exceptions as err:
2145 self.to_screen(f'[info] Unable to connect to thumbnail {t["id"]} URL {t["url"]!r} - {err}. Skipping...')
2146 continue
2147 yield t
2148
2149 self._sort_thumbnails(thumbnails)
2150 for i, t in enumerate(thumbnails):
2151 if t.get('id') is None:
2152 t['id'] = '%d' % i
2153 if t.get('width') and t.get('height'):
2154 t['resolution'] = '%dx%d' % (t['width'], t['height'])
2155 t['url'] = sanitize_url(t['url'])
2156
2157 if self.params.get('check_formats') is True:
2158 info_dict['thumbnails'] = LazyList(check_thumbnails(thumbnails[::-1])).reverse()
2159 else:
2160 info_dict['thumbnails'] = thumbnails
2161
    def process_video_result(self, info_dict, download=True):
        """Sanitize a single 'video' extractor result in-place (ids, dates,
        thumbnails, subtitles, formats), run format selection and, when
        *download* is true, trigger processing/download of each selected
        format. Returns the (modified) info_dict."""
        assert info_dict.get('_type', 'video') == 'video'

        if 'id' not in info_dict:
            raise ExtractorError('Missing "id" field in extractor result')
        if 'title' not in info_dict:
            raise ExtractorError('Missing "title" field in extractor result',
                                 video_id=info_dict['id'], ie=info_dict['extractor'])

        def report_force_conversion(field, field_not, conversion):
            self.report_warning(
                '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
                % (field, field_not, conversion))

        def sanitize_string_field(info, string_field):
            # Coerce a mis-typed field to str, warning about the extractor bug
            field = info.get(string_field)
            if field is None or isinstance(field, compat_str):
                return
            report_force_conversion(string_field, 'a string', 'string')
            info[string_field] = compat_str(field)

        def sanitize_numeric_fields(info):
            # Coerce mis-typed numeric fields to int (None if unparseable)
            for numeric_field in self._NUMERIC_FIELDS:
                field = info.get(numeric_field)
                if field is None or isinstance(field, compat_numeric_types):
                    continue
                report_force_conversion(numeric_field, 'numeric', 'int')
                info[numeric_field] = int_or_none(field)

        sanitize_string_field(info_dict, 'id')
        sanitize_numeric_fields(info_dict)

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        self._sanitize_thumbnails(info_dict)

        # Keep 'thumbnail' and 'thumbnails' consistent (last entry is used
        # as the single thumbnail; _sanitize_thumbnails sorted worst-first)
        thumbnail = info_dict.get('thumbnail')
        thumbnails = info_dict.get('thumbnails')
        if thumbnail:
            info_dict['thumbnail'] = sanitize_url(thumbnail)
        elif thumbnails:
            info_dict['thumbnail'] = thumbnails[-1]['url']

        if info_dict.get('display_id') is None and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        if info_dict.get('duration') is not None:
            info_dict['duration_string'] = formatSeconds(info_dict['duration'])

        # Derive the date fields from their timestamp counterparts when missing
        for ts_key, date_key in (
                ('timestamp', 'upload_date'),
                ('release_timestamp', 'release_date'),
        ):
            if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
                # Working around out-of-range timestamp values (e.g. negative ones on Windows,
                # see http://bugs.python.org/issue1646728)
                try:
                    upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
                    info_dict[date_key] = upload_date.strftime('%Y%m%d')
                except (ValueError, OverflowError, OSError):
                    pass

        # Derive live_status from is_live/was_live (and vice versa)
        live_keys = ('is_live', 'was_live')
        live_status = info_dict.get('live_status')
        if live_status is None:
            for key in live_keys:
                if info_dict.get(key) is False:
                    continue
                if info_dict.get(key):
                    live_status = key
                    break
            if all(info_dict.get(key) is False for key in live_keys):
                live_status = 'not_live'
        if live_status:
            info_dict['live_status'] = live_status
            for key in live_keys:
                if info_dict.get(key) is None:
                    info_dict[key] = (live_status == key)

        # Auto generate title fields corresponding to the *_number fields when missing
        # in order to always have clean titles. This is very common for TV series.
        for field in ('chapter', 'season', 'episode'):
            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])

        # Sanitize URLs and fill in extensions for all subtitle formats
        for cc_kind in ('subtitles', 'automatic_captions'):
            cc = info_dict.get(cc_kind)
            if cc:
                for _, subtitle in cc.items():
                    for subtitle_format in subtitle:
                        if subtitle_format.get('url'):
                            subtitle_format['url'] = sanitize_url(subtitle_format['url'])
                        if subtitle_format.get('ext') is None:
                            subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()

        automatic_captions = info_dict.get('automatic_captions')
        subtitles = info_dict.get('subtitles')

        info_dict['requested_subtitles'] = self.process_subtitles(
            info_dict['id'], subtitles, automatic_captions)

        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        # Drop DRM-protected formats unless explicitly allowed
        info_dict['__has_drm'] = any(f.get('has_drm') for f in formats)
        if not self.params.get('allow_unplayable_formats'):
            formats = [f for f in formats if not f.get('has_drm')]

        if not formats:
            self.raise_no_formats(info_dict)

        def is_wellformed(f):
            url = f.get('url')
            if not url:
                self.report_warning(
                    '"url" field is missing or empty - skipping format, '
                    'there is an error in extractor')
                return False
            if isinstance(url, bytes):
                sanitize_string_field(f, 'url')
            return True

        # Filter out malformed formats for better extraction robustness
        formats = list(filter(is_wellformed, formats))

        formats_dict = {}

        # We check that all the formats have the format and format_id fields
        for i, format in enumerate(formats):
            sanitize_string_field(format, 'format_id')
            sanitize_numeric_fields(format)
            format['url'] = sanitize_url(format['url'])
            if not format.get('format_id'):
                format['format_id'] = compat_str(i)
            else:
                # Sanitize format_id from characters used in format selector expression
                format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
            format_id = format['format_id']
            if format_id not in formats_dict:
                formats_dict[format_id] = []
            formats_dict[format_id].append(format)

        # Make sure all formats have unique format_id
        common_exts = set(itertools.chain(*self._format_selection_exts.values()))
        for format_id, ambiguous_formats in formats_dict.items():
            ambigious_id = len(ambiguous_formats) > 1
            for i, format in enumerate(ambiguous_formats):
                if ambigious_id:
                    format['format_id'] = '%s-%d' % (format_id, i)
                if format.get('ext') is None:
                    format['ext'] = determine_ext(format['url']).lower()
                # Ensure there is no conflict between id and ext in format selection
                # See https://github.com/yt-dlp/yt-dlp/issues/1282
                if format['format_id'] != format['ext'] and format['format_id'] in common_exts:
                    format['format_id'] = 'f%s' % format['format_id']

        # Fill in derived per-format fields (description, protocol,
        # resolution, dynamic range, approximate filesize, HTTP headers)
        for i, format in enumerate(formats):
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=format_field(format, 'format_note', ' (%s)'),
                )
            if format.get('protocol') is None:
                format['protocol'] = determine_protocol(format)
            if format.get('resolution') is None:
                format['resolution'] = self.format_resolution(format, default=None)
            if format.get('dynamic_range') is None and format.get('vcodec') != 'none':
                format['dynamic_range'] = 'SDR'
            if (info_dict.get('duration') and format.get('tbr')
                    and not format.get('filesize') and not format.get('filesize_approx')):
                # tbr is in KBit/s; duration * tbr * 1024 / 8 gives bytes
                format['filesize_approx'] = info_dict['duration'] * format['tbr'] * (1024 / 8)

            # Add HTTP headers, so that external programs can use them from the
            # json output
            full_format_info = info_dict.copy()
            full_format_info.update(format)
            format['http_headers'] = self._calc_headers(full_format_info)
        # Remove private housekeeping stuff
        if '__x_forwarded_for_ip' in info_dict:
            del info_dict['__x_forwarded_for_ip']

        # TODO Central sorting goes here

        if self.params.get('check_formats') is True:
            formats = LazyList(self._check_formats(formats[::-1])).reverse()

        if not formats or formats[0] is not info_dict:
            # only set the 'formats' fields if the original info_dict list them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats

        info_dict, _ = self.pre_process(info_dict)

        # The pre-processors may have modified the formats
        formats = info_dict.get('formats', [info_dict])

        if self.params.get('list_thumbnails'):
            self.list_thumbnails(info_dict)
        if self.params.get('listformats'):
            if not info_dict.get('formats') and not info_dict.get('url'):
                self.to_screen('%s has no formats' % info_dict['id'])
            else:
                self.list_formats(info_dict)
        if self.params.get('listsubtitles'):
            if 'automatic_captions' in info_dict:
                self.list_subtitles(
                    info_dict['id'], automatic_captions, 'automatic captions')
            self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
        # NOTE: any listing option implies list-only, unless 'simulate' was
        # explicitly set (is not None)
        list_only = self.params.get('simulate') is None and (
            self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles'))
        if list_only:
            # Without this printing, -F --print-json will not work
            self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True)
            return

        format_selector = self.format_selector
        if format_selector is None:
            req_format = self._default_format_spec(info_dict, download=download)
            self.write_debug('Default format spec: %s' % req_format)
            format_selector = self.build_format_selector(req_format)

        # While in format selection we may need to have an access to the original
        # format set in order to calculate some metrics or do some processing.
        # For now we need to be able to guess whether original formats provided
        # by extractor are incomplete or not (i.e. whether extractor provides only
        # video-only or audio-only formats) for proper formats selection for
        # extractors with such incomplete formats (see
        # https://github.com/ytdl-org/youtube-dl/pull/5556).
        # Since formats may be filtered during format selection and may not match
        # the original formats the results may be incorrect. Thus original formats
        # or pre-calculated metrics should be passed to format selection routines
        # as well.
        # We will pass a context object containing all necessary additional data
        # instead of just formats.
        # This fixes incorrect format selection issue (see
        # https://github.com/ytdl-org/youtube-dl/issues/10083).
        incomplete_formats = (
            # All formats are video-only or
            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
            # all formats are audio-only
            or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))

        ctx = {
            'formats': formats,
            'incomplete_formats': incomplete_formats,
        }

        formats_to_download = list(format_selector(ctx))
        if not formats_to_download:
            if not self.params.get('ignore_no_formats_error'):
                raise ExtractorError('Requested format is not available', expected=True,
                                     video_id=info_dict['id'], ie=info_dict['extractor'])
            else:
                self.report_warning('Requested format is not available')
                # Process what we can, even without any available formats.
                self.process_info(dict(info_dict))
        elif download:
            self.to_screen(
                '[info] %s: Downloading %d format(s): %s' % (
                    info_dict['id'], len(formats_to_download),
                    ", ".join([f['format_id'] for f in formats_to_download])))
            for fmt in formats_to_download:
                new_info = dict(info_dict)
                # Save a reference to the original info_dict so that it can be modified in process_info if needed
                new_info['__original_infodict'] = info_dict
                new_info.update(fmt)
                self.process_info(new_info)
        # We update the info dict with the selected best quality format (backwards compatibility)
        if formats_to_download:
            info_dict.update(formats_to_download[-1])
        return info_dict
2442
    def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
        """Select the requested subtitles and their format.

        Merges normal subtitles and automatic captions (normal subtitles win
        for a language present in both), picks the languages according to
        'allsubtitles'/'subtitleslangs', then picks one format per language
        according to 'subtitlesformat'. Returns a dict lang -> format dict,
        or None when subtitle writing was not requested or nothing is available.
        """
        available_subs = {}
        if normal_subtitles and self.params.get('writesubtitles'):
            available_subs.update(normal_subtitles)
        if automatic_captions and self.params.get('writeautomaticsub'):
            # Automatic captions only fill languages not already covered
            # by normal subtitles
            for lang, cap_info in automatic_captions.items():
                if lang not in available_subs:
                    available_subs[lang] = cap_info

        if (not self.params.get('writesubtitles') and not
                self.params.get('writeautomaticsub') or not
                available_subs):
            return None

        all_sub_langs = available_subs.keys()
        if self.params.get('allsubtitles', False):
            requested_langs = all_sub_langs
        elif self.params.get('subtitleslangs', False):
            # A list is used so that the order of languages will be the same as
            # given in subtitleslangs. See https://github.com/yt-dlp/yt-dlp/issues/1041
            requested_langs = []
            for lang_re in self.params.get('subtitleslangs'):
                if lang_re == 'all':
                    requested_langs.extend(all_sub_langs)
                    continue
                # A leading '-' means: remove the matching languages from
                # the selection built so far
                discard = lang_re[0] == '-'
                if discard:
                    lang_re = lang_re[1:]
                # Each entry is a regex anchored at the end of the language code
                current_langs = filter(re.compile(lang_re + '$').match, all_sub_langs)
                if discard:
                    for lang in current_langs:
                        # Remove every earlier occurrence, not just the first
                        while lang in requested_langs:
                            requested_langs.remove(lang)
                else:
                    requested_langs.extend(current_langs)
            requested_langs = orderedSet(requested_langs)
        elif 'en' in available_subs:
            # Default: prefer English if present...
            requested_langs = ['en']
        else:
            # ...otherwise fall back to the first available language
            requested_langs = [list(all_sub_langs)[0]]
        if requested_langs:
            self.write_debug('Downloading subtitles: %s' % ', '.join(requested_langs))

        formats_query = self.params.get('subtitlesformat', 'best')
        formats_preference = formats_query.split('/') if formats_query else []
        subs = {}
        for lang in requested_langs:
            formats = available_subs.get(lang)
            if formats is None:
                self.report_warning('%s subtitles not available for %s' % (lang, video_id))
                continue
            # Walk the '/'-separated preference list; 'best' takes the last
            # entry of the format list (assumed ordered worst-to-best — the
            # 'best' fallback below relies on the same assumption)
            for ext in formats_preference:
                if ext == 'best':
                    f = formats[-1]
                    break
                matches = list(filter(lambda f: f['ext'] == ext, formats))
                if matches:
                    f = matches[-1]
                    break
            else:
                # No preference matched: fall back to the last format
                f = formats[-1]
                self.report_warning(
                    'No subtitle format found matching "%s" for language %s, '
                    'using %s' % (formats_query, lang, f['ext']))
            subs[lang] = f
        return subs
2510
    def __forced_printings(self, info_dict, filename, incomplete):
        """Print the fields requested via --force* / --print options.

        Works on a shallow copy of info_dict; print order is part of the
        observable behavior (forceprint templates first, then the fixed
        sequence of force* fields, then forcejson).
        """
        def print_mandatory(field, actual_field=None):
            # A "mandatory" field is printed even when missing (raising
            # KeyError) unless the info is known to be incomplete
            if actual_field is None:
                actual_field = field
            if (self.params.get('force%s' % field, False)
                    and (not incomplete or info_dict.get(actual_field) is not None)):
                self.to_stdout(info_dict[actual_field])

        def print_optional(field):
            # Optional fields are silently skipped when absent
            if (self.params.get('force%s' % field, False)
                    and info_dict.get(field) is not None):
                self.to_stdout(info_dict[field])

        info_dict = info_dict.copy()
        if filename is not None:
            info_dict['filename'] = filename
        if info_dict.get('requested_formats') is not None:
            # For RTMP URLs, also include the playpath
            info_dict['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats'])
        elif 'url' in info_dict:
            info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '')

        if self.params.get('forceprint') or self.params.get('forcejson'):
            self.post_extract(info_dict)
        for tmpl in self.params.get('forceprint', []):
            # Bare 'field' becomes '%(field)s'; 'field=' becomes
            # 'field = %(field)s'; anything else is used as-is
            mobj = re.match(r'\w+(=?)$', tmpl)
            if mobj and mobj.group(1):
                tmpl = f'{tmpl[:-1]} = %({tmpl[:-1]})s'
            elif mobj:
                tmpl = '%({})s'.format(tmpl)
            self.to_stdout(self.evaluate_outtmpl(tmpl, info_dict))

        print_mandatory('title')
        print_mandatory('id')
        print_mandatory('url', 'urls')
        print_optional('thumbnail')
        print_optional('description')
        print_optional('filename')
        if self.params.get('forceduration') and info_dict.get('duration') is not None:
            self.to_stdout(formatSeconds(info_dict['duration']))
        print_mandatory('format')

        if self.params.get('forcejson'):
            self.to_stdout(json.dumps(self.sanitize_info(info_dict)))
2555
2556 def dl(self, name, info, subtitle=False, test=False):
2557 if not info.get('url'):
2558 self.raise_no_formats(info, True)
2559
2560 if test:
2561 verbose = self.params.get('verbose')
2562 params = {
2563 'test': True,
2564 'quiet': self.params.get('quiet') or not verbose,
2565 'verbose': verbose,
2566 'noprogress': not verbose,
2567 'nopart': True,
2568 'skip_unavailable_fragments': False,
2569 'keep_fragments': False,
2570 'overwrites': True,
2571 '_no_ytdl_file': True,
2572 }
2573 else:
2574 params = self.params
2575 fd = get_suitable_downloader(info, params, to_stdout=(name == '-'))(self, params)
2576 if not test:
2577 for ph in self._progress_hooks:
2578 fd.add_progress_hook(ph)
2579 urls = '", "'.join([f['url'] for f in info.get('requested_formats', [])] or [info['url']])
2580 self.write_debug('Invoking downloader on "%s"' % urls)
2581
2582 new_info = copy.deepcopy(self._copy_infodict(info))
2583 if new_info.get('http_headers') is None:
2584 new_info['http_headers'] = self._calc_headers(new_info)
2585 return fd.download(name, new_info, subtitle)
2586
    def process_info(self, info_dict):
        """Process a single resolved IE result.

        Writes all requested side files (description, subtitles, thumbnails,
        info-json, annotations, internet shortcuts), downloads the media
        (unless simulating or skipping), queues fixup postprocessors and runs
        post-processing. Raises MaxDownloadsReached when the download limit
        has been hit.
        """

        assert info_dict.get('_type', 'video') == 'video'

        # Bail out before doing any work if the limit was already reached
        max_downloads = self.params.get('max_downloads')
        if max_downloads is not None:
            if self._num_downloads >= int(max_downloads):
                raise MaxDownloadsReached()

        # TODO: backward compatibility, to be removed
        info_dict['fulltitle'] = info_dict['title']

        if 'format' not in info_dict and 'ext' in info_dict:
            info_dict['format'] = info_dict['ext']

        # A non-None return value means the entry should be skipped
        if self._match_entry(info_dict) is not None:
            return

        self.post_extract(info_dict)
        self._num_downloads += 1

        # info_dict['_filename'] needs to be set for backward compatibility
        info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
        temp_filename = self.prepare_filename(info_dict, 'temp')
        # Maps temporary path -> final path (None = delete/ignore)
        files_to_move = {}

        # Forced printings
        self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))

        if self.params.get('simulate'):
            if self.params.get('force_write_download_archive', False):
                self.record_download_archive(info_dict)
            # Do nothing else if in simulate mode
            return

        if full_filename is None:
            return
        if not self._ensure_dir_exists(encodeFilename(full_filename)):
            return
        if not self._ensure_dir_exists(encodeFilename(temp_filename)):
            return

        # The _write_* helpers return None on fatal errors
        if self._write_description('video', info_dict,
                                   self.prepare_filename(info_dict, 'description')) is None:
            return

        sub_files = self._write_subtitles(info_dict, temp_filename)
        if sub_files is None:
            return
        files_to_move.update(dict(sub_files))

        thumb_files = self._write_thumbnails(
            'video', info_dict, temp_filename, self.prepare_filename(info_dict, 'thumbnail'))
        if thumb_files is None:
            return
        files_to_move.update(dict(thumb_files))

        infofn = self.prepare_filename(info_dict, 'infojson')
        _infojson_written = self._write_info_json('video', info_dict, infofn)
        if _infojson_written:
            info_dict['__infojson_filename'] = infofn
        elif _infojson_written is None:
            return

        # Note: Annotations are deprecated
        annofn = None
        if self.params.get('writeannotations', False):
            annofn = self.prepare_filename(info_dict, 'annotation')
        if annofn:
            if not self._ensure_dir_exists(encodeFilename(annofn)):
                return
            if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(annofn)):
                self.to_screen('[info] Video annotations are already present')
            elif not info_dict.get('annotations'):
                self.report_warning('There are no annotations to write.')
            else:
                try:
                    self.to_screen('[info] Writing video annotations to: ' + annofn)
                    with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
                        annofile.write(info_dict['annotations'])
                except (KeyError, TypeError):
                    self.report_warning('There are no annotations to write.')
                except (OSError, IOError):
                    self.report_error('Cannot write annotations file: ' + annofn)
                    return

        # Write internet shortcut files
        def _write_link_file(link_type):
            # Returns False on failure; True on success or when skipped
            if 'webpage_url' not in info_dict:
                self.report_error('Cannot write internet shortcut file because the "webpage_url" field is missing in the media information')
                return False
            linkfn = replace_extension(self.prepare_filename(info_dict, 'link'), link_type, info_dict.get('ext'))
            if self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)):
                self.to_screen(f'[info] Internet shortcut (.{link_type}) is already present')
                return True
            try:
                self.to_screen(f'[info] Writing internet shortcut (.{link_type}) to: {linkfn}')
                # .url files require CRLF line endings
                with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8',
                             newline='\r\n' if link_type == 'url' else '\n') as linkfile:
                    template_vars = {'url': iri_to_uri(info_dict['webpage_url'])}
                    if link_type == 'desktop':
                        template_vars['filename'] = linkfn[:-(len(link_type) + 1)]
                    linkfile.write(LINK_TEMPLATES[link_type] % template_vars)
            except (OSError, IOError):
                self.report_error(f'Cannot write internet shortcut {linkfn}')
                return False
            return True

        write_links = {
            'url': self.params.get('writeurllink'),
            'webloc': self.params.get('writewebloclink'),
            'desktop': self.params.get('writedesktoplink'),
        }
        if self.params.get('writelink'):
            # Pick the platform-native shortcut type
            link_type = ('webloc' if sys.platform == 'darwin'
                         else 'desktop' if sys.platform.startswith('linux')
                         else 'url')
            write_links[link_type] = True

        if any(should_write and not _write_link_file(link_type)
               for link_type, should_write in write_links.items()):
            return

        try:
            info_dict, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
        except PostProcessingError as err:
            self.report_error('Preprocessing: %s' % str(err))
            return

        must_record_download_archive = False
        if self.params.get('skip_download', False):
            info_dict['filepath'] = temp_filename
            info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
            info_dict['__files_to_move'] = files_to_move
            info_dict = self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict)
        else:
            # Download
            info_dict.setdefault('__postprocessors', [])
            try:

                def existing_file(*filepaths):
                    # Return an already-downloaded file to reuse, or None
                    # after deleting whatever exists (when overwriting)
                    ext = info_dict.get('ext')
                    final_ext = self.params.get('final_ext', ext)
                    existing_files = []
                    for file in orderedSet(filepaths):
                        if final_ext != ext:
                            # A postprocessor may already have converted the file
                            converted = replace_extension(file, final_ext, ext)
                            if os.path.exists(encodeFilename(converted)):
                                existing_files.append(converted)
                        if os.path.exists(encodeFilename(file)):
                            existing_files.append(file)

                    if not existing_files or self.params.get('overwrites', False):
                        for file in orderedSet(existing_files):
                            self.report_file_delete(file)
                            os.remove(encodeFilename(file))
                        return None

                    info_dict['ext'] = os.path.splitext(existing_files[0])[1][1:]
                    return existing_files[0]

                success = True
                if info_dict.get('requested_formats') is not None:

                    def compatible_formats(formats):
                        # TODO: some formats actually allow this (mkv, webm, ogg, mp4), but not all of them.
                        video_formats = [format for format in formats if format.get('vcodec') != 'none']
                        audio_formats = [format for format in formats if format.get('acodec') != 'none']
                        if len(video_formats) > 2 or len(audio_formats) > 2:
                            return False

                        # Check extension
                        exts = set(format.get('ext') for format in formats)
                        COMPATIBLE_EXTS = (
                            set(('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma')),
                            set(('webm',)),
                        )
                        for ext_sets in COMPATIBLE_EXTS:
                            if ext_sets.issuperset(exts):
                                return True
                        # TODO: Check acodec/vcodec
                        return False

                    requested_formats = info_dict['requested_formats']
                    old_ext = info_dict['ext']
                    if self.params.get('merge_output_format') is None:
                        if not compatible_formats(requested_formats):
                            info_dict['ext'] = 'mkv'
                            self.report_warning(
                                'Requested formats are incompatible for merge and will be merged into mkv')
                        if (info_dict['ext'] == 'webm'
                                and info_dict.get('thumbnails')
                                # check with type instead of pp_key, __name__, or isinstance
                                # since we dont want any custom PPs to trigger this
                                and any(type(pp) == EmbedThumbnailPP for pp in self._pps['post_process'])):
                            info_dict['ext'] = 'mkv'
                            self.report_warning(
                                'webm doesn\'t support embedding a thumbnail, mkv will be used')
                    new_ext = info_dict['ext']

                    def correct_ext(filename, ext=new_ext):
                        if filename == '-':
                            return filename
                        filename_real_ext = os.path.splitext(filename)[1][1:]
                        filename_wo_ext = (
                            os.path.splitext(filename)[0]
                            if filename_real_ext in (old_ext, new_ext)
                            else filename)
                        return '%s.%s' % (filename_wo_ext, ext)

                    # Ensure filename always has a correct extension for successful merge
                    full_filename = correct_ext(full_filename)
                    temp_filename = correct_ext(temp_filename)
                    dl_filename = existing_file(full_filename, temp_filename)
                    info_dict['__real_download'] = False

                    if dl_filename is not None:
                        self.report_file_already_downloaded(dl_filename)
                    elif get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-'):
                        # A single downloader can handle all the formats at once
                        info_dict['url'] = '\n'.join(f['url'] for f in requested_formats)
                        success, real_download = self.dl(temp_filename, info_dict)
                        info_dict['__real_download'] = real_download
                    else:
                        # Download each format separately and merge afterwards
                        downloaded = []
                        merger = FFmpegMergerPP(self)
                        if self.params.get('allow_unplayable_formats'):
                            self.report_warning(
                                'You have requested merging of multiple formats '
                                'while also allowing unplayable formats to be downloaded. '
                                'The formats won\'t be merged to prevent data corruption.')
                        elif not merger.available:
                            self.report_warning(
                                'You have requested merging of multiple formats but ffmpeg is not installed. '
                                'The formats won\'t be merged.')

                        if temp_filename == '-':
                            reason = ('using a downloader other than ffmpeg' if FFmpegFD.can_merge_formats(info_dict)
                                      else 'but the formats are incompatible for simultaneous download' if merger.available
                                      else 'but ffmpeg is not installed')
                            self.report_warning(
                                f'You have requested downloading multiple formats to stdout {reason}. '
                                'The formats will be streamed one after the other')
                        fname = temp_filename
                        for f in requested_formats:
                            new_info = dict(info_dict)
                            del new_info['requested_formats']
                            new_info.update(f)
                            if temp_filename != '-':
                                # Each format gets a 'f<format_id>' infix
                                fname = prepend_extension(
                                    correct_ext(temp_filename, new_info['ext']),
                                    'f%s' % f['format_id'], new_info['ext'])
                                if not self._ensure_dir_exists(fname):
                                    return
                                f['filepath'] = fname
                                downloaded.append(fname)
                            partial_success, real_download = self.dl(fname, new_info)
                            info_dict['__real_download'] = info_dict['__real_download'] or real_download
                            success = success and partial_success
                        if merger.available and not self.params.get('allow_unplayable_formats'):
                            info_dict['__postprocessors'].append(merger)
                            info_dict['__files_to_merge'] = downloaded
                            # Even if there were no downloads, it is being merged only now
                            info_dict['__real_download'] = True
                        else:
                            for file in downloaded:
                                files_to_move[file] = None
                else:
                    # Just a single file
                    dl_filename = existing_file(full_filename, temp_filename)
                    if dl_filename is None or dl_filename == temp_filename:
                        # dl_filename == temp_filename could mean that the file was partially downloaded with --no-part.
                        # So we should try to resume the download
                        success, real_download = self.dl(temp_filename, info_dict)
                        info_dict['__real_download'] = real_download
                    else:
                        self.report_file_already_downloaded(dl_filename)

                dl_filename = dl_filename or temp_filename
                info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))

            except network_exceptions as err:
                self.report_error('unable to download video data: %s' % error_to_compat_str(err))
                return
            except (OSError, IOError) as err:
                raise UnavailableVideoError(err)
            except (ContentTooShortError, ) as err:
                self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                return

            if success and full_filename != '-':

                def fixup():
                    # Queue ffmpeg-based fixup postprocessors (or just warn,
                    # depending on the 'fixup' policy)
                    do_fixup = True
                    fixup_policy = self.params.get('fixup')
                    vid = info_dict['id']

                    if fixup_policy in ('ignore', 'never'):
                        return
                    elif fixup_policy == 'warn':
                        do_fixup = False
                    elif fixup_policy != 'force':
                        assert fixup_policy in ('detect_or_warn', None)
                        if not info_dict.get('__real_download'):
                            do_fixup = False

                    def ffmpeg_fixup(cndn, msg, cls):
                        if not cndn:
                            return
                        if not do_fixup:
                            self.report_warning(f'{vid}: {msg}')
                            return
                        pp = cls(self)
                        if pp.available:
                            info_dict['__postprocessors'].append(pp)
                        else:
                            self.report_warning(f'{vid}: {msg}. Install ffmpeg to fix this automatically')

                    stretched_ratio = info_dict.get('stretched_ratio')
                    ffmpeg_fixup(
                        stretched_ratio not in (1, None),
                        f'Non-uniform pixel ratio {stretched_ratio}',
                        FFmpegFixupStretchedPP)

                    ffmpeg_fixup(
                        (info_dict.get('requested_formats') is None
                         and info_dict.get('container') == 'm4a_dash'
                         and info_dict.get('ext') == 'm4a'),
                        'writing DASH m4a. Only some players support this container',
                        FFmpegFixupM4aPP)

                    downloader = get_suitable_downloader(info_dict, self.params) if 'protocol' in info_dict else None
                    downloader = downloader.__name__ if downloader else None
                    ffmpeg_fixup(info_dict.get('requested_formats') is None and downloader == 'HlsFD',
                                 'malformed AAC bitstream detected', FFmpegFixupM3u8PP)
                    ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed timestamps detected', FFmpegFixupTimestampPP)
                    ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed duration detected', FFmpegFixupDurationPP)

                fixup()
                try:
                    info_dict = self.post_process(dl_filename, info_dict, files_to_move)
                except PostProcessingError as err:
                    self.report_error('Postprocessing: %s' % str(err))
                    return
                try:
                    for ph in self._post_hooks:
                        ph(info_dict['filepath'])
                except Exception as err:
                    self.report_error('post hooks: %s' % str(err))
                    return
                must_record_download_archive = True

        if must_record_download_archive or self.params.get('force_write_download_archive', False):
            self.record_download_archive(info_dict)
        # Re-check the limit after this download has been counted
        max_downloads = self.params.get('max_downloads')
        if max_downloads is not None and self._num_downloads >= int(max_downloads):
            raise MaxDownloadsReached()
2944
2945 def __download_wrapper(self, func):
2946 @functools.wraps(func)
2947 def wrapper(*args, **kwargs):
2948 try:
2949 res = func(*args, **kwargs)
2950 except UnavailableVideoError as e:
2951 self.report_error(e)
2952 except DownloadCancelled as e:
2953 self.to_screen(f'[info] {e}')
2954 raise
2955 else:
2956 if self.params.get('dump_single_json', False):
2957 self.post_extract(res)
2958 self.to_stdout(json.dumps(self.sanitize_info(res)))
2959 return wrapper
2960
2961 def download(self, url_list):
2962 """Download a given list of URLs."""
2963 url_list = variadic(url_list) # Passing a single URL is a common mistake
2964 outtmpl = self.outtmpl_dict['default']
2965 if (len(url_list) > 1
2966 and outtmpl != '-'
2967 and '%' not in outtmpl
2968 and self.params.get('max_downloads') != 1):
2969 raise SameFileError(outtmpl)
2970
2971 for url in url_list:
2972 self.__download_wrapper(self.extract_info)(
2973 url, force_generic_extractor=self.params.get('force_generic_extractor', False))
2974
2975 return self._download_retcode
2976
2977 def download_with_info_file(self, info_filename):
2978 with contextlib.closing(fileinput.FileInput(
2979 [info_filename], mode='r',
2980 openhook=fileinput.hook_encoded('utf-8'))) as f:
2981 # FileInput doesn't have a read method, we can't call json.load
2982 info = self.sanitize_info(json.loads('\n'.join(f)), self.params.get('clean_infojson', True))
2983 try:
2984 self.__download_wrapper(self.process_ie_result)(info, download=True)
2985 except (DownloadError, EntryNotInPlaylist, ThrottledDownload) as e:
2986 self.to_stderr('\r')
2987 webpage_url = info.get('webpage_url')
2988 if webpage_url is not None:
2989 self.report_warning(f'The info failed to download: {e}; trying with URL {webpage_url}')
2990 return self.download([webpage_url])
2991 else:
2992 raise
2993 return self._download_retcode
2994
2995 @staticmethod
2996 def sanitize_info(info_dict, remove_private_keys=False):
2997 ''' Sanitize the infodict for converting to json '''
2998 if info_dict is None:
2999 return info_dict
3000 info_dict.setdefault('epoch', int(time.time()))
3001 remove_keys = {'__original_infodict'} # Always remove this since this may contain a copy of the entire dict
3002 keep_keys = ['_type'] # Always keep this to facilitate load-info-json
3003 if remove_private_keys:
3004 remove_keys |= {
3005 'requested_formats', 'requested_subtitles', 'requested_entries',
3006 'filepath', 'entries', 'original_url', 'playlist_autonumber',
3007 }
3008 empty_values = (None, {}, [], set(), tuple())
3009 reject = lambda k, v: k not in keep_keys and (
3010 k.startswith('_') or k in remove_keys or v in empty_values)
3011 else:
3012 reject = lambda k, v: k in remove_keys
3013 filter_fn = lambda obj: (
3014 list(map(filter_fn, obj)) if isinstance(obj, (LazyList, list, tuple, set))
3015 else obj if not isinstance(obj, dict)
3016 else dict((k, filter_fn(v)) for k, v in obj.items() if not reject(k, v)))
3017 return filter_fn(info_dict)
3018
3019 @staticmethod
3020 def filter_requested_info(info_dict, actually_filter=True):
3021 ''' Alias of sanitize_info for backward compatibility '''
3022 return YoutubeDL.sanitize_info(info_dict, actually_filter)
3023
3024 def run_pp(self, pp, infodict):
3025 files_to_delete = []
3026 if '__files_to_move' not in infodict:
3027 infodict['__files_to_move'] = {}
3028 try:
3029 files_to_delete, infodict = pp.run(infodict)
3030 except PostProcessingError as e:
3031 # Must be True and not 'only_download'
3032 if self.params.get('ignoreerrors') is True:
3033 self.report_error(e)
3034 return infodict
3035 raise
3036
3037 if not files_to_delete:
3038 return infodict
3039 if self.params.get('keepvideo', False):
3040 for f in files_to_delete:
3041 infodict['__files_to_move'].setdefault(f, '')
3042 else:
3043 for old_filename in set(files_to_delete):
3044 self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
3045 try:
3046 os.remove(encodeFilename(old_filename))
3047 except (IOError, OSError):
3048 self.report_warning('Unable to remove downloaded original file')
3049 if old_filename in infodict['__files_to_move']:
3050 del infodict['__files_to_move'][old_filename]
3051 return infodict
3052
3053 @staticmethod
3054 def post_extract(info_dict):
3055 def actual_post_extract(info_dict):
3056 if info_dict.get('_type') in ('playlist', 'multi_video'):
3057 for video_dict in info_dict.get('entries', {}):
3058 actual_post_extract(video_dict or {})
3059 return
3060
3061 post_extractor = info_dict.get('__post_extractor') or (lambda: {})
3062 extra = post_extractor().items()
3063 info_dict.update(extra)
3064 info_dict.pop('__post_extractor', None)
3065
3066 original_infodict = info_dict.get('__original_infodict') or {}
3067 original_infodict.update(extra)
3068 original_infodict.pop('__post_extractor', None)
3069
3070 actual_post_extract(info_dict or {})
3071
3072 def pre_process(self, ie_info, key='pre_process', files_to_move=None):
3073 info = dict(ie_info)
3074 info['__files_to_move'] = files_to_move or {}
3075 for pp in self._pps[key]:
3076 info = self.run_pp(pp, info)
3077 return info, info.pop('__files_to_move', None)
3078
3079 def post_process(self, filename, ie_info, files_to_move=None):
3080 """Run all the postprocessors on the given file."""
3081 info = dict(ie_info)
3082 info['filepath'] = filename
3083 info['__files_to_move'] = files_to_move or {}
3084
3085 for pp in ie_info.get('__postprocessors', []) + self._pps['post_process']:
3086 info = self.run_pp(pp, info)
3087 info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
3088 del info['__files_to_move']
3089 for pp in self._pps['after_move']:
3090 info = self.run_pp(pp, info)
3091 return info
3092
3093 def _make_archive_id(self, info_dict):
3094 video_id = info_dict.get('id')
3095 if not video_id:
3096 return
3097 # Future-proof against any change in case
3098 # and backwards compatibility with prior versions
3099 extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist
3100 if extractor is None:
3101 url = str_or_none(info_dict.get('url'))
3102 if not url:
3103 return
3104 # Try to find matching extractor for the URL and take its ie_key
3105 for ie_key, ie in self._ies.items():
3106 if ie.suitable(url):
3107 extractor = ie_key
3108 break
3109 else:
3110 return
3111 return '%s %s' % (extractor.lower(), video_id)
3112
3113 def in_download_archive(self, info_dict):
3114 fn = self.params.get('download_archive')
3115 if fn is None:
3116 return False
3117
3118 vid_id = self._make_archive_id(info_dict)
3119 if not vid_id:
3120 return False # Incomplete video information
3121
3122 return vid_id in self.archive
3123
3124 def record_download_archive(self, info_dict):
3125 fn = self.params.get('download_archive')
3126 if fn is None:
3127 return
3128 vid_id = self._make_archive_id(info_dict)
3129 assert vid_id
3130 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
3131 archive_file.write(vid_id + '\n')
3132 self.archive.add(vid_id)
3133
3134 @staticmethod
3135 def format_resolution(format, default='unknown'):
3136 is_images = format.get('vcodec') == 'none' and format.get('acodec') == 'none'
3137 if format.get('vcodec') == 'none' and format.get('acodec') != 'none':
3138 return 'audio only'
3139 if format.get('resolution') is not None:
3140 return format['resolution']
3141 if format.get('width') and format.get('height'):
3142 res = '%dx%d' % (format['width'], format['height'])
3143 elif format.get('height'):
3144 res = '%sp' % format['height']
3145 elif format.get('width'):
3146 res = '%dx?' % format['width']
3147 elif is_images:
3148 return 'images'
3149 else:
3150 return default
3151 return f'{res} images' if is_images else res
3152
    def _format_note(self, fdict):
        """Return a short human-readable note describing a format dict.

        Used by the old-style (non-table) list_formats output. Fields are
        appended in a fixed order, comma-separated once *res* is non-empty.
        """
        res = ''
        if fdict.get('ext') in ['f4f', 'f4m']:
            res += '(unsupported) '
        if fdict.get('language'):
            if res:
                res += ' '
            res += '[%s] ' % fdict['language']
        if fdict.get('format_note') is not None:
            res += fdict['format_note'] + ' '
        if fdict.get('tbr') is not None:
            res += '%4dk ' % fdict['tbr']
        if fdict.get('container') is not None:
            if res:
                res += ', '
            res += '%s container' % fdict['container']
        if (fdict.get('vcodec') is not None
                and fdict.get('vcodec') != 'none'):
            if res:
                res += ', '
            res += fdict['vcodec']
            if fdict.get('vbr') is not None:
                # '@' joins the codec with the bitrate appended below
                res += '@'
        elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
            # Bitrates known but codec unknown
            res += 'video@'
        if fdict.get('vbr') is not None:
            res += '%4dk' % fdict['vbr']
        if fdict.get('fps') is not None:
            if res:
                res += ', '
            res += '%sfps' % fdict['fps']
        if fdict.get('acodec') is not None:
            if res:
                res += ', '
            if fdict['acodec'] == 'none':
                res += 'video only'
            else:
                res += '%-5s' % fdict['acodec']
        elif fdict.get('abr') is not None:
            if res:
                res += ', '
            res += 'audio'
        if fdict.get('abr') is not None:
            res += '@%3dk' % fdict['abr']
        if fdict.get('asr') is not None:
            res += ' (%5dHz)' % fdict['asr']
        if fdict.get('filesize') is not None:
            if res:
                res += ', '
            res += format_bytes(fdict['filesize'])
        elif fdict.get('filesize_approx') is not None:
            if res:
                res += ', '
            # '~' marks an estimated size
            res += '~' + format_bytes(fdict['filesize_approx'])
        return res
3208
3209 def _list_format_headers(self, *headers):
3210 if self.params.get('listformats_table', True) is not False:
3211 return [self._format_screen(header, self.Styles.HEADERS) for header in headers]
3212 return headers
3213
    def list_formats(self, info_dict):
        """Print a table of the available formats for the video.

        Uses the new multi-column table unless 'listformats_table' is
        explicitly False, in which case the legacy 4-column layout is used.
        Formats with preference < -1000 are hidden in both layouts.
        """
        # A bare info_dict without 'formats' is treated as a single format
        formats = info_dict.get('formats', [info_dict])
        new_format = self.params.get('listformats_table', True) is not False
        if new_format:
            # Column widths for the bitrate fields are sized to the largest value
            tbr_digits = number_of_digits(max(f.get('tbr') or 0 for f in formats))
            vbr_digits = number_of_digits(max(f.get('vbr') or 0 for f in formats))
            abr_digits = number_of_digits(max(f.get('abr') or 0 for f in formats))
            # Vertical bar delimiter, with ASCII '|' fallback
            delim = self._format_screen('\u2502', self.Styles.DELIM, '|', test_encoding=True)
            table = [
                [
                    self._format_screen(format_field(f, 'format_id'), self.Styles.ID),
                    format_field(f, 'ext'),
                    self.format_resolution(f),
                    format_field(f, 'fps', '%3d'),
                    format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''),
                    delim,
                    format_field(f, 'filesize', ' %s', func=format_bytes) + format_field(f, 'filesize_approx', '~%s', func=format_bytes),
                    format_field(f, 'tbr', f'%{tbr_digits}dk'),
                    shorten_protocol_name(f.get('protocol', '').replace("native", "n")),
                    delim,
                    format_field(f, 'vcodec', default='unknown').replace('none', ''),
                    format_field(f, 'vbr', f'%{vbr_digits}dk'),
                    format_field(f, 'acodec', default='unknown').replace('none', ''),
                    format_field(f, 'abr', f'%{abr_digits}dk'),
                    format_field(f, 'asr', '%5dHz'),
                    join_nonempty(
                        self._format_screen('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None,
                        format_field(f, 'language', '[%s]'),
                        format_field(f, 'format_note'),
                        format_field(f, 'container', ignore=(None, f.get('ext'))),
                        delim=', '),
                ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
            header_line = self._list_format_headers(
                'ID', 'EXT', 'RESOLUTION', 'FPS', 'HDR', delim, ' FILESIZE', ' TBR', 'PROTO',
                delim, 'VCODEC', ' VBR', 'ACODEC', ' ABR', ' ASR', 'MORE INFO')
        else:
            # Legacy layout: id / ext / resolution / free-form note
            table = [
                [
                    format_field(f, 'format_id'),
                    format_field(f, 'ext'),
                    self.format_resolution(f),
                    self._format_note(f)]
                for f in formats
                if f.get('preference') is None or f['preference'] >= -1000]
            header_line = ['format code', 'extension', 'resolution', 'note']

        self.to_screen(
            '[info] Available formats for %s:' % info_dict['id'])
        self.to_stdout(render_table(
            header_line, table,
            extraGap=(0 if new_format else 1),
            hideEmpty=new_format,
            delim=new_format and self._format_screen('\u2500', self.Styles.DELIM, '-', test_encoding=True)))
3267
3268 def list_thumbnails(self, info_dict):
3269 thumbnails = list(info_dict.get('thumbnails'))
3270 if not thumbnails:
3271 self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
3272 return
3273
3274 self.to_screen(
3275 '[info] Thumbnails for %s:' % info_dict['id'])
3276 self.to_stdout(render_table(
3277 self._list_format_headers('ID', 'Width', 'Height', 'URL'),
3278 [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
3279
3280 def list_subtitles(self, video_id, subtitles, name='subtitles'):
3281 if not subtitles:
3282 self.to_screen('%s has no %s' % (video_id, name))
3283 return
3284 self.to_screen(
3285 'Available %s for %s:' % (name, video_id))
3286
3287 def _row(lang, formats):
3288 exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats)))
3289 if len(set(names)) == 1:
3290 names = [] if names[0] == 'unknown' else names[:1]
3291 return [lang, ', '.join(names), ', '.join(exts)]
3292
3293 self.to_stdout(render_table(
3294 self._list_format_headers('Language', 'Name', 'Formats'),
3295 [_row(lang, formats) for lang, formats in subtitles.items()],
3296 hideEmpty=True))
3297
3298 def urlopen(self, req):
3299 """ Start an HTTP download """
3300 if isinstance(req, compat_basestring):
3301 req = sanitized_Request(req)
3302 return self._opener.open(req, timeout=self._socket_timeout)
3303
3304 def print_debug_header(self):
3305 if not self.params.get('verbose'):
3306 return
3307
3308 def get_encoding(stream):
3309 ret = getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__)
3310 if not supports_terminal_sequences(stream):
3311 ret += ' (No ANSI)'
3312 return ret
3313
3314 encoding_str = 'Encodings: locale %s, fs %s, out %s, err %s, pref %s' % (
3315 locale.getpreferredencoding(),
3316 sys.getfilesystemencoding(),
3317 get_encoding(self._screen_file), get_encoding(self._err_file),
3318 self.get_encoding())
3319
3320 logger = self.params.get('logger')
3321 if logger:
3322 write_debug = lambda msg: logger.debug(f'[debug] {msg}')
3323 write_debug(encoding_str)
3324 else:
3325 write_string(f'[debug] {encoding_str}\n', encoding=None)
3326 write_debug = lambda msg: self._write_string(f'[debug] {msg}\n')
3327
3328 source = detect_variant()
3329 write_debug('yt-dlp version %s%s' % (__version__, '' if source == 'unknown' else f' ({source})'))
3330 if not _LAZY_LOADER:
3331 if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'):
3332 write_debug('Lazy loading extractors is forcibly disabled')
3333 else:
3334 write_debug('Lazy loading extractors is disabled')
3335 if plugin_extractors or plugin_postprocessors:
3336 write_debug('Plugins: %s' % [
3337 '%s%s' % (klass.__name__, '' if klass.__name__ == name else f' as {name}')
3338 for name, klass in itertools.chain(plugin_extractors.items(), plugin_postprocessors.items())])
3339 if self.params.get('compat_opts'):
3340 write_debug('Compatibility options: %s' % ', '.join(self.params.get('compat_opts')))
3341 try:
3342 sp = Popen(
3343 ['git', 'rev-parse', '--short', 'HEAD'],
3344 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
3345 cwd=os.path.dirname(os.path.abspath(__file__)))
3346 out, err = sp.communicate_or_kill()
3347 out = out.decode().strip()
3348 if re.match('[0-9a-f]+', out):
3349 write_debug('Git HEAD: %s' % out)
3350 except Exception:
3351 try:
3352 sys.exc_clear()
3353 except Exception:
3354 pass
3355
3356 def python_implementation():
3357 impl_name = platform.python_implementation()
3358 if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'):
3359 return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3]
3360 return impl_name
3361
3362 write_debug('Python version %s (%s %s) - %s' % (
3363 platform.python_version(),
3364 python_implementation(),
3365 platform.architecture()[0],
3366 platform_name()))
3367
3368 exe_versions, ffmpeg_features = FFmpegPostProcessor.get_versions_and_features(self)
3369 ffmpeg_features = {key for key, val in ffmpeg_features.items() if val}
3370 if ffmpeg_features:
3371 exe_versions['ffmpeg'] += ' (%s)' % ','.join(ffmpeg_features)
3372
3373 exe_versions['rtmpdump'] = rtmpdump_version()
3374 exe_versions['phantomjs'] = PhantomJSwrapper._version()
3375 exe_str = ', '.join(
3376 f'{exe} {v}' for exe, v in sorted(exe_versions.items()) if v
3377 ) or 'none'
3378 write_debug('exe versions: %s' % exe_str)
3379
3380 from .downloader.websocket import has_websockets
3381 from .postprocessor.embedthumbnail import has_mutagen
3382 from .cookies import SQLITE_AVAILABLE, KEYRING_AVAILABLE
3383
3384 lib_str = join_nonempty(
3385 compat_pycrypto_AES and compat_pycrypto_AES.__name__.split('.')[0],
3386 KEYRING_AVAILABLE and 'keyring',
3387 has_mutagen and 'mutagen',
3388 SQLITE_AVAILABLE and 'sqlite',
3389 has_websockets and 'websockets',
3390 delim=', ') or 'none'
3391 write_debug('Optional libraries: %s' % lib_str)
3392
3393 proxy_map = {}
3394 for handler in self._opener.handlers:
3395 if hasattr(handler, 'proxies'):
3396 proxy_map.update(handler.proxies)
3397 write_debug(f'Proxy map: {proxy_map}')
3398
3399 # Not implemented
3400 if False and self.params.get('call_home'):
3401 ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
3402 write_debug('Public IP address: %s' % ipaddr)
3403 latest_version = self.urlopen(
3404 'https://yt-dl.org/latest/version').read().decode('utf-8')
3405 if version_tuple(latest_version) > version_tuple(__version__):
3406 self.report_warning(
3407 'You are using an outdated version (newest version: %s)! '
3408 'See https://yt-dl.org/update if you need help updating.' %
3409 latest_version)
3410
3411 def _setup_opener(self):
3412 timeout_val = self.params.get('socket_timeout')
3413 self._socket_timeout = 20 if timeout_val is None else float(timeout_val)
3414
3415 opts_cookiesfrombrowser = self.params.get('cookiesfrombrowser')
3416 opts_cookiefile = self.params.get('cookiefile')
3417 opts_proxy = self.params.get('proxy')
3418
3419 self.cookiejar = load_cookies(opts_cookiefile, opts_cookiesfrombrowser, self)
3420
3421 cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
3422 if opts_proxy is not None:
3423 if opts_proxy == '':
3424 proxies = {}
3425 else:
3426 proxies = {'http': opts_proxy, 'https': opts_proxy}
3427 else:
3428 proxies = compat_urllib_request.getproxies()
3429 # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805)
3430 if 'http' in proxies and 'https' not in proxies:
3431 proxies['https'] = proxies['http']
3432 proxy_handler = PerRequestProxyHandler(proxies)
3433
3434 debuglevel = 1 if self.params.get('debug_printtraffic') else 0
3435 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
3436 ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
3437 redirect_handler = YoutubeDLRedirectHandler()
3438 data_handler = compat_urllib_request_DataHandler()
3439
3440 # When passing our own FileHandler instance, build_opener won't add the
3441 # default FileHandler and allows us to disable the file protocol, which
3442 # can be used for malicious purposes (see
3443 # https://github.com/ytdl-org/youtube-dl/issues/8227)
3444 file_handler = compat_urllib_request.FileHandler()
3445
3446 def file_open(*args, **kwargs):
3447 raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in yt-dlp for security reasons')
3448 file_handler.file_open = file_open
3449
3450 opener = compat_urllib_request.build_opener(
3451 proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)
3452
3453 # Delete the default user-agent header, which would otherwise apply in
3454 # cases where our custom HTTP handler doesn't come into play
3455 # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
3456 opener.addheaders = []
3457 self._opener = opener
3458
3459 def encode(self, s):
3460 if isinstance(s, bytes):
3461 return s # Already encoded
3462
3463 try:
3464 return s.encode(self.get_encoding())
3465 except UnicodeEncodeError as err:
3466 err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
3467 raise
3468
3469 def get_encoding(self):
3470 encoding = self.params.get('encoding')
3471 if encoding is None:
3472 encoding = preferredencoding()
3473 return encoding
3474
3475 def _write_info_json(self, label, ie_result, infofn):
3476 ''' Write infojson and returns True = written, False = skip, None = error '''
3477 if not self.params.get('writeinfojson'):
3478 return False
3479 elif not infofn:
3480 self.write_debug(f'Skipping writing {label} infojson')
3481 return False
3482 elif not self._ensure_dir_exists(infofn):
3483 return None
3484 elif not self.params.get('overwrites', True) and os.path.exists(infofn):
3485 self.to_screen(f'[info] {label.title()} metadata is already present')
3486 else:
3487 self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}')
3488 try:
3489 write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn)
3490 except (OSError, IOError):
3491 self.report_error(f'Cannot write {label} metadata to JSON file {infofn}')
3492 return None
3493 return True
3494
3495 def _write_description(self, label, ie_result, descfn):
3496 ''' Write description and returns True = written, False = skip, None = error '''
3497 if not self.params.get('writedescription'):
3498 return False
3499 elif not descfn:
3500 self.write_debug(f'Skipping writing {label} description')
3501 return False
3502 elif not self._ensure_dir_exists(descfn):
3503 return None
3504 elif not self.params.get('overwrites', True) and os.path.exists(descfn):
3505 self.to_screen(f'[info] {label.title()} description is already present')
3506 elif ie_result.get('description') is None:
3507 self.report_warning(f'There\'s no {label} description to write')
3508 return False
3509 else:
3510 try:
3511 self.to_screen(f'[info] Writing {label} description to: {descfn}')
3512 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
3513 descfile.write(ie_result['description'])
3514 except (OSError, IOError):
3515 self.report_error(f'Cannot write {label} description file {descfn}')
3516 return None
3517 return True
3518
    def _write_subtitles(self, info_dict, filename):
        ''' Write subtitles to file and return list of (sub_filename, final_sub_filename); or None if error'''
        ret = []
        subtitles = info_dict.get('requested_subtitles')
        if not subtitles or not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')):
            # subtitles download errors are already managed as troubles in relevant IE
            # that way it will silently go on when used with unsupporting IE
            return ret

        sub_filename_base = self.prepare_filename(info_dict, 'subtitle')
        if not sub_filename_base:
            self.to_screen('[info] Skipping writing video subtitles')
            return ret
        for sub_lang, sub_info in subtitles.items():
            sub_format = sub_info['ext']
            # 'filename' yields the working path, 'sub_filename_base' the final one
            sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
            sub_filename_final = subtitles_filename(sub_filename_base, sub_lang, sub_format, info_dict.get('ext'))
            if not self.params.get('overwrites', True) and os.path.exists(sub_filename):
                # Existing file with overwrites disabled is treated as written
                self.to_screen(f'[info] Video subtitle {sub_lang}.{sub_format} is already present')
                sub_info['filepath'] = sub_filename
                ret.append((sub_filename, sub_filename_final))
                continue

            self.to_screen(f'[info] Writing video subtitles to: {sub_filename}')
            if sub_info.get('data') is not None:
                # Subtitle content was delivered inline by the extractor
                try:
                    # Use newline='' to prevent conversion of newline characters
                    # See https://github.com/ytdl-org/youtube-dl/issues/10268
                    with io.open(sub_filename, 'w', encoding='utf-8', newline='') as subfile:
                        subfile.write(sub_info['data'])
                    sub_info['filepath'] = sub_filename
                    ret.append((sub_filename, sub_filename_final))
                    continue
                except (OSError, IOError):
                    # A failed local write aborts the whole operation (None),
                    # unlike a failed download below which only skips the track
                    self.report_error(f'Cannot write video subtitles file {sub_filename}')
                    return None

            try:
                # No inline data: download the subtitle, reusing the video's headers
                sub_copy = sub_info.copy()
                sub_copy.setdefault('http_headers', info_dict.get('http_headers'))
                self.dl(sub_filename, sub_copy, subtitle=True)
                sub_info['filepath'] = sub_filename
                ret.append((sub_filename, sub_filename_final))
            except (ExtractorError, IOError, OSError, ValueError) + network_exceptions as err:
                # Download failure is non-fatal: warn and move to the next language
                self.report_warning(f'Unable to download video subtitles for {sub_lang!r}: {err}')
                continue
        return ret
3566
    def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None):
        ''' Write thumbnails to file and return list of (thumb_filename, final_thumb_filename) '''
        write_all = self.params.get('write_all_thumbnails', False)
        thumbnails, ret = [], []
        if write_all or self.params.get('writethumbnail', False):
            thumbnails = info_dict.get('thumbnails') or []
        # Only disambiguate filenames by thumbnail id when several may be written
        multiple = write_all and len(thumbnails) > 1

        if thumb_filename_base is None:
            thumb_filename_base = filename
        if thumbnails and not thumb_filename_base:
            self.write_debug(f'Skipping writing {label} thumbnail')
            return ret

        # Iterate in reverse so the last-listed thumbnail is attempted first
        for t in thumbnails[::-1]:
            thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg')
            thumb_display_id = f'{label} thumbnail {t["id"]}'
            thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext'))
            thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext'))

            if not self.params.get('overwrites', True) and os.path.exists(thumb_filename):
                # Existing file with overwrites disabled counts as a success
                ret.append((thumb_filename, thumb_filename_final))
                t['filepath'] = thumb_filename
                self.to_screen('[info] %s is already present' % (
                    thumb_display_id if multiple else f'{label} thumbnail').capitalize())
            else:
                self.to_screen(f'[info] Downloading {thumb_display_id} ...')
                try:
                    uf = self.urlopen(t['url'])
                    self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}')
                    with open(encodeFilename(thumb_filename), 'wb') as thumbf:
                        shutil.copyfileobj(uf, thumbf)
                    ret.append((thumb_filename, thumb_filename_final))
                    t['filepath'] = thumb_filename
                except network_exceptions as err:
                    # Download failure is non-fatal; try the next candidate
                    self.report_warning(f'Unable to download {thumb_display_id}: {err}')
            # Unless all thumbnails were requested, stop after the first success
            if ret and not write_all:
                break
        return ret