]> jfr.im git - yt-dlp.git/blob - yt_dlp/YoutubeDL.py
[extractor] Framework for embed detection (#4307)
[yt-dlp.git] / yt_dlp / YoutubeDL.py
1 import collections
2 import contextlib
3 import datetime
4 import errno
5 import fileinput
6 import functools
7 import io
8 import itertools
9 import json
10 import locale
11 import operator
12 import os
13 import random
14 import re
15 import shutil
16 import subprocess
17 import sys
18 import tempfile
19 import time
20 import tokenize
21 import traceback
22 import unicodedata
23 import urllib.request
24 from string import ascii_letters
25
26 from .cache import Cache
27 from .compat import compat_os_name, compat_shlex_quote
28 from .cookies import load_cookies
29 from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
30 from .downloader.rtmp import rtmpdump_version
31 from .extractor import gen_extractor_classes, get_info_extractor
32 from .extractor.openload import PhantomJSwrapper
33 from .minicurses import format_text
34 from .postprocessor import _PLUGIN_CLASSES as plugin_postprocessors
35 from .postprocessor import (
36 EmbedThumbnailPP,
37 FFmpegFixupDuplicateMoovPP,
38 FFmpegFixupDurationPP,
39 FFmpegFixupM3u8PP,
40 FFmpegFixupM4aPP,
41 FFmpegFixupStretchedPP,
42 FFmpegFixupTimestampPP,
43 FFmpegMergerPP,
44 FFmpegPostProcessor,
45 FFmpegVideoConvertorPP,
46 MoveFilesAfterDownloadPP,
47 get_postprocessor,
48 )
49 from .postprocessor.ffmpeg import resolve_mapping as resolve_recode_mapping
50 from .update import detect_variant
51 from .utils import (
52 DEFAULT_OUTTMPL,
53 IDENTITY,
54 LINK_TEMPLATES,
55 MEDIA_EXTENSIONS,
56 NO_DEFAULT,
57 NUMBER_RE,
58 OUTTMPL_TYPES,
59 POSTPROCESS_WHEN,
60 STR_FORMAT_RE_TMPL,
61 STR_FORMAT_TYPES,
62 ContentTooShortError,
63 DateRange,
64 DownloadCancelled,
65 DownloadError,
66 EntryNotInPlaylist,
67 ExistingVideoReached,
68 ExtractorError,
69 GeoRestrictedError,
70 HEADRequest,
71 ISO3166Utils,
72 LazyList,
73 MaxDownloadsReached,
74 Namespace,
75 PagedList,
76 PerRequestProxyHandler,
77 PlaylistEntries,
78 Popen,
79 PostProcessingError,
80 ReExtractInfo,
81 RejectedVideoReached,
82 SameFileError,
83 UnavailableVideoError,
84 UserNotLive,
85 YoutubeDLCookieProcessor,
86 YoutubeDLHandler,
87 YoutubeDLRedirectHandler,
88 age_restricted,
89 args_to_str,
90 bug_reports_message,
91 date_from_str,
92 determine_ext,
93 determine_protocol,
94 encode_compat_str,
95 encodeFilename,
96 error_to_compat_str,
97 escapeHTML,
98 expand_path,
99 filter_dict,
100 float_or_none,
101 format_bytes,
102 format_decimal_suffix,
103 format_field,
104 formatSeconds,
105 get_domain,
106 int_or_none,
107 iri_to_uri,
108 join_nonempty,
109 locked_file,
110 make_dir,
111 make_HTTPS_handler,
112 merge_headers,
113 network_exceptions,
114 number_of_digits,
115 orderedSet,
116 parse_filesize,
117 preferredencoding,
118 prepend_extension,
119 register_socks_protocols,
120 remove_terminal_sequences,
121 render_table,
122 replace_extension,
123 sanitize_filename,
124 sanitize_path,
125 sanitize_url,
126 sanitized_Request,
127 std_headers,
128 str_or_none,
129 strftime_or_none,
130 subtitles_filename,
131 supports_terminal_sequences,
132 system_identifier,
133 timetuple_from_msec,
134 to_high_limit_path,
135 traverse_obj,
136 try_get,
137 url_basename,
138 variadic,
139 version_tuple,
140 windows_enable_vt_mode,
141 write_json_file,
142 write_string,
143 )
144 from .version import RELEASE_GIT_HEAD, __version__
145
146 if compat_os_name == 'nt':
147 import ctypes
148
149
150 class YoutubeDL:
151 """YoutubeDL class.
152
153 YoutubeDL objects are the ones responsible of downloading the
154 actual video file and writing it to disk if the user has requested
155 it, among some other tasks. In most cases there should be one per
156 program. As, given a video URL, the downloader doesn't know how to
157 extract all the needed information, task that InfoExtractors do, it
158 has to pass the URL to one of them.
159
160 For this, YoutubeDL objects have a method that allows
161 InfoExtractors to be registered in a given order. When it is passed
162 a URL, the YoutubeDL object handles it to the first InfoExtractor it
163 finds that reports being able to handle it. The InfoExtractor extracts
164 all the information about the video or videos the URL refers to, and
165 YoutubeDL process the extracted information, possibly using a File
166 Downloader to download the video.
167
168 YoutubeDL objects accept a lot of parameters. In order not to saturate
169 the object constructor with arguments, it receives a dictionary of
170 options instead. These options are available through the params
171 attribute for the InfoExtractors to use. The YoutubeDL also
172 registers itself as the downloader in charge for the InfoExtractors
173 that are added to it, so this is a "mutual registration".
174
175 Available options:
176
177 username: Username for authentication purposes.
178 password: Password for authentication purposes.
179 videopassword: Password for accessing a video.
180 ap_mso: Adobe Pass multiple-system operator identifier.
181 ap_username: Multiple-system operator account username.
182 ap_password: Multiple-system operator account password.
183 usenetrc: Use netrc for authentication instead.
184 verbose: Print additional info to stdout.
185 quiet: Do not print messages to stdout.
186 no_warnings: Do not print out anything for warnings.
187 forceprint: A dict with keys WHEN mapped to a list of templates to
188 print to stdout. The allowed keys are video or any of the
189 items in utils.POSTPROCESS_WHEN.
190 For compatibility, a single list is also accepted
191 print_to_file: A dict with keys WHEN (same as forceprint) mapped to
192 a list of tuples with (template, filename)
193 forcejson: Force printing info_dict as JSON.
194 dump_single_json: Force printing the info_dict of the whole playlist
195 (or video) as a single JSON line.
196 force_write_download_archive: Force writing download archive regardless
197 of 'skip_download' or 'simulate'.
198 simulate: Do not download the video files. If unset (or None),
199 simulate only if listsubtitles, listformats or list_thumbnails is used
200 format: Video format code. see "FORMAT SELECTION" for more details.
201 You can also pass a function. The function takes 'ctx' as
202 argument and returns the formats to download.
203 See "build_format_selector" for an implementation
204 allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded.
205 ignore_no_formats_error: Ignore "No video formats" error. Usefull for
206 extracting metadata even if the video is not actually
207 available for download (experimental)
208 format_sort: A list of fields by which to sort the video formats.
209 See "Sorting Formats" for more details.
210 format_sort_force: Force the given format_sort. see "Sorting Formats"
211 for more details.
212 prefer_free_formats: Whether to prefer video formats with free containers
213 over non-free ones of same quality.
214 allow_multiple_video_streams: Allow multiple video streams to be merged
215 into a single file
216 allow_multiple_audio_streams: Allow multiple audio streams to be merged
217 into a single file
218 check_formats Whether to test if the formats are downloadable.
219 Can be True (check all), False (check none),
220 'selected' (check selected formats),
221 or None (check only if requested by extractor)
222 paths: Dictionary of output paths. The allowed keys are 'home'
223 'temp' and the keys of OUTTMPL_TYPES (in utils.py)
224 outtmpl: Dictionary of templates for output names. Allowed keys
225 are 'default' and the keys of OUTTMPL_TYPES (in utils.py).
226 For compatibility with youtube-dl, a single string can also be used
227 outtmpl_na_placeholder: Placeholder for unavailable meta fields.
228 restrictfilenames: Do not allow "&" and spaces in file names
229 trim_file_name: Limit length of filename (extension excluded)
230 windowsfilenames: Force the filenames to be windows compatible
231 ignoreerrors: Do not stop on download/postprocessing errors.
232 Can be 'only_download' to ignore only download errors.
233 Default is 'only_download' for CLI, but False for API
234 skip_playlist_after_errors: Number of allowed failures until the rest of
235 the playlist is skipped
236 force_generic_extractor: Force downloader to use the generic extractor
237 overwrites: Overwrite all video and metadata files if True,
238 overwrite only non-video files if None
239 and don't overwrite any file if False
240 For compatibility with youtube-dl,
241 "nooverwrites" may also be used instead
242 playlist_items: Specific indices of playlist to download.
243 playlistrandom: Download playlist items in random order.
244 lazy_playlist: Process playlist entries as they are received.
245 matchtitle: Download only matching titles.
246 rejecttitle: Reject downloads for matching titles.
247 logger: Log messages to a logging.Logger instance.
248 logtostderr: Log messages to stderr instead of stdout.
249 consoletitle: Display progress in console window's titlebar.
250 writedescription: Write the video description to a .description file
251 writeinfojson: Write the video description to a .info.json file
252 clean_infojson: Remove private fields from the infojson
253 getcomments: Extract video comments. This will not be written to disk
254 unless writeinfojson is also given
255 writeannotations: Write the video annotations to a .annotations.xml file
256 writethumbnail: Write the thumbnail image to a file
257 allow_playlist_files: Whether to write playlists' description, infojson etc
258 also to disk when using the 'write*' options
259 write_all_thumbnails: Write all thumbnail formats to files
260 writelink: Write an internet shortcut file, depending on the
261 current platform (.url/.webloc/.desktop)
262 writeurllink: Write a Windows internet shortcut file (.url)
263 writewebloclink: Write a macOS internet shortcut file (.webloc)
264 writedesktoplink: Write a Linux internet shortcut file (.desktop)
265 writesubtitles: Write the video subtitles to a file
266 writeautomaticsub: Write the automatically generated subtitles to a file
267 listsubtitles: Lists all available subtitles for the video
268 subtitlesformat: The format code for subtitles
269 subtitleslangs: List of languages of the subtitles to download (can be regex).
270 The list may contain "all" to refer to all the available
271 subtitles. The language can be prefixed with a "-" to
272 exclude it from the requested languages. Eg: ['all', '-live_chat']
273 keepvideo: Keep the video file after post-processing
274 daterange: A DateRange object, download only if the upload_date is in the range.
275 skip_download: Skip the actual download of the video file
276 cachedir: Location of the cache files in the filesystem.
277 False to disable filesystem cache.
278 noplaylist: Download single video instead of a playlist if in doubt.
279 age_limit: An integer representing the user's age in years.
280 Unsuitable videos for the given age are skipped.
281 min_views: An integer representing the minimum view count the video
282 must have in order to not be skipped.
283 Videos without view count information are always
284 downloaded. None for no limit.
285 max_views: An integer representing the maximum view count.
286 Videos that are more popular than that are not
287 downloaded.
288 Videos without view count information are always
289 downloaded. None for no limit.
290 download_archive: File name of a file where all downloads are recorded.
291 Videos already present in the file are not downloaded
292 again.
293 break_on_existing: Stop the download process after attempting to download a
294 file that is in the archive.
295 break_on_reject: Stop the download process when encountering a video that
296 has been filtered out.
297 break_per_url: Whether break_on_reject and break_on_existing
298 should act on each input URL as opposed to for the entire queue
299 cookiefile: File name or text stream from where cookies should be read and dumped to
300 cookiesfrombrowser: A tuple containing the name of the browser, the profile
301 name/pathfrom where cookies are loaded, and the name of the
302 keyring. Eg: ('chrome', ) or ('vivaldi', 'default', 'BASICTEXT')
303 legacyserverconnect: Explicitly allow HTTPS connection to servers that do not
304 support RFC 5746 secure renegotiation
305 nocheckcertificate: Do not verify SSL certificates
306 client_certificate: Path to client certificate file in PEM format. May include the private key
307 client_certificate_key: Path to private key file for client certificate
308 client_certificate_password: Password for client certificate private key, if encrypted.
309 If not provided and the key is encrypted, yt-dlp will ask interactively
310 prefer_insecure: Use HTTP instead of HTTPS to retrieve information.
311 (Only supported by some extractors)
312 http_headers: A dictionary of custom headers to be used for all requests
313 proxy: URL of the proxy server to use
314 geo_verification_proxy: URL of the proxy to use for IP address verification
315 on geo-restricted sites.
316 socket_timeout: Time to wait for unresponsive hosts, in seconds
317 bidi_workaround: Work around buggy terminals without bidirectional text
318 support, using fridibi
319 debug_printtraffic:Print out sent and received HTTP traffic
320 default_search: Prepend this string if an input url is not valid.
321 'auto' for elaborate guessing
322 encoding: Use this encoding instead of the system-specified.
323 extract_flat: Whether to resolve and process url_results further
324 * False: Always process (default)
325 * True: Never process
326 * 'in_playlist': Do not process inside playlist/multi_video
327 * 'discard': Always process, but don't return the result
328 from inside playlist/multi_video
329 * 'discard_in_playlist': Same as "discard", but only for
330 playlists (not multi_video)
331 wait_for_video: If given, wait for scheduled streams to become available.
332 The value should be a tuple containing the range
333 (min_secs, max_secs) to wait between retries
334 postprocessors: A list of dictionaries, each with an entry
335 * key: The name of the postprocessor. See
336 yt_dlp/postprocessor/__init__.py for a list.
337 * when: When to run the postprocessor. Allowed values are
338 the entries of utils.POSTPROCESS_WHEN
339 Assumed to be 'post_process' if not given
340 progress_hooks: A list of functions that get called on download
341 progress, with a dictionary with the entries
342 * status: One of "downloading", "error", or "finished".
343 Check this first and ignore unknown values.
344 * info_dict: The extracted info_dict
345
346 If status is one of "downloading", or "finished", the
347 following properties may also be present:
348 * filename: The final filename (always present)
349 * tmpfilename: The filename we're currently writing to
350 * downloaded_bytes: Bytes on disk
351 * total_bytes: Size of the whole file, None if unknown
352 * total_bytes_estimate: Guess of the eventual file size,
353 None if unavailable.
354 * elapsed: The number of seconds since download started.
355 * eta: The estimated time in seconds, None if unknown
356 * speed: The download speed in bytes/second, None if
357 unknown
358 * fragment_index: The counter of the currently
359 downloaded video fragment.
360 * fragment_count: The number of fragments (= individual
361 files that will be merged)
362
363 Progress hooks are guaranteed to be called at least once
364 (with status "finished") if the download is successful.
365 postprocessor_hooks: A list of functions that get called on postprocessing
366 progress, with a dictionary with the entries
367 * status: One of "started", "processing", or "finished".
368 Check this first and ignore unknown values.
369 * postprocessor: Name of the postprocessor
370 * info_dict: The extracted info_dict
371
372 Progress hooks are guaranteed to be called at least twice
373 (with status "started" and "finished") if the processing is successful.
374 merge_output_format: Extension to use when merging formats.
375 final_ext: Expected final extension; used to detect when the file was
376 already downloaded and converted
377 fixup: Automatically correct known faults of the file.
378 One of:
379 - "never": do nothing
380 - "warn": only emit a warning
381 - "detect_or_warn": check whether we can do anything
382 about it, warn otherwise (default)
383 source_address: Client-side IP address to bind to.
384 sleep_interval_requests: Number of seconds to sleep between requests
385 during extraction
386 sleep_interval: Number of seconds to sleep before each download when
387 used alone or a lower bound of a range for randomized
388 sleep before each download (minimum possible number
389 of seconds to sleep) when used along with
390 max_sleep_interval.
391 max_sleep_interval:Upper bound of a range for randomized sleep before each
392 download (maximum possible number of seconds to sleep).
393 Must only be used along with sleep_interval.
394 Actual sleep time will be a random float from range
395 [sleep_interval; max_sleep_interval].
396 sleep_interval_subtitles: Number of seconds to sleep before each subtitle download
397 listformats: Print an overview of available video formats and exit.
398 list_thumbnails: Print a table of all thumbnails and exit.
399 match_filter: A function that gets called for every video with the signature
400 (info_dict, *, incomplete: bool) -> Optional[str]
401 For backward compatibility with youtube-dl, the signature
402 (info_dict) -> Optional[str] is also allowed.
403 - If it returns a message, the video is ignored.
404 - If it returns None, the video is downloaded.
405 - If it returns utils.NO_DEFAULT, the user is interactively
406 asked whether to download the video.
407 match_filter_func in utils.py is one example for this.
408 no_color: Do not emit color codes in output.
409 geo_bypass: Bypass geographic restriction via faking X-Forwarded-For
410 HTTP header
411 geo_bypass_country:
412 Two-letter ISO 3166-2 country code that will be used for
413 explicit geographic restriction bypassing via faking
414 X-Forwarded-For HTTP header
415 geo_bypass_ip_block:
416 IP range in CIDR notation that will be used similarly to
417 geo_bypass_country
418 external_downloader: A dictionary of protocol keys and the executable of the
419 external downloader to use for it. The allowed protocols
420 are default|http|ftp|m3u8|dash|rtsp|rtmp|mms.
421 Set the value to 'native' to use the native downloader
422 compat_opts: Compatibility options. See "Differences in default behavior".
423 The following options do not work when used through the API:
424 filename, abort-on-error, multistreams, no-live-chat, format-sort
425 no-clean-infojson, no-playlist-metafiles, no-keep-subs, no-attach-info-json.
426 Refer __init__.py for their implementation
427 progress_template: Dictionary of templates for progress outputs.
428 Allowed keys are 'download', 'postprocess',
429 'download-title' (console title) and 'postprocess-title'.
430 The template is mapped on a dictionary with keys 'progress' and 'info'
431 retry_sleep_functions: Dictionary of functions that takes the number of attempts
432 as argument and returns the time to sleep in seconds.
433 Allowed keys are 'http', 'fragment', 'file_access'
434 download_ranges: A callback function that gets called for every video with
435 the signature (info_dict, ydl) -> Iterable[Section].
436 Only the returned sections will be downloaded.
437 Each Section is a dict with the following keys:
438 * start_time: Start time of the section in seconds
439 * end_time: End time of the section in seconds
440 * title: Section title (Optional)
441 * index: Section number (Optional)
442 force_keyframes_at_cuts: Re-encode the video when downloading ranges to get precise cuts
443 noprogress: Do not print the progress bar
444
445 The following parameters are not used by YoutubeDL itself, they are used by
446 the downloader (see yt_dlp/downloader/common.py):
447 nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize,
448 max_filesize, test, noresizebuffer, retries, file_access_retries, fragment_retries,
449 continuedl, xattr_set_filesize, hls_use_mpegts, http_chunk_size,
450 external_downloader_args, concurrent_fragment_downloads.
451
452 The following options are used by the post processors:
453 ffmpeg_location: Location of the ffmpeg/avconv binary; either the path
454 to the binary or its containing directory.
455 postprocessor_args: A dictionary of postprocessor/executable keys (in lower case)
456 and a list of additional command-line arguments for the
457 postprocessor/executable. The dict can also have "PP+EXE" keys
458 which are used when the given exe is used by the given PP.
459 Use 'default' as the name for arguments to passed to all PP
460 For compatibility with youtube-dl, a single list of args
461 can also be used
462
463 The following options are used by the extractors:
464 extractor_retries: Number of times to retry for known errors
465 dynamic_mpd: Whether to process dynamic DASH manifests (default: True)
466 hls_split_discontinuity: Split HLS playlists to different formats at
467 discontinuities such as ad breaks (default: False)
468 extractor_args: A dictionary of arguments to be passed to the extractors.
469 See "EXTRACTOR ARGUMENTS" for details.
470 Eg: {'youtube': {'skip': ['dash', 'hls']}}
471 mark_watched: Mark videos watched (even with --simulate). Only for YouTube
472
473 The following options are deprecated and may be removed in the future:
474
475 playliststart: - Use playlist_items
476 Playlist item to start at.
477 playlistend: - Use playlist_items
478 Playlist item to end at.
479 playlistreverse: - Use playlist_items
480 Download playlist items in reverse order.
481 forceurl: - Use forceprint
482 Force printing final URL.
483 forcetitle: - Use forceprint
484 Force printing title.
485 forceid: - Use forceprint
486 Force printing ID.
487 forcethumbnail: - Use forceprint
488 Force printing thumbnail URL.
489 forcedescription: - Use forceprint
490 Force printing description.
491 forcefilename: - Use forceprint
492 Force printing final filename.
493 forceduration: - Use forceprint
494 Force printing duration.
495 allsubtitles: - Use subtitleslangs = ['all']
496 Downloads all the subtitles of the video
497 (requires writesubtitles or writeautomaticsub)
498 include_ads: - Doesn't work
499 Download ads as well
500 call_home: - Not implemented
501 Boolean, true iff we are allowed to contact the
502 yt-dlp servers for debugging.
503 post_hooks: - Register a custom postprocessor
504 A list of functions that get called as the final step
505 for each video file, after all postprocessors have been
506 called. The filename will be passed as the only argument.
507 hls_prefer_native: - Use external_downloader = {'m3u8': 'native'} or {'m3u8': 'ffmpeg'}.
508 Use the native HLS downloader instead of ffmpeg/avconv
509 if True, otherwise use ffmpeg/avconv if False, otherwise
510 use downloader suggested by extractor if None.
511 prefer_ffmpeg: - avconv support is deprecated
512 If False, use avconv instead of ffmpeg if both are available,
513 otherwise prefer ffmpeg.
514 youtube_include_dash_manifest: - Use extractor_args
515 If True (default), DASH manifests and related
516 data will be downloaded and processed by extractor.
517 You can reduce network I/O by disabling it if you don't
518 care about DASH. (only for youtube)
519 youtube_include_hls_manifest: - Use extractor_args
520 If True (default), HLS manifests and related
521 data will be downloaded and processed by extractor.
522 You can reduce network I/O by disabling it if you don't
523 care about HLS. (only for youtube)
524 """
525
526 _NUMERIC_FIELDS = {
527 'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
528 'timestamp', 'release_timestamp',
529 'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
530 'average_rating', 'comment_count', 'age_limit',
531 'start_time', 'end_time',
532 'chapter_number', 'season_number', 'episode_number',
533 'track_number', 'disc_number', 'release_year',
534 }
535
536 _format_fields = {
537 # NB: Keep in sync with the docstring of extractor/common.py
538 'url', 'manifest_url', 'manifest_stream_number', 'ext', 'format', 'format_id', 'format_note',
539 'width', 'height', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr',
540 'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx',
541 'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start',
542 'preference', 'language', 'language_preference', 'quality', 'source_preference',
543 'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'downloader_options',
544 'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time'
545 }
546 _format_selection_exts = {
547 'audio': set(MEDIA_EXTENSIONS.common_audio),
548 'video': set(MEDIA_EXTENSIONS.common_video + ('3gp', )),
549 'storyboards': set(MEDIA_EXTENSIONS.storyboards),
550 }
551
552 def __init__(self, params=None, auto_init=True):
553 """Create a FileDownloader object with the given options.
554 @param auto_init Whether to load the default extractors and print header (if verbose).
555 Set to 'no_verbose_header' to not print the header
556 """
557 if params is None:
558 params = {}
559 self.params = params
560 self._ies = {}
561 self._ies_instances = {}
562 self._pps = {k: [] for k in POSTPROCESS_WHEN}
563 self._printed_messages = set()
564 self._first_webpage_request = True
565 self._post_hooks = []
566 self._progress_hooks = []
567 self._postprocessor_hooks = []
568 self._download_retcode = 0
569 self._num_downloads = 0
570 self._num_videos = 0
571 self._playlist_level = 0
572 self._playlist_urls = set()
573 self.cache = Cache(self)
574
575 windows_enable_vt_mode()
576 stdout = sys.stderr if self.params.get('logtostderr') else sys.stdout
577 self._out_files = Namespace(
578 out=stdout,
579 error=sys.stderr,
580 screen=sys.stderr if self.params.get('quiet') else stdout,
581 console=None if compat_os_name == 'nt' else next(
582 filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None)
583 )
584 self._allow_colors = Namespace(**{
585 type_: not self.params.get('no_color') and supports_terminal_sequences(stream)
586 for type_, stream in self._out_files.items_ if type_ != 'console'
587 })
588
589 # The code is left like this to be reused for future deprecations
590 MIN_SUPPORTED, MIN_RECOMMENDED = (3, 7), (3, 7)
591 current_version = sys.version_info[:2]
592 if current_version < MIN_RECOMMENDED:
593 msg = ('Support for Python version %d.%d has been deprecated. '
594 'See https://github.com/yt-dlp/yt-dlp/issues/3764 for more details.'
595 '\n You will no longer receive updates on this version')
596 if current_version < MIN_SUPPORTED:
597 msg = 'Python version %d.%d is no longer supported'
598 self.deprecation_warning(
599 f'{msg}! Please update to Python %d.%d or above' % (*current_version, *MIN_RECOMMENDED))
600
601 if self.params.get('allow_unplayable_formats'):
602 self.report_warning(
603 f'You have asked for {self._format_err("UNPLAYABLE", self.Styles.EMPHASIS)} formats to be listed/downloaded. '
604 'This is a developer option intended for debugging. \n'
605 ' If you experience any issues while using this option, '
606 f'{self._format_err("DO NOT", self.Styles.ERROR)} open a bug report')
607
608 def check_deprecated(param, option, suggestion):
609 if self.params.get(param) is not None:
610 self.report_warning(f'{option} is deprecated. Use {suggestion} instead')
611 return True
612 return False
613
614 if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
615 if self.params.get('geo_verification_proxy') is None:
616 self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']
617
618 check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
619 check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
620 check_deprecated('useid', '--id', '-o "%(id)s.%(ext)s"')
621
622 for msg in self.params.get('_warnings', []):
623 self.report_warning(msg)
624 for msg in self.params.get('_deprecation_warnings', []):
625 self.deprecation_warning(msg)
626
627 self.params['compat_opts'] = set(self.params.get('compat_opts', ()))
628 if 'list-formats' in self.params['compat_opts']:
629 self.params['listformats_table'] = False
630
631 if 'overwrites' not in self.params and self.params.get('nooverwrites') is not None:
632 # nooverwrites was unnecessarily changed to overwrites
633 # in 0c3d0f51778b153f65c21906031c2e091fcfb641
634 # This ensures compatibility with both keys
635 self.params['overwrites'] = not self.params['nooverwrites']
636 elif self.params.get('overwrites') is None:
637 self.params.pop('overwrites', None)
638 else:
639 self.params['nooverwrites'] = not self.params['overwrites']
640
641 self.params.setdefault('forceprint', {})
642 self.params.setdefault('print_to_file', {})
643
644 # Compatibility with older syntax
645 if not isinstance(params['forceprint'], dict):
646 self.params['forceprint'] = {'video': params['forceprint']}
647
648 if self.params.get('bidi_workaround', False):
649 try:
650 import pty
651 master, slave = pty.openpty()
652 width = shutil.get_terminal_size().columns
653 width_args = [] if width is None else ['-w', str(width)]
654 sp_kwargs = {'stdin': subprocess.PIPE, 'stdout': slave, 'stderr': self._out_files.error}
655 try:
656 self._output_process = Popen(['bidiv'] + width_args, **sp_kwargs)
657 except OSError:
658 self._output_process = Popen(['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
659 self._output_channel = os.fdopen(master, 'rb')
660 except OSError as ose:
661 if ose.errno == errno.ENOENT:
662 self.report_warning(
663 'Could not find fribidi executable, ignoring --bidi-workaround. '
664 'Make sure that fribidi is an executable file in one of the directories in your $PATH.')
665 else:
666 raise
667
668 if auto_init:
669 if auto_init != 'no_verbose_header':
670 self.print_debug_header()
671 self.add_default_info_extractors()
672
673 if (sys.platform != 'win32'
674 and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
675 and not self.params.get('restrictfilenames', False)):
676 # Unicode filesystem API will throw errors (#1474, #13027)
677 self.report_warning(
678 'Assuming --restrict-filenames since file system encoding '
679 'cannot encode all characters. '
680 'Set the LC_ALL environment variable to fix this.')
681 self.params['restrictfilenames'] = True
682
683 self._parse_outtmpl()
684
685 # Creating format selector here allows us to catch syntax errors before the extraction
686 self.format_selector = (
687 self.params.get('format') if self.params.get('format') in (None, '-')
688 else self.params['format'] if callable(self.params['format'])
689 else self.build_format_selector(self.params['format']))
690
691 # Set http_headers defaults according to std_headers
692 self.params['http_headers'] = merge_headers(std_headers, self.params.get('http_headers', {}))
693
694 hooks = {
695 'post_hooks': self.add_post_hook,
696 'progress_hooks': self.add_progress_hook,
697 'postprocessor_hooks': self.add_postprocessor_hook,
698 }
699 for opt, fn in hooks.items():
700 for ph in self.params.get(opt, []):
701 fn(ph)
702
703 for pp_def_raw in self.params.get('postprocessors', []):
704 pp_def = dict(pp_def_raw)
705 when = pp_def.pop('when', 'post_process')
706 self.add_post_processor(
707 get_postprocessor(pp_def.pop('key'))(self, **pp_def),
708 when=when)
709
710 self._setup_opener()
711 register_socks_protocols()
712
713 def preload_download_archive(fn):
714 """Preload the archive, if any is specified"""
715 if fn is None:
716 return False
717 self.write_debug(f'Loading archive file {fn!r}')
718 try:
719 with locked_file(fn, 'r', encoding='utf-8') as archive_file:
720 for line in archive_file:
721 self.archive.add(line.strip())
722 except OSError as ioe:
723 if ioe.errno != errno.ENOENT:
724 raise
725 return False
726 return True
727
728 self.archive = set()
729 preload_download_archive(self.params.get('download_archive'))
730
731 def warn_if_short_id(self, argv):
732 # short YouTube ID starting with dash?
733 idxs = [
734 i for i, a in enumerate(argv)
735 if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
736 if idxs:
737 correct_argv = (
738 ['yt-dlp']
739 + [a for i, a in enumerate(argv) if i not in idxs]
740 + ['--'] + [argv[i] for i in idxs]
741 )
742 self.report_warning(
743 'Long argument string detected. '
744 'Use -- to separate parameters and URLs, like this:\n%s' %
745 args_to_str(correct_argv))
746
747 def add_info_extractor(self, ie):
748 """Add an InfoExtractor object to the end of the list."""
749 ie_key = ie.ie_key()
750 self._ies[ie_key] = ie
751 if not isinstance(ie, type):
752 self._ies_instances[ie_key] = ie
753 ie.set_downloader(self)
754
755 def _get_info_extractor_class(self, ie_key):
756 ie = self._ies.get(ie_key)
757 if ie is None:
758 ie = get_info_extractor(ie_key)
759 self.add_info_extractor(ie)
760 return ie
761
762 def get_info_extractor(self, ie_key):
763 """
764 Get an instance of an IE with name ie_key, it will try to get one from
765 the _ies list, if there's no instance it will create a new one and add
766 it to the extractor list.
767 """
768 ie = self._ies_instances.get(ie_key)
769 if ie is None:
770 ie = get_info_extractor(ie_key)()
771 self.add_info_extractor(ie)
772 return ie
773
774 def add_default_info_extractors(self):
775 """
776 Add the InfoExtractors returned by gen_extractors to the end of the list
777 """
778 for ie in gen_extractor_classes():
779 self.add_info_extractor(ie)
780
781 def add_post_processor(self, pp, when='post_process'):
782 """Add a PostProcessor object to the end of the chain."""
783 assert when in POSTPROCESS_WHEN, f'Invalid when={when}'
784 self._pps[when].append(pp)
785 pp.set_downloader(self)
786
787 def add_post_hook(self, ph):
788 """Add the post hook"""
789 self._post_hooks.append(ph)
790
791 def add_progress_hook(self, ph):
792 """Add the download progress hook"""
793 self._progress_hooks.append(ph)
794
795 def add_postprocessor_hook(self, ph):
796 """Add the postprocessing progress hook"""
797 self._postprocessor_hooks.append(ph)
798 for pps in self._pps.values():
799 for pp in pps:
800 pp.add_progress_hook(ph)
801
802 def _bidi_workaround(self, message):
803 if not hasattr(self, '_output_channel'):
804 return message
805
806 assert hasattr(self, '_output_process')
807 assert isinstance(message, str)
808 line_count = message.count('\n') + 1
809 self._output_process.stdin.write((message + '\n').encode())
810 self._output_process.stdin.flush()
811 res = ''.join(self._output_channel.readline().decode()
812 for _ in range(line_count))
813 return res[:-len('\n')]
814
815 def _write_string(self, message, out=None, only_once=False):
816 if only_once:
817 if message in self._printed_messages:
818 return
819 self._printed_messages.add(message)
820 write_string(message, out=out, encoding=self.params.get('encoding'))
821
822 def to_stdout(self, message, skip_eol=False, quiet=None):
823 """Print message to stdout"""
824 if quiet is not None:
825 self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument quiet. Use "YoutubeDL.to_screen" instead')
826 if skip_eol is not False:
827 self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument skip_eol. Use "YoutubeDL.to_screen" instead')
828 self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.out)
829
830 def to_screen(self, message, skip_eol=False, quiet=None):
831 """Print message to screen if not in quiet mode"""
832 if self.params.get('logger'):
833 self.params['logger'].debug(message)
834 return
835 if (self.params.get('quiet') if quiet is None else quiet) and not self.params.get('verbose'):
836 return
837 self._write_string(
838 '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')),
839 self._out_files.screen)
840
841 def to_stderr(self, message, only_once=False):
842 """Print message to stderr"""
843 assert isinstance(message, str)
844 if self.params.get('logger'):
845 self.params['logger'].error(message)
846 else:
847 self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.error, only_once=only_once)
848
849 def _send_console_code(self, code):
850 if compat_os_name == 'nt' or not self._out_files.console:
851 return
852 self._write_string(code, self._out_files.console)
853
854 def to_console_title(self, message):
855 if not self.params.get('consoletitle', False):
856 return
857 message = remove_terminal_sequences(message)
858 if compat_os_name == 'nt':
859 if ctypes.windll.kernel32.GetConsoleWindow():
860 # c_wchar_p() might not be necessary if `message` is
861 # already of type unicode()
862 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
863 else:
864 self._send_console_code(f'\033]0;{message}\007')
865
866 def save_console_title(self):
867 if not self.params.get('consoletitle') or self.params.get('simulate'):
868 return
869 self._send_console_code('\033[22;0t') # Save the title on stack
870
871 def restore_console_title(self):
872 if not self.params.get('consoletitle') or self.params.get('simulate'):
873 return
874 self._send_console_code('\033[23;0t') # Restore the title from stack
875
876 def __enter__(self):
877 self.save_console_title()
878 return self
879
880 def __exit__(self, *args):
881 self.restore_console_title()
882
883 if self.params.get('cookiefile') is not None:
884 self.cookiejar.save(ignore_discard=True, ignore_expires=True)
885
886 def trouble(self, message=None, tb=None, is_error=True):
887 """Determine action to take when a download problem appears.
888
889 Depending on if the downloader has been configured to ignore
890 download errors or not, this method may throw an exception or
891 not when errors are found, after printing the message.
892
893 @param tb If given, is additional traceback information
894 @param is_error Whether to raise error according to ignorerrors
895 """
896 if message is not None:
897 self.to_stderr(message)
898 if self.params.get('verbose'):
899 if tb is None:
900 if sys.exc_info()[0]: # if .trouble has been called from an except block
901 tb = ''
902 if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
903 tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
904 tb += encode_compat_str(traceback.format_exc())
905 else:
906 tb_data = traceback.format_list(traceback.extract_stack())
907 tb = ''.join(tb_data)
908 if tb:
909 self.to_stderr(tb)
910 if not is_error:
911 return
912 if not self.params.get('ignoreerrors'):
913 if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
914 exc_info = sys.exc_info()[1].exc_info
915 else:
916 exc_info = sys.exc_info()
917 raise DownloadError(message, exc_info)
918 self._download_retcode = 1
919
920 Styles = Namespace(
921 HEADERS='yellow',
922 EMPHASIS='light blue',
923 FILENAME='green',
924 ID='green',
925 DELIM='blue',
926 ERROR='red',
927 WARNING='yellow',
928 SUPPRESS='light black',
929 )
930
931 def _format_text(self, handle, allow_colors, text, f, fallback=None, *, test_encoding=False):
932 text = str(text)
933 if test_encoding:
934 original_text = text
935 # handle.encoding can be None. See https://github.com/yt-dlp/yt-dlp/issues/2711
936 encoding = self.params.get('encoding') or getattr(handle, 'encoding', None) or 'ascii'
937 text = text.encode(encoding, 'ignore').decode(encoding)
938 if fallback is not None and text != original_text:
939 text = fallback
940 return format_text(text, f) if allow_colors else text if fallback is None else fallback
941
942 def _format_out(self, *args, **kwargs):
943 return self._format_text(self._out_files.out, self._allow_colors.out, *args, **kwargs)
944
945 def _format_screen(self, *args, **kwargs):
946 return self._format_text(self._out_files.screen, self._allow_colors.screen, *args, **kwargs)
947
948 def _format_err(self, *args, **kwargs):
949 return self._format_text(self._out_files.error, self._allow_colors.error, *args, **kwargs)
950
951 def report_warning(self, message, only_once=False):
952 '''
953 Print the message to stderr, it will be prefixed with 'WARNING:'
954 If stderr is a tty file the 'WARNING:' will be colored
955 '''
956 if self.params.get('logger') is not None:
957 self.params['logger'].warning(message)
958 else:
959 if self.params.get('no_warnings'):
960 return
961 self.to_stderr(f'{self._format_err("WARNING:", self.Styles.WARNING)} {message}', only_once)
962
963 def deprecation_warning(self, message):
964 if self.params.get('logger') is not None:
965 self.params['logger'].warning(f'DeprecationWarning: {message}')
966 else:
967 self.to_stderr(f'{self._format_err("DeprecationWarning:", self.Styles.ERROR)} {message}', True)
968
969 def report_error(self, message, *args, **kwargs):
970 '''
971 Do the same as trouble, but prefixes the message with 'ERROR:', colored
972 in red if stderr is a tty file.
973 '''
974 self.trouble(f'{self._format_err("ERROR:", self.Styles.ERROR)} {message}', *args, **kwargs)
975
976 def write_debug(self, message, only_once=False):
977 '''Log debug message or Print message to stderr'''
978 if not self.params.get('verbose', False):
979 return
980 message = f'[debug] {message}'
981 if self.params.get('logger'):
982 self.params['logger'].debug(message)
983 else:
984 self.to_stderr(message, only_once)
985
986 def report_file_already_downloaded(self, file_name):
987 """Report file has already been fully downloaded."""
988 try:
989 self.to_screen('[download] %s has already been downloaded' % file_name)
990 except UnicodeEncodeError:
991 self.to_screen('[download] The file has already been downloaded')
992
993 def report_file_delete(self, file_name):
994 """Report that existing file will be deleted."""
995 try:
996 self.to_screen('Deleting existing file %s' % file_name)
997 except UnicodeEncodeError:
998 self.to_screen('Deleting existing file')
999
1000 def raise_no_formats(self, info, forced=False, *, msg=None):
1001 has_drm = info.get('_has_drm')
1002 ignored, expected = self.params.get('ignore_no_formats_error'), bool(msg)
1003 msg = msg or has_drm and 'This video is DRM protected' or 'No video formats found!'
1004 if forced or not ignored:
1005 raise ExtractorError(msg, video_id=info['id'], ie=info['extractor'],
1006 expected=has_drm or ignored or expected)
1007 else:
1008 self.report_warning(msg)
1009
1010 def parse_outtmpl(self):
1011 self.deprecation_warning('"YoutubeDL.parse_outtmpl" is deprecated and may be removed in a future version')
1012 self._parse_outtmpl()
1013 return self.params['outtmpl']
1014
1015 def _parse_outtmpl(self):
1016 sanitize = IDENTITY
1017 if self.params.get('restrictfilenames'): # Remove spaces in the default template
1018 sanitize = lambda x: x.replace(' - ', ' ').replace(' ', '-')
1019
1020 outtmpl = self.params.setdefault('outtmpl', {})
1021 if not isinstance(outtmpl, dict):
1022 self.params['outtmpl'] = outtmpl = {'default': outtmpl}
1023 outtmpl.update({k: sanitize(v) for k, v in DEFAULT_OUTTMPL.items() if outtmpl.get(k) is None})
1024
1025 def get_output_path(self, dir_type='', filename=None):
1026 paths = self.params.get('paths', {})
1027 assert isinstance(paths, dict)
1028 path = os.path.join(
1029 expand_path(paths.get('home', '').strip()),
1030 expand_path(paths.get(dir_type, '').strip()) if dir_type else '',
1031 filename or '')
1032 return sanitize_path(path, force=self.params.get('windowsfilenames'))
1033
1034 @staticmethod
1035 def _outtmpl_expandpath(outtmpl):
1036 # expand_path translates '%%' into '%' and '$$' into '$'
1037 # correspondingly that is not what we want since we need to keep
1038 # '%%' intact for template dict substitution step. Working around
1039 # with boundary-alike separator hack.
1040 sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
1041 outtmpl = outtmpl.replace('%%', f'%{sep}%').replace('$$', f'${sep}$')
1042
1043 # outtmpl should be expand_path'ed before template dict substitution
1044 # because meta fields may contain env variables we don't want to
1045 # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
1046 # title "Hello $PATH", we don't want `$PATH` to be expanded.
1047 return expand_path(outtmpl).replace(sep, '')
1048
1049 @staticmethod
1050 def escape_outtmpl(outtmpl):
1051 ''' Escape any remaining strings like %s, %abc% etc. '''
1052 return re.sub(
1053 STR_FORMAT_RE_TMPL.format('', '(?![%(\0])'),
1054 lambda mobj: ('' if mobj.group('has_key') else '%') + mobj.group(0),
1055 outtmpl)
1056
1057 @classmethod
1058 def validate_outtmpl(cls, outtmpl):
1059 ''' @return None or Exception object '''
1060 outtmpl = re.sub(
1061 STR_FORMAT_RE_TMPL.format('[^)]*', '[ljhqBUDS]'),
1062 lambda mobj: f'{mobj.group(0)[:-1]}s',
1063 cls._outtmpl_expandpath(outtmpl))
1064 try:
1065 cls.escape_outtmpl(outtmpl) % collections.defaultdict(int)
1066 return None
1067 except ValueError as err:
1068 return err
1069
1070 @staticmethod
1071 def _copy_infodict(info_dict):
1072 info_dict = dict(info_dict)
1073 info_dict.pop('__postprocessors', None)
1074 info_dict.pop('__pending_error', None)
1075 return info_dict
1076
1077 def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False):
1078 """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict
1079 @param sanitize Whether to sanitize the output as a filename.
1080 For backward compatibility, a function can also be passed
1081 """
1082
1083 info_dict.setdefault('epoch', int(time.time())) # keep epoch consistent once set
1084
1085 info_dict = self._copy_infodict(info_dict)
1086 info_dict['duration_string'] = ( # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
1087 formatSeconds(info_dict['duration'], '-' if sanitize else ':')
1088 if info_dict.get('duration', None) is not None
1089 else None)
1090 info_dict['autonumber'] = int(self.params.get('autonumber_start', 1) - 1 + self._num_downloads)
1091 info_dict['video_autonumber'] = self._num_videos
1092 if info_dict.get('resolution') is None:
1093 info_dict['resolution'] = self.format_resolution(info_dict, default=None)
1094
1095 # For fields playlist_index, playlist_autonumber and autonumber convert all occurrences
1096 # of %(field)s to %(field)0Nd for backward compatibility
1097 field_size_compat_map = {
1098 'playlist_index': number_of_digits(info_dict.get('__last_playlist_index') or 0),
1099 'playlist_autonumber': number_of_digits(info_dict.get('n_entries') or 0),
1100 'autonumber': self.params.get('autonumber_size') or 5,
1101 }
1102
1103 TMPL_DICT = {}
1104 EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljhqBUDS]'))
1105 MATH_FUNCTIONS = {
1106 '+': float.__add__,
1107 '-': float.__sub__,
1108 }
1109 # Field is of the form key1.key2...
1110 # where keys (except first) can be string, int or slice
1111 FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)')
1112 MATH_FIELD_RE = rf'(?:{FIELD_RE}|-?{NUMBER_RE})'
1113 MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys()))
1114 INTERNAL_FORMAT_RE = re.compile(rf'''(?x)
1115 (?P<negate>-)?
1116 (?P<fields>{FIELD_RE})
1117 (?P<maths>(?:{MATH_OPERATORS_RE}{MATH_FIELD_RE})*)
1118 (?:>(?P<strf_format>.+?))?
1119 (?P<remaining>
1120 (?P<alternate>(?<!\\),[^|&)]+)?
1121 (?:&(?P<replacement>.*?))?
1122 (?:\|(?P<default>.*?))?
1123 )$''')
1124
1125 def _traverse_infodict(k):
1126 k = k.split('.')
1127 if k[0] == '':
1128 k.pop(0)
1129 return traverse_obj(info_dict, k, is_user_input=True, traverse_string=True)
1130
1131 def get_value(mdict):
1132 # Object traversal
1133 value = _traverse_infodict(mdict['fields'])
1134 # Negative
1135 if mdict['negate']:
1136 value = float_or_none(value)
1137 if value is not None:
1138 value *= -1
1139 # Do maths
1140 offset_key = mdict['maths']
1141 if offset_key:
1142 value = float_or_none(value)
1143 operator = None
1144 while offset_key:
1145 item = re.match(
1146 MATH_FIELD_RE if operator else MATH_OPERATORS_RE,
1147 offset_key).group(0)
1148 offset_key = offset_key[len(item):]
1149 if operator is None:
1150 operator = MATH_FUNCTIONS[item]
1151 continue
1152 item, multiplier = (item[1:], -1) if item[0] == '-' else (item, 1)
1153 offset = float_or_none(item)
1154 if offset is None:
1155 offset = float_or_none(_traverse_infodict(item))
1156 try:
1157 value = operator(value, multiplier * offset)
1158 except (TypeError, ZeroDivisionError):
1159 return None
1160 operator = None
1161 # Datetime formatting
1162 if mdict['strf_format']:
1163 value = strftime_or_none(value, mdict['strf_format'].replace('\\,', ','))
1164
1165 # XXX: Workaround for https://github.com/yt-dlp/yt-dlp/issues/4485
1166 if sanitize and value == '':
1167 value = None
1168 return value
1169
1170 na = self.params.get('outtmpl_na_placeholder', 'NA')
1171
1172 def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')):
1173 return sanitize_filename(str(value), restricted=restricted, is_id=(
1174 bool(re.search(r'(^|[_.])id(\.|$)', key))
1175 if 'filename-sanitization' in self.params['compat_opts']
1176 else NO_DEFAULT))
1177
1178 sanitizer = sanitize if callable(sanitize) else filename_sanitizer
1179 sanitize = bool(sanitize)
1180
1181 def _dumpjson_default(obj):
1182 if isinstance(obj, (set, LazyList)):
1183 return list(obj)
1184 return repr(obj)
1185
1186 def create_key(outer_mobj):
1187 if not outer_mobj.group('has_key'):
1188 return outer_mobj.group(0)
1189 key = outer_mobj.group('key')
1190 mobj = re.match(INTERNAL_FORMAT_RE, key)
1191 initial_field = mobj.group('fields') if mobj else ''
1192 value, replacement, default = None, None, na
1193 while mobj:
1194 mobj = mobj.groupdict()
1195 default = mobj['default'] if mobj['default'] is not None else default
1196 value = get_value(mobj)
1197 replacement = mobj['replacement']
1198 if value is None and mobj['alternate']:
1199 mobj = re.match(INTERNAL_FORMAT_RE, mobj['remaining'][1:])
1200 else:
1201 break
1202
1203 fmt = outer_mobj.group('format')
1204 if fmt == 's' and value is not None and key in field_size_compat_map.keys():
1205 fmt = f'0{field_size_compat_map[key]:d}d'
1206
1207 value = default if value is None else value if replacement is None else replacement
1208
1209 flags = outer_mobj.group('conversion') or ''
1210 str_fmt = f'{fmt[:-1]}s'
1211 if fmt[-1] == 'l': # list
1212 delim = '\n' if '#' in flags else ', '
1213 value, fmt = delim.join(map(str, variadic(value, allowed_types=(str, bytes)))), str_fmt
1214 elif fmt[-1] == 'j': # json
1215 value, fmt = json.dumps(value, default=_dumpjson_default, indent=4 if '#' in flags else None), str_fmt
1216 elif fmt[-1] == 'h': # html
1217 value, fmt = escapeHTML(value), str_fmt
1218 elif fmt[-1] == 'q': # quoted
1219 value = map(str, variadic(value) if '#' in flags else [value])
1220 value, fmt = ' '.join(map(compat_shlex_quote, value)), str_fmt
1221 elif fmt[-1] == 'B': # bytes
1222 value = f'%{str_fmt}'.encode() % str(value).encode()
1223 value, fmt = value.decode('utf-8', 'ignore'), 's'
1224 elif fmt[-1] == 'U': # unicode normalized
1225 value, fmt = unicodedata.normalize(
1226 # "+" = compatibility equivalence, "#" = NFD
1227 'NF%s%s' % ('K' if '+' in flags else '', 'D' if '#' in flags else 'C'),
1228 value), str_fmt
1229 elif fmt[-1] == 'D': # decimal suffix
1230 num_fmt, fmt = fmt[:-1].replace('#', ''), 's'
1231 value = format_decimal_suffix(value, f'%{num_fmt}f%s' if num_fmt else '%d%s',
1232 factor=1024 if '#' in flags else 1000)
1233 elif fmt[-1] == 'S': # filename sanitization
1234 value, fmt = filename_sanitizer(initial_field, value, restricted='#' in flags), str_fmt
1235 elif fmt[-1] == 'c':
1236 if value:
1237 value = str(value)[0]
1238 else:
1239 fmt = str_fmt
1240 elif fmt[-1] not in 'rs': # numeric
1241 value = float_or_none(value)
1242 if value is None:
1243 value, fmt = default, 's'
1244
1245 if sanitize:
1246 if fmt[-1] == 'r':
1247 # If value is an object, sanitize might convert it to a string
1248 # So we convert it to repr first
1249 value, fmt = repr(value), str_fmt
1250 if fmt[-1] in 'csr':
1251 value = sanitizer(initial_field, value)
1252
1253 key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format'))
1254 TMPL_DICT[key] = value
1255 return '{prefix}%({key}){fmt}'.format(key=key, fmt=fmt, prefix=outer_mobj.group('prefix'))
1256
1257 return EXTERNAL_FORMAT_RE.sub(create_key, outtmpl), TMPL_DICT
1258
1259 def evaluate_outtmpl(self, outtmpl, info_dict, *args, **kwargs):
1260 outtmpl, info_dict = self.prepare_outtmpl(outtmpl, info_dict, *args, **kwargs)
1261 return self.escape_outtmpl(outtmpl) % info_dict
1262
1263 def _prepare_filename(self, info_dict, *, outtmpl=None, tmpl_type=None):
1264 assert None in (outtmpl, tmpl_type), 'outtmpl and tmpl_type are mutually exclusive'
1265 if outtmpl is None:
1266 outtmpl = self.params['outtmpl'].get(tmpl_type or 'default', self.params['outtmpl']['default'])
1267 try:
1268 outtmpl = self._outtmpl_expandpath(outtmpl)
1269 filename = self.evaluate_outtmpl(outtmpl, info_dict, True)
1270 if not filename:
1271 return None
1272
1273 if tmpl_type in ('', 'temp'):
1274 final_ext, ext = self.params.get('final_ext'), info_dict.get('ext')
1275 if final_ext and ext and final_ext != ext and filename.endswith(f'.{final_ext}'):
1276 filename = replace_extension(filename, ext, final_ext)
1277 elif tmpl_type:
1278 force_ext = OUTTMPL_TYPES[tmpl_type]
1279 if force_ext:
1280 filename = replace_extension(filename, force_ext, info_dict.get('ext'))
1281
1282 # https://github.com/blackjack4494/youtube-dlc/issues/85
1283 trim_file_name = self.params.get('trim_file_name', False)
1284 if trim_file_name:
1285 no_ext, *ext = filename.rsplit('.', 2)
1286 filename = join_nonempty(no_ext[:trim_file_name], *ext, delim='.')
1287
1288 return filename
1289 except ValueError as err:
1290 self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
1291 return None
1292
1293 def prepare_filename(self, info_dict, dir_type='', *, outtmpl=None, warn=False):
1294 """Generate the output filename"""
1295 if outtmpl:
1296 assert not dir_type, 'outtmpl and dir_type are mutually exclusive'
1297 dir_type = None
1298 filename = self._prepare_filename(info_dict, tmpl_type=dir_type, outtmpl=outtmpl)
1299 if not filename and dir_type not in ('', 'temp'):
1300 return ''
1301
1302 if warn:
1303 if not self.params.get('paths'):
1304 pass
1305 elif filename == '-':
1306 self.report_warning('--paths is ignored when an outputting to stdout', only_once=True)
1307 elif os.path.isabs(filename):
1308 self.report_warning('--paths is ignored since an absolute path is given in output template', only_once=True)
1309 if filename == '-' or not filename:
1310 return filename
1311
1312 return self.get_output_path(dir_type, filename)
1313
1314 def _match_entry(self, info_dict, incomplete=False, silent=False):
1315 """ Returns None if the file should be downloaded """
1316
1317 video_title = info_dict.get('title', info_dict.get('id', 'entry'))
1318
1319 def check_filter():
1320 if 'title' in info_dict:
1321 # This can happen when we're just evaluating the playlist
1322 title = info_dict['title']
1323 matchtitle = self.params.get('matchtitle', False)
1324 if matchtitle:
1325 if not re.search(matchtitle, title, re.IGNORECASE):
1326 return '"' + title + '" title did not match pattern "' + matchtitle + '"'
1327 rejecttitle = self.params.get('rejecttitle', False)
1328 if rejecttitle:
1329 if re.search(rejecttitle, title, re.IGNORECASE):
1330 return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
1331 date = info_dict.get('upload_date')
1332 if date is not None:
1333 dateRange = self.params.get('daterange', DateRange())
1334 if date not in dateRange:
1335 return f'{date_from_str(date).isoformat()} upload date is not in range {dateRange}'
1336 view_count = info_dict.get('view_count')
1337 if view_count is not None:
1338 min_views = self.params.get('min_views')
1339 if min_views is not None and view_count < min_views:
1340 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
1341 max_views = self.params.get('max_views')
1342 if max_views is not None and view_count > max_views:
1343 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
1344 if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
1345 return 'Skipping "%s" because it is age restricted' % video_title
1346
1347 match_filter = self.params.get('match_filter')
1348 if match_filter is not None:
1349 try:
1350 ret = match_filter(info_dict, incomplete=incomplete)
1351 except TypeError:
1352 # For backward compatibility
1353 ret = None if incomplete else match_filter(info_dict)
1354 if ret is NO_DEFAULT:
1355 while True:
1356 filename = self._format_screen(self.prepare_filename(info_dict), self.Styles.FILENAME)
1357 reply = input(self._format_screen(
1358 f'Download "{filename}"? (Y/n): ', self.Styles.EMPHASIS)).lower().strip()
1359 if reply in {'y', ''}:
1360 return None
1361 elif reply == 'n':
1362 return f'Skipping {video_title}'
1363 elif ret is not None:
1364 return ret
1365 return None
1366
1367 if self.in_download_archive(info_dict):
1368 reason = '%s has already been recorded in the archive' % video_title
1369 break_opt, break_err = 'break_on_existing', ExistingVideoReached
1370 else:
1371 reason = check_filter()
1372 break_opt, break_err = 'break_on_reject', RejectedVideoReached
1373 if reason is not None:
1374 if not silent:
1375 self.to_screen('[download] ' + reason)
1376 if self.params.get(break_opt, False):
1377 raise break_err()
1378 return reason
1379
1380 @staticmethod
1381 def add_extra_info(info_dict, extra_info):
1382 '''Set the keys from extra_info in info dict if they are missing'''
1383 for key, value in extra_info.items():
1384 info_dict.setdefault(key, value)
1385
1386 def extract_info(self, url, download=True, ie_key=None, extra_info=None,
1387 process=True, force_generic_extractor=False):
1388 """
1389 Return a list with a dictionary for each video extracted.
1390
1391 Arguments:
1392 url -- URL to extract
1393
1394 Keyword arguments:
1395 download -- whether to download videos during extraction
1396 ie_key -- extractor key hint
1397 extra_info -- dictionary containing the extra values to add to each result
1398 process -- whether to resolve all unresolved references (URLs, playlist items),
1399 must be True for download to work.
1400 force_generic_extractor -- force using the generic extractor
1401 """
1402
1403 if extra_info is None:
1404 extra_info = {}
1405
1406 if not ie_key and force_generic_extractor:
1407 ie_key = 'Generic'
1408
1409 if ie_key:
1410 ies = {ie_key: self._get_info_extractor_class(ie_key)}
1411 else:
1412 ies = self._ies
1413
1414 for ie_key, ie in ies.items():
1415 if not ie.suitable(url):
1416 continue
1417
1418 if not ie.working():
1419 self.report_warning('The program functionality for this site has been marked as broken, '
1420 'and will probably not work.')
1421
1422 temp_id = ie.get_temp_id(url)
1423 if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
1424 self.to_screen(f'[{ie_key}] {temp_id}: has already been recorded in the archive')
1425 if self.params.get('break_on_existing', False):
1426 raise ExistingVideoReached()
1427 break
1428 return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process)
1429 else:
1430 self.report_error('no suitable InfoExtractor for URL %s' % url)
1431
1432 def _handle_extraction_exceptions(func):
1433 @functools.wraps(func)
1434 def wrapper(self, *args, **kwargs):
1435 while True:
1436 try:
1437 return func(self, *args, **kwargs)
1438 except (DownloadCancelled, LazyList.IndexError, PagedList.IndexError):
1439 raise
1440 except ReExtractInfo as e:
1441 if e.expected:
1442 self.to_screen(f'{e}; Re-extracting data')
1443 else:
1444 self.to_stderr('\r')
1445 self.report_warning(f'{e}; Re-extracting data')
1446 continue
1447 except GeoRestrictedError as e:
1448 msg = e.msg
1449 if e.countries:
1450 msg += '\nThis video is available in %s.' % ', '.join(
1451 map(ISO3166Utils.short2full, e.countries))
1452 msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
1453 self.report_error(msg)
1454 except ExtractorError as e: # An error we somewhat expected
1455 self.report_error(str(e), e.format_traceback())
1456 except Exception as e:
1457 if self.params.get('ignoreerrors'):
1458 self.report_error(str(e), tb=encode_compat_str(traceback.format_exc()))
1459 else:
1460 raise
1461 break
1462 return wrapper
1463
1464 def _wait_for_video(self, ie_result={}):
1465 if (not self.params.get('wait_for_video')
1466 or ie_result.get('_type', 'video') != 'video'
1467 or ie_result.get('formats') or ie_result.get('url')):
1468 return
1469
1470 format_dur = lambda dur: '%02d:%02d:%02d' % timetuple_from_msec(dur * 1000)[:-1]
1471 last_msg = ''
1472
1473 def progress(msg):
1474 nonlocal last_msg
1475 full_msg = f'{msg}\n'
1476 if not self.params.get('noprogress'):
1477 full_msg = msg + ' ' * (len(last_msg) - len(msg)) + '\r'
1478 elif last_msg:
1479 return
1480 self.to_screen(full_msg, skip_eol=True)
1481 last_msg = msg
1482
1483 min_wait, max_wait = self.params.get('wait_for_video')
1484 diff = try_get(ie_result, lambda x: x['release_timestamp'] - time.time())
1485 if diff is None and ie_result.get('live_status') == 'is_upcoming':
1486 diff = round(random.uniform(min_wait, max_wait) if (max_wait and min_wait) else (max_wait or min_wait), 0)
1487 self.report_warning('Release time of video is not known')
1488 elif ie_result and (diff or 0) <= 0:
1489 self.report_warning('Video should already be available according to extracted info')
1490 diff = min(max(diff or 0, min_wait or 0), max_wait or float('inf'))
1491 self.to_screen(f'[wait] Waiting for {format_dur(diff)} - Press Ctrl+C to try now')
1492
1493 wait_till = time.time() + diff
1494 try:
1495 while True:
1496 diff = wait_till - time.time()
1497 if diff <= 0:
1498 progress('')
1499 raise ReExtractInfo('[wait] Wait period ended', expected=True)
1500 progress(f'[wait] Remaining time until next attempt: {self._format_screen(format_dur(diff), self.Styles.EMPHASIS)}')
1501 time.sleep(1)
1502 except KeyboardInterrupt:
1503 progress('')
1504 raise ReExtractInfo('[wait] Interrupted by user', expected=True)
1505 except BaseException as e:
1506 if not isinstance(e, ReExtractInfo):
1507 self.to_screen('')
1508 raise
1509
1510 @_handle_extraction_exceptions
1511 def __extract_info(self, url, ie, download, extra_info, process):
1512 try:
1513 ie_result = ie.extract(url)
1514 except UserNotLive as e:
1515 if process:
1516 if self.params.get('wait_for_video'):
1517 self.report_warning(e)
1518 self._wait_for_video()
1519 raise
1520 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
1521 self.report_warning(f'Extractor {ie.IE_NAME} returned nothing{bug_reports_message()}')
1522 return
1523 if isinstance(ie_result, list):
1524 # Backwards compatibility: old IE result format
1525 ie_result = {
1526 '_type': 'compat_list',
1527 'entries': ie_result,
1528 }
1529 if extra_info.get('original_url'):
1530 ie_result.setdefault('original_url', extra_info['original_url'])
1531 self.add_default_extra_info(ie_result, ie, url)
1532 if process:
1533 self._wait_for_video(ie_result)
1534 return self.process_ie_result(ie_result, download, extra_info)
1535 else:
1536 return ie_result
1537
1538 def add_default_extra_info(self, ie_result, ie, url):
1539 if url is not None:
1540 self.add_extra_info(ie_result, {
1541 'webpage_url': url,
1542 'original_url': url,
1543 })
1544 webpage_url = ie_result.get('webpage_url')
1545 if webpage_url:
1546 self.add_extra_info(ie_result, {
1547 'webpage_url_basename': url_basename(webpage_url),
1548 'webpage_url_domain': get_domain(webpage_url),
1549 })
1550 if ie is not None:
1551 self.add_extra_info(ie_result, {
1552 'extractor': ie.IE_NAME,
1553 'extractor_key': ie.ie_key(),
1554 })
1555
1556 def process_ie_result(self, ie_result, download=True, extra_info=None):
1557 """
1558 Take the result of the ie(may be modified) and resolve all unresolved
1559 references (URLs, playlist items).
1560
1561 It will also download the videos if 'download'.
1562 Returns the resolved ie_result.
1563 """
1564 if extra_info is None:
1565 extra_info = {}
1566 result_type = ie_result.get('_type', 'video')
1567
1568 if result_type in ('url', 'url_transparent'):
1569 ie_result['url'] = sanitize_url(
1570 ie_result['url'], scheme='http' if self.params.get('prefer_insecure') else 'https')
1571 if ie_result.get('original_url'):
1572 extra_info.setdefault('original_url', ie_result['original_url'])
1573
1574 extract_flat = self.params.get('extract_flat', False)
1575 if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
1576 or extract_flat is True):
1577 info_copy = ie_result.copy()
1578 ie = try_get(ie_result.get('ie_key'), self.get_info_extractor)
1579 if ie and not ie_result.get('id'):
1580 info_copy['id'] = ie.get_temp_id(ie_result['url'])
1581 self.add_default_extra_info(info_copy, ie, ie_result['url'])
1582 self.add_extra_info(info_copy, extra_info)
1583 info_copy, _ = self.pre_process(info_copy)
1584 self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True)
1585 self._raise_pending_errors(info_copy)
1586 if self.params.get('force_write_download_archive', False):
1587 self.record_download_archive(info_copy)
1588 return ie_result
1589
1590 if result_type == 'video':
1591 self.add_extra_info(ie_result, extra_info)
1592 ie_result = self.process_video_result(ie_result, download=download)
1593 self._raise_pending_errors(ie_result)
1594 additional_urls = (ie_result or {}).get('additional_urls')
1595 if additional_urls:
1596 # TODO: Improve MetadataParserPP to allow setting a list
1597 if isinstance(additional_urls, str):
1598 additional_urls = [additional_urls]
1599 self.to_screen(
1600 '[info] %s: %d additional URL(s) requested' % (ie_result['id'], len(additional_urls)))
1601 self.write_debug('Additional URLs: "%s"' % '", "'.join(additional_urls))
1602 ie_result['additional_entries'] = [
1603 self.extract_info(
1604 url, download, extra_info=extra_info,
1605 force_generic_extractor=self.params.get('force_generic_extractor'))
1606 for url in additional_urls
1607 ]
1608 return ie_result
1609 elif result_type == 'url':
1610 # We have to add extra_info to the results because it may be
1611 # contained in a playlist
1612 return self.extract_info(
1613 ie_result['url'], download,
1614 ie_key=ie_result.get('ie_key'),
1615 extra_info=extra_info)
1616 elif result_type == 'url_transparent':
1617 # Use the information from the embedding page
1618 info = self.extract_info(
1619 ie_result['url'], ie_key=ie_result.get('ie_key'),
1620 extra_info=extra_info, download=False, process=False)
1621
1622 # extract_info may return None when ignoreerrors is enabled and
1623 # extraction failed with an error, don't crash and return early
1624 # in this case
1625 if not info:
1626 return info
1627
1628 exempted_fields = {'_type', 'url', 'ie_key'}
1629 if not ie_result.get('section_end') and ie_result.get('section_start') is None:
1630 # For video clips, the id etc of the clip extractor should be used
1631 exempted_fields |= {'id', 'extractor', 'extractor_key'}
1632
1633 new_result = info.copy()
1634 new_result.update(filter_dict(ie_result, lambda k, v: v is not None and k not in exempted_fields))
1635
1636 # Extracted info may not be a video result (i.e.
1637 # info.get('_type', 'video') != video) but rather an url or
1638 # url_transparent. In such cases outer metadata (from ie_result)
1639 # should be propagated to inner one (info). For this to happen
1640 # _type of info should be overridden with url_transparent. This
1641 # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
1642 if new_result.get('_type') == 'url':
1643 new_result['_type'] = 'url_transparent'
1644
1645 return self.process_ie_result(
1646 new_result, download=download, extra_info=extra_info)
1647 elif result_type in ('playlist', 'multi_video'):
1648 # Protect from infinite recursion due to recursively nested playlists
1649 # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
1650 webpage_url = ie_result['webpage_url']
1651 if webpage_url in self._playlist_urls:
1652 self.to_screen(
1653 '[download] Skipping already downloaded playlist: %s'
1654 % ie_result.get('title') or ie_result.get('id'))
1655 return
1656
1657 self._playlist_level += 1
1658 self._playlist_urls.add(webpage_url)
1659 self._fill_common_fields(ie_result, False)
1660 self._sanitize_thumbnails(ie_result)
1661 try:
1662 return self.__process_playlist(ie_result, download)
1663 finally:
1664 self._playlist_level -= 1
1665 if not self._playlist_level:
1666 self._playlist_urls.clear()
1667 elif result_type == 'compat_list':
1668 self.report_warning(
1669 'Extractor %s returned a compat_list result. '
1670 'It needs to be updated.' % ie_result.get('extractor'))
1671
1672 def _fixup(r):
1673 self.add_extra_info(r, {
1674 'extractor': ie_result['extractor'],
1675 'webpage_url': ie_result['webpage_url'],
1676 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1677 'webpage_url_domain': get_domain(ie_result['webpage_url']),
1678 'extractor_key': ie_result['extractor_key'],
1679 })
1680 return r
1681 ie_result['entries'] = [
1682 self.process_ie_result(_fixup(r), download, extra_info)
1683 for r in ie_result['entries']
1684 ]
1685 return ie_result
1686 else:
1687 raise Exception('Invalid result type: %s' % result_type)
1688
1689 def _ensure_dir_exists(self, path):
1690 return make_dir(path, self.report_error)
1691
1692 @staticmethod
1693 def _playlist_infodict(ie_result, strict=False, **kwargs):
1694 info = {
1695 'playlist_count': ie_result.get('playlist_count'),
1696 'playlist': ie_result.get('title') or ie_result.get('id'),
1697 'playlist_id': ie_result.get('id'),
1698 'playlist_title': ie_result.get('title'),
1699 'playlist_uploader': ie_result.get('uploader'),
1700 'playlist_uploader_id': ie_result.get('uploader_id'),
1701 **kwargs,
1702 }
1703 if strict:
1704 return info
1705 return {
1706 **info,
1707 'playlist_index': 0,
1708 '__last_playlist_index': max(ie_result['requested_entries'] or (0, 0)),
1709 'extractor': ie_result['extractor'],
1710 'webpage_url': ie_result['webpage_url'],
1711 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1712 'webpage_url_domain': get_domain(ie_result['webpage_url']),
1713 'extractor_key': ie_result['extractor_key'],
1714 }
1715
1716 def __process_playlist(self, ie_result, download):
1717 """Process each entry in the playlist"""
1718 assert ie_result['_type'] in ('playlist', 'multi_video')
1719
1720 common_info = self._playlist_infodict(ie_result, strict=True)
1721 title = common_info.get('playlist') or '<Untitled>'
1722 if self._match_entry(common_info, incomplete=True) is not None:
1723 return
1724 self.to_screen(f'[download] Downloading {ie_result["_type"]}: {title}')
1725
1726 all_entries = PlaylistEntries(self, ie_result)
1727 entries = orderedSet(all_entries.get_requested_items(), lazy=True)
1728
1729 lazy = self.params.get('lazy_playlist')
1730 if lazy:
1731 resolved_entries, n_entries = [], 'N/A'
1732 ie_result['requested_entries'], ie_result['entries'] = None, None
1733 else:
1734 entries = resolved_entries = list(entries)
1735 n_entries = len(resolved_entries)
1736 ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries)) or ([], [])
1737 if not ie_result.get('playlist_count'):
1738 # Better to do this after potentially exhausting entries
1739 ie_result['playlist_count'] = all_entries.get_full_count()
1740
1741 ie_copy = collections.ChainMap(
1742 ie_result, self._playlist_infodict(ie_result, n_entries=int_or_none(n_entries)))
1743
1744 _infojson_written = False
1745 write_playlist_files = self.params.get('allow_playlist_files', True)
1746 if write_playlist_files and self.params.get('list_thumbnails'):
1747 self.list_thumbnails(ie_result)
1748 if write_playlist_files and not self.params.get('simulate'):
1749 _infojson_written = self._write_info_json(
1750 'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson'))
1751 if _infojson_written is None:
1752 return
1753 if self._write_description('playlist', ie_result,
1754 self.prepare_filename(ie_copy, 'pl_description')) is None:
1755 return
1756 # TODO: This should be passed to ThumbnailsConvertor if necessary
1757 self._write_thumbnails('playlist', ie_result, self.prepare_filename(ie_copy, 'pl_thumbnail'))
1758
1759 if lazy:
1760 if self.params.get('playlistreverse') or self.params.get('playlistrandom'):
1761 self.report_warning('playlistreverse and playlistrandom are not supported with lazy_playlist', only_once=True)
1762 elif self.params.get('playlistreverse'):
1763 entries.reverse()
1764 elif self.params.get('playlistrandom'):
1765 random.shuffle(entries)
1766
1767 self.to_screen(f'[{ie_result["extractor"]}] Playlist {title}: Downloading {n_entries} videos'
1768 f'{format_field(ie_result, "playlist_count", " of %s")}')
1769
1770 keep_resolved_entries = self.params.get('extract_flat') != 'discard'
1771 if self.params.get('extract_flat') == 'discard_in_playlist':
1772 keep_resolved_entries = ie_result['_type'] != 'playlist'
1773 if keep_resolved_entries:
1774 self.write_debug('The information of all playlist entries will be held in memory')
1775
1776 failures = 0
1777 max_failures = self.params.get('skip_playlist_after_errors') or float('inf')
1778 for i, (playlist_index, entry) in enumerate(entries):
1779 if lazy:
1780 resolved_entries.append((playlist_index, entry))
1781 if not entry:
1782 continue
1783
1784 entry['__x_forwarded_for_ip'] = ie_result.get('__x_forwarded_for_ip')
1785 if not lazy and 'playlist-index' in self.params.get('compat_opts', []):
1786 playlist_index = ie_result['requested_entries'][i]
1787
1788 extra = {
1789 **common_info,
1790 'n_entries': int_or_none(n_entries),
1791 'playlist_index': playlist_index,
1792 'playlist_autonumber': i + 1,
1793 }
1794
1795 if self._match_entry(collections.ChainMap(entry, extra), incomplete=True) is not None:
1796 continue
1797
1798 self.to_screen('[download] Downloading video %s of %s' % (
1799 self._format_screen(i + 1, self.Styles.ID), self._format_screen(n_entries, self.Styles.EMPHASIS)))
1800
1801 entry_result = self.__process_iterable_entry(entry, download, extra)
1802 if not entry_result:
1803 failures += 1
1804 if failures >= max_failures:
1805 self.report_error(
1806 f'Skipping the remaining entries in playlist "{title}" since {failures} items failed extraction')
1807 break
1808 if keep_resolved_entries:
1809 resolved_entries[i] = (playlist_index, entry_result)
1810
1811 # Update with processed data
1812 ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries)) or ([], [])
1813
1814 # Write the updated info to json
1815 if _infojson_written is True and self._write_info_json(
1816 'updated playlist', ie_result,
1817 self.prepare_filename(ie_copy, 'pl_infojson'), overwrite=True) is None:
1818 return
1819
1820 ie_result = self.run_all_pps('playlist', ie_result)
1821 self.to_screen(f'[download] Finished downloading playlist: {title}')
1822 return ie_result
1823
1824 @_handle_extraction_exceptions
1825 def __process_iterable_entry(self, entry, download, extra_info):
1826 return self.process_ie_result(
1827 entry, download=download, extra_info=extra_info)
1828
1829 def _build_format_filter(self, filter_spec):
1830 " Returns a function to filter the formats according to the filter_spec "
1831
1832 OPERATORS = {
1833 '<': operator.lt,
1834 '<=': operator.le,
1835 '>': operator.gt,
1836 '>=': operator.ge,
1837 '=': operator.eq,
1838 '!=': operator.ne,
1839 }
1840 operator_rex = re.compile(r'''(?x)\s*
1841 (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)\s*
1842 (?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1843 (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s*
1844 ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1845 m = operator_rex.fullmatch(filter_spec)
1846 if m:
1847 try:
1848 comparison_value = int(m.group('value'))
1849 except ValueError:
1850 comparison_value = parse_filesize(m.group('value'))
1851 if comparison_value is None:
1852 comparison_value = parse_filesize(m.group('value') + 'B')
1853 if comparison_value is None:
1854 raise ValueError(
1855 'Invalid value %r in format specification %r' % (
1856 m.group('value'), filter_spec))
1857 op = OPERATORS[m.group('op')]
1858
1859 if not m:
1860 STR_OPERATORS = {
1861 '=': operator.eq,
1862 '^=': lambda attr, value: attr.startswith(value),
1863 '$=': lambda attr, value: attr.endswith(value),
1864 '*=': lambda attr, value: value in attr,
1865 '~=': lambda attr, value: value.search(attr) is not None
1866 }
1867 str_operator_rex = re.compile(r'''(?x)\s*
1868 (?P<key>[a-zA-Z0-9._-]+)\s*
1869 (?P<negation>!\s*)?(?P<op>%s)\s*(?P<none_inclusive>\?\s*)?
1870 (?P<quote>["'])?
1871 (?P<value>(?(quote)(?:(?!(?P=quote))[^\\]|\\.)+|[\w.-]+))
1872 (?(quote)(?P=quote))\s*
1873 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1874 m = str_operator_rex.fullmatch(filter_spec)
1875 if m:
1876 if m.group('op') == '~=':
1877 comparison_value = re.compile(m.group('value'))
1878 else:
1879 comparison_value = re.sub(r'''\\([\\"'])''', r'\1', m.group('value'))
1880 str_op = STR_OPERATORS[m.group('op')]
1881 if m.group('negation'):
1882 op = lambda attr, value: not str_op(attr, value)
1883 else:
1884 op = str_op
1885
1886 if not m:
1887 raise SyntaxError('Invalid filter specification %r' % filter_spec)
1888
1889 def _filter(f):
1890 actual_value = f.get(m.group('key'))
1891 if actual_value is None:
1892 return m.group('none_inclusive')
1893 return op(actual_value, comparison_value)
1894 return _filter
1895
1896 def _check_formats(self, formats):
1897 for f in formats:
1898 self.to_screen('[info] Testing format %s' % f['format_id'])
1899 path = self.get_output_path('temp')
1900 if not self._ensure_dir_exists(f'{path}/'):
1901 continue
1902 temp_file = tempfile.NamedTemporaryFile(suffix='.tmp', delete=False, dir=path or None)
1903 temp_file.close()
1904 try:
1905 success, _ = self.dl(temp_file.name, f, test=True)
1906 except (DownloadError, OSError, ValueError) + network_exceptions:
1907 success = False
1908 finally:
1909 if os.path.exists(temp_file.name):
1910 try:
1911 os.remove(temp_file.name)
1912 except OSError:
1913 self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
1914 if success:
1915 yield f
1916 else:
1917 self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])
1918
1919 def _default_format_spec(self, info_dict, download=True):
1920
1921 def can_merge():
1922 merger = FFmpegMergerPP(self)
1923 return merger.available and merger.can_merge()
1924
1925 prefer_best = (
1926 not self.params.get('simulate')
1927 and download
1928 and (
1929 not can_merge()
1930 or info_dict.get('is_live') and not self.params.get('live_from_start')
1931 or self.params['outtmpl']['default'] == '-'))
1932 compat = (
1933 prefer_best
1934 or self.params.get('allow_multiple_audio_streams', False)
1935 or 'format-spec' in self.params['compat_opts'])
1936
1937 return (
1938 'best/bestvideo+bestaudio' if prefer_best
1939 else 'bestvideo*+bestaudio/best' if not compat
1940 else 'bestvideo+bestaudio/best')
1941
1942 def build_format_selector(self, format_spec):
1943 def syntax_error(note, start):
1944 message = (
1945 'Invalid format specification: '
1946 '{}\n\t{}\n\t{}^'.format(note, format_spec, ' ' * start[1]))
1947 return SyntaxError(message)
1948
1949 PICKFIRST = 'PICKFIRST'
1950 MERGE = 'MERGE'
1951 SINGLE = 'SINGLE'
1952 GROUP = 'GROUP'
1953 FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
1954
1955 allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False),
1956 'video': self.params.get('allow_multiple_video_streams', False)}
1957
1958 check_formats = self.params.get('check_formats') == 'selected'
1959
1960 def _parse_filter(tokens):
1961 filter_parts = []
1962 for type, string, start, _, _ in tokens:
1963 if type == tokenize.OP and string == ']':
1964 return ''.join(filter_parts)
1965 else:
1966 filter_parts.append(string)
1967
1968 def _remove_unused_ops(tokens):
1969 # Remove operators that we don't use and join them with the surrounding strings
1970 # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1971 ALLOWED_OPS = ('/', '+', ',', '(', ')')
1972 last_string, last_start, last_end, last_line = None, None, None, None
1973 for type, string, start, end, line in tokens:
1974 if type == tokenize.OP and string == '[':
1975 if last_string:
1976 yield tokenize.NAME, last_string, last_start, last_end, last_line
1977 last_string = None
1978 yield type, string, start, end, line
1979 # everything inside brackets will be handled by _parse_filter
1980 for type, string, start, end, line in tokens:
1981 yield type, string, start, end, line
1982 if type == tokenize.OP and string == ']':
1983 break
1984 elif type == tokenize.OP and string in ALLOWED_OPS:
1985 if last_string:
1986 yield tokenize.NAME, last_string, last_start, last_end, last_line
1987 last_string = None
1988 yield type, string, start, end, line
1989 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1990 if not last_string:
1991 last_string = string
1992 last_start = start
1993 last_end = end
1994 else:
1995 last_string += string
1996 if last_string:
1997 yield tokenize.NAME, last_string, last_start, last_end, last_line
1998
1999 def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
2000 selectors = []
2001 current_selector = None
2002 for type, string, start, _, _ in tokens:
2003 # ENCODING is only defined in python 3.x
2004 if type == getattr(tokenize, 'ENCODING', None):
2005 continue
2006 elif type in [tokenize.NAME, tokenize.NUMBER]:
2007 current_selector = FormatSelector(SINGLE, string, [])
2008 elif type == tokenize.OP:
2009 if string == ')':
2010 if not inside_group:
2011 # ')' will be handled by the parentheses group
2012 tokens.restore_last_token()
2013 break
2014 elif inside_merge and string in ['/', ',']:
2015 tokens.restore_last_token()
2016 break
2017 elif inside_choice and string == ',':
2018 tokens.restore_last_token()
2019 break
2020 elif string == ',':
2021 if not current_selector:
2022 raise syntax_error('"," must follow a format selector', start)
2023 selectors.append(current_selector)
2024 current_selector = None
2025 elif string == '/':
2026 if not current_selector:
2027 raise syntax_error('"/" must follow a format selector', start)
2028 first_choice = current_selector
2029 second_choice = _parse_format_selection(tokens, inside_choice=True)
2030 current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
2031 elif string == '[':
2032 if not current_selector:
2033 current_selector = FormatSelector(SINGLE, 'best', [])
2034 format_filter = _parse_filter(tokens)
2035 current_selector.filters.append(format_filter)
2036 elif string == '(':
2037 if current_selector:
2038 raise syntax_error('Unexpected "("', start)
2039 group = _parse_format_selection(tokens, inside_group=True)
2040 current_selector = FormatSelector(GROUP, group, [])
2041 elif string == '+':
2042 if not current_selector:
2043 raise syntax_error('Unexpected "+"', start)
2044 selector_1 = current_selector
2045 selector_2 = _parse_format_selection(tokens, inside_merge=True)
2046 if not selector_2:
2047 raise syntax_error('Expected a selector', start)
2048 current_selector = FormatSelector(MERGE, (selector_1, selector_2), [])
2049 else:
2050 raise syntax_error(f'Operator not recognized: "{string}"', start)
2051 elif type == tokenize.ENDMARKER:
2052 break
2053 if current_selector:
2054 selectors.append(current_selector)
2055 return selectors
2056
2057 def _merge(formats_pair):
2058 format_1, format_2 = formats_pair
2059
2060 formats_info = []
2061 formats_info.extend(format_1.get('requested_formats', (format_1,)))
2062 formats_info.extend(format_2.get('requested_formats', (format_2,)))
2063
2064 if not allow_multiple_streams['video'] or not allow_multiple_streams['audio']:
2065 get_no_more = {'video': False, 'audio': False}
2066 for (i, fmt_info) in enumerate(formats_info):
2067 if fmt_info.get('acodec') == fmt_info.get('vcodec') == 'none':
2068 formats_info.pop(i)
2069 continue
2070 for aud_vid in ['audio', 'video']:
2071 if not allow_multiple_streams[aud_vid] and fmt_info.get(aud_vid[0] + 'codec') != 'none':
2072 if get_no_more[aud_vid]:
2073 formats_info.pop(i)
2074 break
2075 get_no_more[aud_vid] = True
2076
2077 if len(formats_info) == 1:
2078 return formats_info[0]
2079
2080 video_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('vcodec') != 'none']
2081 audio_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('acodec') != 'none']
2082
2083 the_only_video = video_fmts[0] if len(video_fmts) == 1 else None
2084 the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None
2085
2086 output_ext = self.params.get('merge_output_format')
2087 if not output_ext:
2088 if the_only_video:
2089 output_ext = the_only_video['ext']
2090 elif the_only_audio and not video_fmts:
2091 output_ext = the_only_audio['ext']
2092 else:
2093 output_ext = 'mkv'
2094
2095 filtered = lambda *keys: filter(None, (traverse_obj(fmt, *keys) for fmt in formats_info))
2096
2097 new_dict = {
2098 'requested_formats': formats_info,
2099 'format': '+'.join(filtered('format')),
2100 'format_id': '+'.join(filtered('format_id')),
2101 'ext': output_ext,
2102 'protocol': '+'.join(map(determine_protocol, formats_info)),
2103 'language': '+'.join(orderedSet(filtered('language'))) or None,
2104 'format_note': '+'.join(orderedSet(filtered('format_note'))) or None,
2105 'filesize_approx': sum(filtered('filesize', 'filesize_approx')) or None,
2106 'tbr': sum(filtered('tbr', 'vbr', 'abr')),
2107 }
2108
2109 if the_only_video:
2110 new_dict.update({
2111 'width': the_only_video.get('width'),
2112 'height': the_only_video.get('height'),
2113 'resolution': the_only_video.get('resolution') or self.format_resolution(the_only_video),
2114 'fps': the_only_video.get('fps'),
2115 'dynamic_range': the_only_video.get('dynamic_range'),
2116 'vcodec': the_only_video.get('vcodec'),
2117 'vbr': the_only_video.get('vbr'),
2118 'stretched_ratio': the_only_video.get('stretched_ratio'),
2119 })
2120
2121 if the_only_audio:
2122 new_dict.update({
2123 'acodec': the_only_audio.get('acodec'),
2124 'abr': the_only_audio.get('abr'),
2125 'asr': the_only_audio.get('asr'),
2126 })
2127
2128 return new_dict
2129
2130 def _check_formats(formats):
2131 if not check_formats:
2132 yield from formats
2133 return
2134 yield from self._check_formats(formats)
2135
2136 def _build_selector_function(selector):
2137 if isinstance(selector, list): # ,
2138 fs = [_build_selector_function(s) for s in selector]
2139
2140 def selector_function(ctx):
2141 for f in fs:
2142 yield from f(ctx)
2143 return selector_function
2144
2145 elif selector.type == GROUP: # ()
2146 selector_function = _build_selector_function(selector.selector)
2147
2148 elif selector.type == PICKFIRST: # /
2149 fs = [_build_selector_function(s) for s in selector.selector]
2150
2151 def selector_function(ctx):
2152 for f in fs:
2153 picked_formats = list(f(ctx))
2154 if picked_formats:
2155 return picked_formats
2156 return []
2157
2158 elif selector.type == MERGE: # +
2159 selector_1, selector_2 = map(_build_selector_function, selector.selector)
2160
2161 def selector_function(ctx):
2162 for pair in itertools.product(selector_1(ctx), selector_2(ctx)):
2163 yield _merge(pair)
2164
2165 elif selector.type == SINGLE: # atom
2166 format_spec = selector.selector or 'best'
2167
2168 # TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector
2169 if format_spec == 'all':
2170 def selector_function(ctx):
2171 yield from _check_formats(ctx['formats'][::-1])
2172 elif format_spec == 'mergeall':
2173 def selector_function(ctx):
2174 formats = list(_check_formats(
2175 f for f in ctx['formats'] if f.get('vcodec') != 'none' or f.get('acodec') != 'none'))
2176 if not formats:
2177 return
2178 merged_format = formats[-1]
2179 for f in formats[-2::-1]:
2180 merged_format = _merge((merged_format, f))
2181 yield merged_format
2182
2183 else:
2184 format_fallback, seperate_fallback, format_reverse, format_idx = False, None, True, 1
2185 mobj = re.match(
2186 r'(?P<bw>best|worst|b|w)(?P<type>video|audio|v|a)?(?P<mod>\*)?(?:\.(?P<n>[1-9]\d*))?$',
2187 format_spec)
2188 if mobj is not None:
2189 format_idx = int_or_none(mobj.group('n'), default=1)
2190 format_reverse = mobj.group('bw')[0] == 'b'
2191 format_type = (mobj.group('type') or [None])[0]
2192 not_format_type = {'v': 'a', 'a': 'v'}.get(format_type)
2193 format_modified = mobj.group('mod') is not None
2194
2195 format_fallback = not format_type and not format_modified # for b, w
2196 _filter_f = (
2197 (lambda f: f.get('%scodec' % format_type) != 'none')
2198 if format_type and format_modified # bv*, ba*, wv*, wa*
2199 else (lambda f: f.get('%scodec' % not_format_type) == 'none')
2200 if format_type # bv, ba, wv, wa
2201 else (lambda f: f.get('vcodec') != 'none' and f.get('acodec') != 'none')
2202 if not format_modified # b, w
2203 else lambda f: True) # b*, w*
2204 filter_f = lambda f: _filter_f(f) and (
2205 f.get('vcodec') != 'none' or f.get('acodec') != 'none')
2206 else:
2207 if format_spec in self._format_selection_exts['audio']:
2208 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none'
2209 elif format_spec in self._format_selection_exts['video']:
2210 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none' and f.get('vcodec') != 'none'
2211 seperate_fallback = lambda f: f.get('ext') == format_spec and f.get('vcodec') != 'none'
2212 elif format_spec in self._format_selection_exts['storyboards']:
2213 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') == 'none' and f.get('vcodec') == 'none'
2214 else:
2215 filter_f = lambda f: f.get('format_id') == format_spec # id
2216
2217 def selector_function(ctx):
2218 formats = list(ctx['formats'])
2219 matches = list(filter(filter_f, formats)) if filter_f is not None else formats
2220 if not matches:
2221 if format_fallback and ctx['incomplete_formats']:
2222 # for extractors with incomplete formats (audio only (soundcloud)
2223 # or video only (imgur)) best/worst will fallback to
2224 # best/worst {video,audio}-only format
2225 matches = formats
2226 elif seperate_fallback and not ctx['has_merged_format']:
2227 # for compatibility with youtube-dl when there is no pre-merged format
2228 matches = list(filter(seperate_fallback, formats))
2229 matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1]))
2230 try:
2231 yield matches[format_idx - 1]
2232 except LazyList.IndexError:
2233 return
2234
2235 filters = [self._build_format_filter(f) for f in selector.filters]
2236
2237 def final_selector(ctx):
2238 ctx_copy = dict(ctx)
2239 for _filter in filters:
2240 ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
2241 return selector_function(ctx_copy)
2242 return final_selector
2243
2244 stream = io.BytesIO(format_spec.encode())
2245 try:
2246 tokens = list(_remove_unused_ops(tokenize.tokenize(stream.readline)))
2247 except tokenize.TokenError:
2248 raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
2249
2250 class TokenIterator:
2251 def __init__(self, tokens):
2252 self.tokens = tokens
2253 self.counter = 0
2254
2255 def __iter__(self):
2256 return self
2257
2258 def __next__(self):
2259 if self.counter >= len(self.tokens):
2260 raise StopIteration()
2261 value = self.tokens[self.counter]
2262 self.counter += 1
2263 return value
2264
2265 next = __next__
2266
2267 def restore_last_token(self):
2268 self.counter -= 1
2269
2270 parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
2271 return _build_selector_function(parsed_selector)
2272
2273 def _calc_headers(self, info_dict):
2274 res = merge_headers(self.params['http_headers'], info_dict.get('http_headers') or {})
2275
2276 cookies = self._calc_cookies(info_dict['url'])
2277 if cookies:
2278 res['Cookie'] = cookies
2279
2280 if 'X-Forwarded-For' not in res:
2281 x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
2282 if x_forwarded_for_ip:
2283 res['X-Forwarded-For'] = x_forwarded_for_ip
2284
2285 return res
2286
2287 def _calc_cookies(self, url):
2288 pr = sanitized_Request(url)
2289 self.cookiejar.add_cookie_header(pr)
2290 return pr.get_header('Cookie')
2291
2292 def _sort_thumbnails(self, thumbnails):
2293 thumbnails.sort(key=lambda t: (
2294 t.get('preference') if t.get('preference') is not None else -1,
2295 t.get('width') if t.get('width') is not None else -1,
2296 t.get('height') if t.get('height') is not None else -1,
2297 t.get('id') if t.get('id') is not None else '',
2298 t.get('url')))
2299
2300 def _sanitize_thumbnails(self, info_dict):
2301 thumbnails = info_dict.get('thumbnails')
2302 if thumbnails is None:
2303 thumbnail = info_dict.get('thumbnail')
2304 if thumbnail:
2305 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
2306 if not thumbnails:
2307 return
2308
2309 def check_thumbnails(thumbnails):
2310 for t in thumbnails:
2311 self.to_screen(f'[info] Testing thumbnail {t["id"]}')
2312 try:
2313 self.urlopen(HEADRequest(t['url']))
2314 except network_exceptions as err:
2315 self.to_screen(f'[info] Unable to connect to thumbnail {t["id"]} URL {t["url"]!r} - {err}. Skipping...')
2316 continue
2317 yield t
2318
2319 self._sort_thumbnails(thumbnails)
2320 for i, t in enumerate(thumbnails):
2321 if t.get('id') is None:
2322 t['id'] = '%d' % i
2323 if t.get('width') and t.get('height'):
2324 t['resolution'] = '%dx%d' % (t['width'], t['height'])
2325 t['url'] = sanitize_url(t['url'])
2326
2327 if self.params.get('check_formats') is True:
2328 info_dict['thumbnails'] = LazyList(check_thumbnails(thumbnails[::-1]), reverse=True)
2329 else:
2330 info_dict['thumbnails'] = thumbnails
2331
2332 def _fill_common_fields(self, info_dict, is_video=True):
2333 # TODO: move sanitization here
2334 if is_video:
2335 # playlists are allowed to lack "title"
2336 title = info_dict.get('title', NO_DEFAULT)
2337 if title is NO_DEFAULT:
2338 raise ExtractorError('Missing "title" field in extractor result',
2339 video_id=info_dict['id'], ie=info_dict['extractor'])
2340 info_dict['fulltitle'] = title
2341 if not title:
2342 if title == '':
2343 self.write_debug('Extractor gave empty title. Creating a generic title')
2344 else:
2345 self.report_warning('Extractor failed to obtain "title". Creating a generic title instead')
2346 info_dict['title'] = f'{info_dict["extractor"].replace(":", "-")} video #{info_dict["id"]}'
2347
2348 if info_dict.get('duration') is not None:
2349 info_dict['duration_string'] = formatSeconds(info_dict['duration'])
2350
2351 for ts_key, date_key in (
2352 ('timestamp', 'upload_date'),
2353 ('release_timestamp', 'release_date'),
2354 ('modified_timestamp', 'modified_date'),
2355 ):
2356 if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
2357 # Working around out-of-range timestamp values (e.g. negative ones on Windows,
2358 # see http://bugs.python.org/issue1646728)
2359 with contextlib.suppress(ValueError, OverflowError, OSError):
2360 upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
2361 info_dict[date_key] = upload_date.strftime('%Y%m%d')
2362
2363 live_keys = ('is_live', 'was_live')
2364 live_status = info_dict.get('live_status')
2365 if live_status is None:
2366 for key in live_keys:
2367 if info_dict.get(key) is False:
2368 continue
2369 if info_dict.get(key):
2370 live_status = key
2371 break
2372 if all(info_dict.get(key) is False for key in live_keys):
2373 live_status = 'not_live'
2374 if live_status:
2375 info_dict['live_status'] = live_status
2376 for key in live_keys:
2377 if info_dict.get(key) is None:
2378 info_dict[key] = (live_status == key)
2379
2380 # Auto generate title fields corresponding to the *_number fields when missing
2381 # in order to always have clean titles. This is very common for TV series.
2382 for field in ('chapter', 'season', 'episode'):
2383 if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
2384 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
2385
2386 def _raise_pending_errors(self, info):
2387 err = info.pop('__pending_error', None)
2388 if err:
2389 self.report_error(err, tb=False)
2390
2391 def process_video_result(self, info_dict, download=True):
2392 assert info_dict.get('_type', 'video') == 'video'
2393 self._num_videos += 1
2394
2395 if 'id' not in info_dict:
2396 raise ExtractorError('Missing "id" field in extractor result', ie=info_dict['extractor'])
2397 elif not info_dict.get('id'):
2398 raise ExtractorError('Extractor failed to obtain "id"', ie=info_dict['extractor'])
2399
2400 def report_force_conversion(field, field_not, conversion):
2401 self.report_warning(
2402 '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
2403 % (field, field_not, conversion))
2404
2405 def sanitize_string_field(info, string_field):
2406 field = info.get(string_field)
2407 if field is None or isinstance(field, str):
2408 return
2409 report_force_conversion(string_field, 'a string', 'string')
2410 info[string_field] = str(field)
2411
2412 def sanitize_numeric_fields(info):
2413 for numeric_field in self._NUMERIC_FIELDS:
2414 field = info.get(numeric_field)
2415 if field is None or isinstance(field, (int, float)):
2416 continue
2417 report_force_conversion(numeric_field, 'numeric', 'int')
2418 info[numeric_field] = int_or_none(field)
2419
2420 sanitize_string_field(info_dict, 'id')
2421 sanitize_numeric_fields(info_dict)
2422 if info_dict.get('section_end') and info_dict.get('section_start') is not None:
2423 info_dict['duration'] = round(info_dict['section_end'] - info_dict['section_start'], 3)
2424 if (info_dict.get('duration') or 0) <= 0 and info_dict.pop('duration', None):
2425 self.report_warning('"duration" field is negative, there is an error in extractor')
2426
2427 chapters = info_dict.get('chapters') or []
2428 if chapters and chapters[0].get('start_time'):
2429 chapters.insert(0, {'start_time': 0})
2430
2431 dummy_chapter = {'end_time': 0, 'start_time': info_dict.get('duration')}
2432 for idx, (prev, current, next_) in enumerate(zip(
2433 (dummy_chapter, *chapters), chapters, (*chapters[1:], dummy_chapter)), 1):
2434 if current.get('start_time') is None:
2435 current['start_time'] = prev.get('end_time')
2436 if not current.get('end_time'):
2437 current['end_time'] = next_.get('start_time')
2438 if not current.get('title'):
2439 current['title'] = f'<Untitled Chapter {idx}>'
2440
2441 if 'playlist' not in info_dict:
2442 # It isn't part of a playlist
2443 info_dict['playlist'] = None
2444 info_dict['playlist_index'] = None
2445
2446 self._sanitize_thumbnails(info_dict)
2447
2448 thumbnail = info_dict.get('thumbnail')
2449 thumbnails = info_dict.get('thumbnails')
2450 if thumbnail:
2451 info_dict['thumbnail'] = sanitize_url(thumbnail)
2452 elif thumbnails:
2453 info_dict['thumbnail'] = thumbnails[-1]['url']
2454
2455 if info_dict.get('display_id') is None and 'id' in info_dict:
2456 info_dict['display_id'] = info_dict['id']
2457
2458 self._fill_common_fields(info_dict)
2459
2460 for cc_kind in ('subtitles', 'automatic_captions'):
2461 cc = info_dict.get(cc_kind)
2462 if cc:
2463 for _, subtitle in cc.items():
2464 for subtitle_format in subtitle:
2465 if subtitle_format.get('url'):
2466 subtitle_format['url'] = sanitize_url(subtitle_format['url'])
2467 if subtitle_format.get('ext') is None:
2468 subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
2469
2470 automatic_captions = info_dict.get('automatic_captions')
2471 subtitles = info_dict.get('subtitles')
2472
2473 info_dict['requested_subtitles'] = self.process_subtitles(
2474 info_dict['id'], subtitles, automatic_captions)
2475
2476 if info_dict.get('formats') is None:
2477 # There's only one format available
2478 formats = [info_dict]
2479 else:
2480 formats = info_dict['formats']
2481
2482 # or None ensures --clean-infojson removes it
2483 info_dict['_has_drm'] = any(f.get('has_drm') for f in formats) or None
2484 if not self.params.get('allow_unplayable_formats'):
2485 formats = [f for f in formats if not f.get('has_drm')]
2486 if info_dict['_has_drm'] and all(
2487 f.get('acodec') == f.get('vcodec') == 'none' for f in formats):
2488 self.report_warning(
2489 'This video is DRM protected and only images are available for download. '
2490 'Use --list-formats to see them')
2491
2492 get_from_start = not info_dict.get('is_live') or bool(self.params.get('live_from_start'))
2493 if not get_from_start:
2494 info_dict['title'] += ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
2495 if info_dict.get('is_live') and formats:
2496 formats = [f for f in formats if bool(f.get('is_from_start')) == get_from_start]
2497 if get_from_start and not formats:
2498 self.raise_no_formats(info_dict, msg=(
2499 '--live-from-start is passed, but there are no formats that can be downloaded from the start. '
2500 'If you want to download from the current time, use --no-live-from-start'))
2501
2502 if not formats:
2503 self.raise_no_formats(info_dict)
2504
2505 def is_wellformed(f):
2506 url = f.get('url')
2507 if not url:
2508 self.report_warning(
2509 '"url" field is missing or empty - skipping format, '
2510 'there is an error in extractor')
2511 return False
2512 if isinstance(url, bytes):
2513 sanitize_string_field(f, 'url')
2514 return True
2515
2516 # Filter out malformed formats for better extraction robustness
2517 formats = list(filter(is_wellformed, formats))
2518
2519 formats_dict = {}
2520
2521 # We check that all the formats have the format and format_id fields
2522 for i, format in enumerate(formats):
2523 sanitize_string_field(format, 'format_id')
2524 sanitize_numeric_fields(format)
2525 format['url'] = sanitize_url(format['url'])
2526 if not format.get('format_id'):
2527 format['format_id'] = str(i)
2528 else:
2529 # Sanitize format_id from characters used in format selector expression
2530 format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
2531 format_id = format['format_id']
2532 if format_id not in formats_dict:
2533 formats_dict[format_id] = []
2534 formats_dict[format_id].append(format)
2535
2536 # Make sure all formats have unique format_id
2537 common_exts = set(itertools.chain(*self._format_selection_exts.values()))
2538 for format_id, ambiguous_formats in formats_dict.items():
2539 ambigious_id = len(ambiguous_formats) > 1
2540 for i, format in enumerate(ambiguous_formats):
2541 if ambigious_id:
2542 format['format_id'] = '%s-%d' % (format_id, i)
2543 if format.get('ext') is None:
2544 format['ext'] = determine_ext(format['url']).lower()
2545 # Ensure there is no conflict between id and ext in format selection
2546 # See https://github.com/yt-dlp/yt-dlp/issues/1282
2547 if format['format_id'] != format['ext'] and format['format_id'] in common_exts:
2548 format['format_id'] = 'f%s' % format['format_id']
2549
2550 for i, format in enumerate(formats):
2551 if format.get('format') is None:
2552 format['format'] = '{id} - {res}{note}'.format(
2553 id=format['format_id'],
2554 res=self.format_resolution(format),
2555 note=format_field(format, 'format_note', ' (%s)'),
2556 )
2557 if format.get('protocol') is None:
2558 format['protocol'] = determine_protocol(format)
2559 if format.get('resolution') is None:
2560 format['resolution'] = self.format_resolution(format, default=None)
2561 if format.get('dynamic_range') is None and format.get('vcodec') != 'none':
2562 format['dynamic_range'] = 'SDR'
2563 if (info_dict.get('duration') and format.get('tbr')
2564 and not format.get('filesize') and not format.get('filesize_approx')):
2565 format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8))
2566
2567 # Add HTTP headers, so that external programs can use them from the
2568 # json output
2569 full_format_info = info_dict.copy()
2570 full_format_info.update(format)
2571 format['http_headers'] = self._calc_headers(full_format_info)
2572 # Remove private housekeeping stuff
2573 if '__x_forwarded_for_ip' in info_dict:
2574 del info_dict['__x_forwarded_for_ip']
2575
2576 if self.params.get('check_formats') is True:
2577 formats = LazyList(self._check_formats(formats[::-1]), reverse=True)
2578
2579 if not formats or formats[0] is not info_dict:
2580 # only set the 'formats' fields if the original info_dict list them
2581 # otherwise we end up with a circular reference, the first (and unique)
2582 # element in the 'formats' field in info_dict is info_dict itself,
2583 # which can't be exported to json
2584 info_dict['formats'] = formats
2585
2586 info_dict, _ = self.pre_process(info_dict)
2587
2588 if self._match_entry(info_dict, incomplete=self._format_fields) is not None:
2589 return info_dict
2590
2591 self.post_extract(info_dict)
2592 info_dict, _ = self.pre_process(info_dict, 'after_filter')
2593
2594 # The pre-processors may have modified the formats
2595 formats = info_dict.get('formats', [info_dict])
2596
2597 list_only = self.params.get('simulate') is None and (
2598 self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles'))
2599 interactive_format_selection = not list_only and self.format_selector == '-'
2600 if self.params.get('list_thumbnails'):
2601 self.list_thumbnails(info_dict)
2602 if self.params.get('listsubtitles'):
2603 if 'automatic_captions' in info_dict:
2604 self.list_subtitles(
2605 info_dict['id'], automatic_captions, 'automatic captions')
2606 self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
2607 if self.params.get('listformats') or interactive_format_selection:
2608 self.list_formats(info_dict)
2609 if list_only:
2610 # Without this printing, -F --print-json will not work
2611 self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True)
2612 return info_dict
2613
2614 format_selector = self.format_selector
2615 if format_selector is None:
2616 req_format = self._default_format_spec(info_dict, download=download)
2617 self.write_debug('Default format spec: %s' % req_format)
2618 format_selector = self.build_format_selector(req_format)
2619
2620 while True:
2621 if interactive_format_selection:
2622 req_format = input(
2623 self._format_screen('\nEnter format selector: ', self.Styles.EMPHASIS))
2624 try:
2625 format_selector = self.build_format_selector(req_format)
2626 except SyntaxError as err:
2627 self.report_error(err, tb=False, is_error=False)
2628 continue
2629
2630 formats_to_download = list(format_selector({
2631 'formats': formats,
2632 'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
2633 'incomplete_formats': (
2634 # All formats are video-only or
2635 all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
2636 # all formats are audio-only
2637 or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)),
2638 }))
2639 if interactive_format_selection and not formats_to_download:
2640 self.report_error('Requested format is not available', tb=False, is_error=False)
2641 continue
2642 break
2643
2644 if not formats_to_download:
2645 if not self.params.get('ignore_no_formats_error'):
2646 raise ExtractorError(
2647 'Requested format is not available. Use --list-formats for a list of available formats',
2648 expected=True, video_id=info_dict['id'], ie=info_dict['extractor'])
2649 self.report_warning('Requested format is not available')
2650 # Process what we can, even without any available formats.
2651 formats_to_download = [{}]
2652
2653 requested_ranges = self.params.get('download_ranges')
2654 if requested_ranges:
2655 requested_ranges = tuple(requested_ranges(info_dict, self))
2656
2657 best_format, downloaded_formats = formats_to_download[-1], []
2658 if download:
2659 if best_format:
2660 def to_screen(*msg):
2661 self.to_screen(f'[info] {info_dict["id"]}: {" ".join(", ".join(variadic(m)) for m in msg)}')
2662
2663 to_screen(f'Downloading {len(formats_to_download)} format(s):',
2664 (f['format_id'] for f in formats_to_download))
2665 if requested_ranges:
2666 to_screen(f'Downloading {len(requested_ranges)} time ranges:',
2667 (f'{int(c["start_time"])}-{int(c["end_time"])}' for c in requested_ranges))
2668 max_downloads_reached = False
2669
2670 for fmt, chapter in itertools.product(formats_to_download, requested_ranges or [{}]):
2671 new_info = self._copy_infodict(info_dict)
2672 new_info.update(fmt)
2673 offset, duration = info_dict.get('section_start') or 0, info_dict.get('duration') or float('inf')
2674 if chapter or offset:
2675 new_info.update({
2676 'section_start': offset + chapter.get('start_time', 0),
2677 'section_end': offset + min(chapter.get('end_time', duration), duration),
2678 'section_title': chapter.get('title'),
2679 'section_number': chapter.get('index'),
2680 })
2681 downloaded_formats.append(new_info)
2682 try:
2683 self.process_info(new_info)
2684 except MaxDownloadsReached:
2685 max_downloads_reached = True
2686 self._raise_pending_errors(new_info)
2687 # Remove copied info
2688 for key, val in tuple(new_info.items()):
2689 if info_dict.get(key) == val:
2690 new_info.pop(key)
2691 if max_downloads_reached:
2692 break
2693
2694 write_archive = {f.get('__write_download_archive', False) for f in downloaded_formats}
2695 assert write_archive.issubset({True, False, 'ignore'})
2696 if True in write_archive and False not in write_archive:
2697 self.record_download_archive(info_dict)
2698
2699 info_dict['requested_downloads'] = downloaded_formats
2700 info_dict = self.run_all_pps('after_video', info_dict)
2701 if max_downloads_reached:
2702 raise MaxDownloadsReached()
2703
2704 # We update the info dict with the selected best quality format (backwards compatibility)
2705 info_dict.update(best_format)
2706 return info_dict
2707
2708 def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
2709 """Select the requested subtitles and their format"""
2710 available_subs, normal_sub_langs = {}, []
2711 if normal_subtitles and self.params.get('writesubtitles'):
2712 available_subs.update(normal_subtitles)
2713 normal_sub_langs = tuple(normal_subtitles.keys())
2714 if automatic_captions and self.params.get('writeautomaticsub'):
2715 for lang, cap_info in automatic_captions.items():
2716 if lang not in available_subs:
2717 available_subs[lang] = cap_info
2718
2719 if (not self.params.get('writesubtitles') and not
2720 self.params.get('writeautomaticsub') or not
2721 available_subs):
2722 return None
2723
2724 all_sub_langs = tuple(available_subs.keys())
2725 if self.params.get('allsubtitles', False):
2726 requested_langs = all_sub_langs
2727 elif self.params.get('subtitleslangs', False):
2728 # A list is used so that the order of languages will be the same as
2729 # given in subtitleslangs. See https://github.com/yt-dlp/yt-dlp/issues/1041
2730 requested_langs = []
2731 for lang_re in self.params.get('subtitleslangs'):
2732 discard = lang_re[0] == '-'
2733 if discard:
2734 lang_re = lang_re[1:]
2735 if lang_re == 'all':
2736 if discard:
2737 requested_langs = []
2738 else:
2739 requested_langs.extend(all_sub_langs)
2740 continue
2741 current_langs = filter(re.compile(lang_re + '$').match, all_sub_langs)
2742 if discard:
2743 for lang in current_langs:
2744 while lang in requested_langs:
2745 requested_langs.remove(lang)
2746 else:
2747 requested_langs.extend(current_langs)
2748 requested_langs = orderedSet(requested_langs)
2749 elif normal_sub_langs:
2750 requested_langs = ['en'] if 'en' in normal_sub_langs else normal_sub_langs[:1]
2751 else:
2752 requested_langs = ['en'] if 'en' in all_sub_langs else all_sub_langs[:1]
2753 if requested_langs:
2754 self.write_debug('Downloading subtitles: %s' % ', '.join(requested_langs))
2755
2756 formats_query = self.params.get('subtitlesformat', 'best')
2757 formats_preference = formats_query.split('/') if formats_query else []
2758 subs = {}
2759 for lang in requested_langs:
2760 formats = available_subs.get(lang)
2761 if formats is None:
2762 self.report_warning(f'{lang} subtitles not available for {video_id}')
2763 continue
2764 for ext in formats_preference:
2765 if ext == 'best':
2766 f = formats[-1]
2767 break
2768 matches = list(filter(lambda f: f['ext'] == ext, formats))
2769 if matches:
2770 f = matches[-1]
2771 break
2772 else:
2773 f = formats[-1]
2774 self.report_warning(
2775 'No subtitle format found matching "%s" for language %s, '
2776 'using %s' % (formats_query, lang, f['ext']))
2777 subs[lang] = f
2778 return subs
2779
2780 def _forceprint(self, key, info_dict):
2781 if info_dict is None:
2782 return
2783 info_copy = info_dict.copy()
2784 info_copy['formats_table'] = self.render_formats_table(info_dict)
2785 info_copy['thumbnails_table'] = self.render_thumbnails_table(info_dict)
2786 info_copy['subtitles_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('subtitles'))
2787 info_copy['automatic_captions_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('automatic_captions'))
2788
2789 def format_tmpl(tmpl):
2790 mobj = re.match(r'\w+(=?)$', tmpl)
2791 if mobj and mobj.group(1):
2792 return f'{tmpl[:-1]} = %({tmpl[:-1]})r'
2793 elif mobj:
2794 return f'%({tmpl})s'
2795 return tmpl
2796
2797 for tmpl in self.params['forceprint'].get(key, []):
2798 self.to_stdout(self.evaluate_outtmpl(format_tmpl(tmpl), info_copy))
2799
2800 for tmpl, file_tmpl in self.params['print_to_file'].get(key, []):
2801 filename = self.prepare_filename(info_dict, outtmpl=file_tmpl)
2802 tmpl = format_tmpl(tmpl)
2803 self.to_screen(f'[info] Writing {tmpl!r} to: {filename}')
2804 if self._ensure_dir_exists(filename):
2805 with open(filename, 'a', encoding='utf-8') as f:
2806 f.write(self.evaluate_outtmpl(tmpl, info_copy) + '\n')
2807
2808 def __forced_printings(self, info_dict, filename, incomplete):
2809 def print_mandatory(field, actual_field=None):
2810 if actual_field is None:
2811 actual_field = field
2812 if (self.params.get('force%s' % field, False)
2813 and (not incomplete or info_dict.get(actual_field) is not None)):
2814 self.to_stdout(info_dict[actual_field])
2815
2816 def print_optional(field):
2817 if (self.params.get('force%s' % field, False)
2818 and info_dict.get(field) is not None):
2819 self.to_stdout(info_dict[field])
2820
2821 info_dict = info_dict.copy()
2822 if filename is not None:
2823 info_dict['filename'] = filename
2824 if info_dict.get('requested_formats') is not None:
2825 # For RTMP URLs, also include the playpath
2826 info_dict['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats'])
2827 elif info_dict.get('url'):
2828 info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '')
2829
2830 if (self.params.get('forcejson')
2831 or self.params['forceprint'].get('video')
2832 or self.params['print_to_file'].get('video')):
2833 self.post_extract(info_dict)
2834 self._forceprint('video', info_dict)
2835
2836 print_mandatory('title')
2837 print_mandatory('id')
2838 print_mandatory('url', 'urls')
2839 print_optional('thumbnail')
2840 print_optional('description')
2841 print_optional('filename')
2842 if self.params.get('forceduration') and info_dict.get('duration') is not None:
2843 self.to_stdout(formatSeconds(info_dict['duration']))
2844 print_mandatory('format')
2845
2846 if self.params.get('forcejson'):
2847 self.to_stdout(json.dumps(self.sanitize_info(info_dict)))
2848
2849 def dl(self, name, info, subtitle=False, test=False):
2850 if not info.get('url'):
2851 self.raise_no_formats(info, True)
2852
2853 if test:
2854 verbose = self.params.get('verbose')
2855 params = {
2856 'test': True,
2857 'quiet': self.params.get('quiet') or not verbose,
2858 'verbose': verbose,
2859 'noprogress': not verbose,
2860 'nopart': True,
2861 'skip_unavailable_fragments': False,
2862 'keep_fragments': False,
2863 'overwrites': True,
2864 '_no_ytdl_file': True,
2865 }
2866 else:
2867 params = self.params
2868 fd = get_suitable_downloader(info, params, to_stdout=(name == '-'))(self, params)
2869 if not test:
2870 for ph in self._progress_hooks:
2871 fd.add_progress_hook(ph)
2872 urls = '", "'.join(
2873 (f['url'].split(',')[0] + ',<data>' if f['url'].startswith('data:') else f['url'])
2874 for f in info.get('requested_formats', []) or [info])
2875 self.write_debug(f'Invoking {fd.FD_NAME} downloader on "{urls}"')
2876
2877 # Note: Ideally info should be a deep-copied so that hooks cannot modify it.
2878 # But it may contain objects that are not deep-copyable
2879 new_info = self._copy_infodict(info)
2880 if new_info.get('http_headers') is None:
2881 new_info['http_headers'] = self._calc_headers(new_info)
2882 return fd.download(name, new_info, subtitle)
2883
2884 def existing_file(self, filepaths, *, default_overwrite=True):
2885 existing_files = list(filter(os.path.exists, orderedSet(filepaths)))
2886 if existing_files and not self.params.get('overwrites', default_overwrite):
2887 return existing_files[0]
2888
2889 for file in existing_files:
2890 self.report_file_delete(file)
2891 os.remove(file)
2892 return None
2893
2894 def process_info(self, info_dict):
2895 """Process a single resolved IE result. (Modifies it in-place)"""
2896
2897 assert info_dict.get('_type', 'video') == 'video'
2898 original_infodict = info_dict
2899
2900 if 'format' not in info_dict and 'ext' in info_dict:
2901 info_dict['format'] = info_dict['ext']
2902
2903 # This is mostly just for backward compatibility of process_info
2904 # As a side-effect, this allows for format-specific filters
2905 if self._match_entry(info_dict) is not None:
2906 info_dict['__write_download_archive'] = 'ignore'
2907 return
2908
2909 # Does nothing under normal operation - for backward compatibility of process_info
2910 self.post_extract(info_dict)
2911 self._num_downloads += 1
2912
2913 # info_dict['_filename'] needs to be set for backward compatibility
2914 info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
2915 temp_filename = self.prepare_filename(info_dict, 'temp')
2916 files_to_move = {}
2917
2918 # Forced printings
2919 self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))
2920
2921 def check_max_downloads():
2922 if self._num_downloads >= float(self.params.get('max_downloads') or 'inf'):
2923 raise MaxDownloadsReached()
2924
2925 if self.params.get('simulate'):
2926 info_dict['__write_download_archive'] = self.params.get('force_write_download_archive')
2927 check_max_downloads()
2928 return
2929
2930 if full_filename is None:
2931 return
2932 if not self._ensure_dir_exists(encodeFilename(full_filename)):
2933 return
2934 if not self._ensure_dir_exists(encodeFilename(temp_filename)):
2935 return
2936
2937 if self._write_description('video', info_dict,
2938 self.prepare_filename(info_dict, 'description')) is None:
2939 return
2940
2941 sub_files = self._write_subtitles(info_dict, temp_filename)
2942 if sub_files is None:
2943 return
2944 files_to_move.update(dict(sub_files))
2945
2946 thumb_files = self._write_thumbnails(
2947 'video', info_dict, temp_filename, self.prepare_filename(info_dict, 'thumbnail'))
2948 if thumb_files is None:
2949 return
2950 files_to_move.update(dict(thumb_files))
2951
2952 infofn = self.prepare_filename(info_dict, 'infojson')
2953 _infojson_written = self._write_info_json('video', info_dict, infofn)
2954 if _infojson_written:
2955 info_dict['infojson_filename'] = infofn
2956 # For backward compatibility, even though it was a private field
2957 info_dict['__infojson_filename'] = infofn
2958 elif _infojson_written is None:
2959 return
2960
2961 # Note: Annotations are deprecated
2962 annofn = None
2963 if self.params.get('writeannotations', False):
2964 annofn = self.prepare_filename(info_dict, 'annotation')
2965 if annofn:
2966 if not self._ensure_dir_exists(encodeFilename(annofn)):
2967 return
2968 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(annofn)):
2969 self.to_screen('[info] Video annotations are already present')
2970 elif not info_dict.get('annotations'):
2971 self.report_warning('There are no annotations to write.')
2972 else:
2973 try:
2974 self.to_screen('[info] Writing video annotations to: ' + annofn)
2975 with open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
2976 annofile.write(info_dict['annotations'])
2977 except (KeyError, TypeError):
2978 self.report_warning('There are no annotations to write.')
2979 except OSError:
2980 self.report_error('Cannot write annotations file: ' + annofn)
2981 return
2982
2983 # Write internet shortcut files
2984 def _write_link_file(link_type):
2985 url = try_get(info_dict['webpage_url'], iri_to_uri)
2986 if not url:
2987 self.report_warning(
2988 f'Cannot write internet shortcut file because the actual URL of "{info_dict["webpage_url"]}" is unknown')
2989 return True
2990 linkfn = replace_extension(self.prepare_filename(info_dict, 'link'), link_type, info_dict.get('ext'))
2991 if not self._ensure_dir_exists(encodeFilename(linkfn)):
2992 return False
2993 if self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)):
2994 self.to_screen(f'[info] Internet shortcut (.{link_type}) is already present')
2995 return True
2996 try:
2997 self.to_screen(f'[info] Writing internet shortcut (.{link_type}) to: {linkfn}')
2998 with open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8',
2999 newline='\r\n' if link_type == 'url' else '\n') as linkfile:
3000 template_vars = {'url': url}
3001 if link_type == 'desktop':
3002 template_vars['filename'] = linkfn[:-(len(link_type) + 1)]
3003 linkfile.write(LINK_TEMPLATES[link_type] % template_vars)
3004 except OSError:
3005 self.report_error(f'Cannot write internet shortcut {linkfn}')
3006 return False
3007 return True
3008
3009 write_links = {
3010 'url': self.params.get('writeurllink'),
3011 'webloc': self.params.get('writewebloclink'),
3012 'desktop': self.params.get('writedesktoplink'),
3013 }
3014 if self.params.get('writelink'):
3015 link_type = ('webloc' if sys.platform == 'darwin'
3016 else 'desktop' if sys.platform.startswith('linux')
3017 else 'url')
3018 write_links[link_type] = True
3019
3020 if any(should_write and not _write_link_file(link_type)
3021 for link_type, should_write in write_links.items()):
3022 return
3023
3024 def replace_info_dict(new_info):
3025 nonlocal info_dict
3026 if new_info == info_dict:
3027 return
3028 info_dict.clear()
3029 info_dict.update(new_info)
3030
3031 new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
3032 replace_info_dict(new_info)
3033
3034 if self.params.get('skip_download'):
3035 info_dict['filepath'] = temp_filename
3036 info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
3037 info_dict['__files_to_move'] = files_to_move
3038 replace_info_dict(self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict))
3039 info_dict['__write_download_archive'] = self.params.get('force_write_download_archive')
3040 else:
3041 # Download
3042 info_dict.setdefault('__postprocessors', [])
3043 try:
3044
3045 def existing_video_file(*filepaths):
3046 ext = info_dict.get('ext')
3047 converted = lambda file: replace_extension(file, self.params.get('final_ext') or ext, ext)
3048 file = self.existing_file(itertools.chain(*zip(map(converted, filepaths), filepaths)),
3049 default_overwrite=False)
3050 if file:
3051 info_dict['ext'] = os.path.splitext(file)[1][1:]
3052 return file
3053
3054 fd, success = None, True
3055 if info_dict.get('protocol') or info_dict.get('url'):
3056 fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
3057 if fd is not FFmpegFD and (
3058 info_dict.get('section_start') or info_dict.get('section_end')):
3059 msg = ('This format cannot be partially downloaded' if FFmpegFD.available()
3060 else 'You have requested downloading the video partially, but ffmpeg is not installed')
3061 self.report_error(f'{msg}. Aborting')
3062 return
3063
3064 if info_dict.get('requested_formats') is not None:
3065
3066 def compatible_formats(formats):
3067 # TODO: some formats actually allow this (mkv, webm, ogg, mp4), but not all of them.
3068 video_formats = [format for format in formats if format.get('vcodec') != 'none']
3069 audio_formats = [format for format in formats if format.get('acodec') != 'none']
3070 if len(video_formats) > 2 or len(audio_formats) > 2:
3071 return False
3072
3073 # Check extension
3074 exts = {format.get('ext') for format in formats}
3075 COMPATIBLE_EXTS = (
3076 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'},
3077 {'webm'},
3078 )
3079 for ext_sets in COMPATIBLE_EXTS:
3080 if ext_sets.issuperset(exts):
3081 return True
3082 # TODO: Check acodec/vcodec
3083 return False
3084
3085 requested_formats = info_dict['requested_formats']
3086 old_ext = info_dict['ext']
3087 if self.params.get('merge_output_format') is None:
3088 if not compatible_formats(requested_formats):
3089 info_dict['ext'] = 'mkv'
3090 self.report_warning(
3091 'Requested formats are incompatible for merge and will be merged into mkv')
3092 if (info_dict['ext'] == 'webm'
3093 and info_dict.get('thumbnails')
3094 # check with type instead of pp_key, __name__, or isinstance
3095 # since we dont want any custom PPs to trigger this
3096 and any(type(pp) == EmbedThumbnailPP for pp in self._pps['post_process'])): # noqa: E721
3097 info_dict['ext'] = 'mkv'
3098 self.report_warning(
3099 'webm doesn\'t support embedding a thumbnail, mkv will be used')
3100 new_ext = info_dict['ext']
3101
3102 def correct_ext(filename, ext=new_ext):
3103 if filename == '-':
3104 return filename
3105 filename_real_ext = os.path.splitext(filename)[1][1:]
3106 filename_wo_ext = (
3107 os.path.splitext(filename)[0]
3108 if filename_real_ext in (old_ext, new_ext)
3109 else filename)
3110 return f'{filename_wo_ext}.{ext}'
3111
3112 # Ensure filename always has a correct extension for successful merge
3113 full_filename = correct_ext(full_filename)
3114 temp_filename = correct_ext(temp_filename)
3115 dl_filename = existing_video_file(full_filename, temp_filename)
3116 info_dict['__real_download'] = False
3117
3118 merger = FFmpegMergerPP(self)
3119 downloaded = []
3120 if dl_filename is not None:
3121 self.report_file_already_downloaded(dl_filename)
3122 elif fd:
3123 for f in requested_formats if fd != FFmpegFD else []:
3124 f['filepath'] = fname = prepend_extension(
3125 correct_ext(temp_filename, info_dict['ext']),
3126 'f%s' % f['format_id'], info_dict['ext'])
3127 downloaded.append(fname)
3128 info_dict['url'] = '\n'.join(f['url'] for f in requested_formats)
3129 success, real_download = self.dl(temp_filename, info_dict)
3130 info_dict['__real_download'] = real_download
3131 else:
3132 if self.params.get('allow_unplayable_formats'):
3133 self.report_warning(
3134 'You have requested merging of multiple formats '
3135 'while also allowing unplayable formats to be downloaded. '
3136 'The formats won\'t be merged to prevent data corruption.')
3137 elif not merger.available:
3138 msg = 'You have requested merging of multiple formats but ffmpeg is not installed'
3139 if not self.params.get('ignoreerrors'):
3140 self.report_error(f'{msg}. Aborting due to --abort-on-error')
3141 return
3142 self.report_warning(f'{msg}. The formats won\'t be merged')
3143
3144 if temp_filename == '-':
3145 reason = ('using a downloader other than ffmpeg' if FFmpegFD.can_merge_formats(info_dict, self.params)
3146 else 'but the formats are incompatible for simultaneous download' if merger.available
3147 else 'but ffmpeg is not installed')
3148 self.report_warning(
3149 f'You have requested downloading multiple formats to stdout {reason}. '
3150 'The formats will be streamed one after the other')
3151 fname = temp_filename
3152 for f in requested_formats:
3153 new_info = dict(info_dict)
3154 del new_info['requested_formats']
3155 new_info.update(f)
3156 if temp_filename != '-':
3157 fname = prepend_extension(
3158 correct_ext(temp_filename, new_info['ext']),
3159 'f%s' % f['format_id'], new_info['ext'])
3160 if not self._ensure_dir_exists(fname):
3161 return
3162 f['filepath'] = fname
3163 downloaded.append(fname)
3164 partial_success, real_download = self.dl(fname, new_info)
3165 info_dict['__real_download'] = info_dict['__real_download'] or real_download
3166 success = success and partial_success
3167
3168 if downloaded and merger.available and not self.params.get('allow_unplayable_formats'):
3169 info_dict['__postprocessors'].append(merger)
3170 info_dict['__files_to_merge'] = downloaded
3171 # Even if there were no downloads, it is being merged only now
3172 info_dict['__real_download'] = True
3173 else:
3174 for file in downloaded:
3175 files_to_move[file] = None
3176 else:
3177 # Just a single file
3178 dl_filename = existing_video_file(full_filename, temp_filename)
3179 if dl_filename is None or dl_filename == temp_filename:
3180 # dl_filename == temp_filename could mean that the file was partially downloaded with --no-part.
3181 # So we should try to resume the download
3182 success, real_download = self.dl(temp_filename, info_dict)
3183 info_dict['__real_download'] = real_download
3184 else:
3185 self.report_file_already_downloaded(dl_filename)
3186
3187 dl_filename = dl_filename or temp_filename
3188 info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
3189
3190 except network_exceptions as err:
3191 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
3192 return
3193 except OSError as err:
3194 raise UnavailableVideoError(err)
3195 except (ContentTooShortError, ) as err:
3196 self.report_error(f'content too short (expected {err.expected} bytes and served {err.downloaded})')
3197 return
3198
3199 self._raise_pending_errors(info_dict)
3200 if success and full_filename != '-':
3201
3202 def fixup():
3203 do_fixup = True
3204 fixup_policy = self.params.get('fixup')
3205 vid = info_dict['id']
3206
3207 if fixup_policy in ('ignore', 'never'):
3208 return
3209 elif fixup_policy == 'warn':
3210 do_fixup = 'warn'
3211 elif fixup_policy != 'force':
3212 assert fixup_policy in ('detect_or_warn', None)
3213 if not info_dict.get('__real_download'):
3214 do_fixup = False
3215
3216 def ffmpeg_fixup(cndn, msg, cls):
3217 if not (do_fixup and cndn):
3218 return
3219 elif do_fixup == 'warn':
3220 self.report_warning(f'{vid}: {msg}')
3221 return
3222 pp = cls(self)
3223 if pp.available:
3224 info_dict['__postprocessors'].append(pp)
3225 else:
3226 self.report_warning(f'{vid}: {msg}. Install ffmpeg to fix this automatically')
3227
3228 stretched_ratio = info_dict.get('stretched_ratio')
3229 ffmpeg_fixup(stretched_ratio not in (1, None),
3230 f'Non-uniform pixel ratio {stretched_ratio}',
3231 FFmpegFixupStretchedPP)
3232
3233 downloader = get_suitable_downloader(info_dict, self.params) if 'protocol' in info_dict else None
3234 downloader = downloader.FD_NAME if downloader else None
3235
3236 ext = info_dict.get('ext')
3237 postprocessed_by_ffmpeg = info_dict.get('requested_formats') or any((
3238 isinstance(pp, FFmpegVideoConvertorPP)
3239 and resolve_recode_mapping(ext, pp.mapping)[0] not in (ext, None)
3240 ) for pp in self._pps['post_process'])
3241
3242 if not postprocessed_by_ffmpeg:
3243 ffmpeg_fixup(ext == 'm4a' and info_dict.get('container') == 'm4a_dash',
3244 'writing DASH m4a. Only some players support this container',
3245 FFmpegFixupM4aPP)
3246 ffmpeg_fixup(downloader == 'hlsnative' and not self.params.get('hls_use_mpegts')
3247 or info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None,
3248 'Possible MPEG-TS in MP4 container or malformed AAC timestamps',
3249 FFmpegFixupM3u8PP)
3250 ffmpeg_fixup(info_dict.get('is_live') and downloader == 'DashSegmentsFD',
3251 'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP)
3252
3253 ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed timestamps detected', FFmpegFixupTimestampPP)
3254 ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed duration detected', FFmpegFixupDurationPP)
3255
3256 fixup()
3257 try:
3258 replace_info_dict(self.post_process(dl_filename, info_dict, files_to_move))
3259 except PostProcessingError as err:
3260 self.report_error('Postprocessing: %s' % str(err))
3261 return
3262 try:
3263 for ph in self._post_hooks:
3264 ph(info_dict['filepath'])
3265 except Exception as err:
3266 self.report_error('post hooks: %s' % str(err))
3267 return
3268 info_dict['__write_download_archive'] = True
3269
3270 assert info_dict is original_infodict # Make sure the info_dict was modified in-place
3271 if self.params.get('force_write_download_archive'):
3272 info_dict['__write_download_archive'] = True
3273 check_max_downloads()
3274
3275 def __download_wrapper(self, func):
3276 @functools.wraps(func)
3277 def wrapper(*args, **kwargs):
3278 try:
3279 res = func(*args, **kwargs)
3280 except UnavailableVideoError as e:
3281 self.report_error(e)
3282 except DownloadCancelled as e:
3283 self.to_screen(f'[info] {e}')
3284 if not self.params.get('break_per_url'):
3285 raise
3286 else:
3287 if self.params.get('dump_single_json', False):
3288 self.post_extract(res)
3289 self.to_stdout(json.dumps(self.sanitize_info(res)))
3290 return wrapper
3291
3292 def download(self, url_list):
3293 """Download a given list of URLs."""
3294 url_list = variadic(url_list) # Passing a single URL is a common mistake
3295 outtmpl = self.params['outtmpl']['default']
3296 if (len(url_list) > 1
3297 and outtmpl != '-'
3298 and '%' not in outtmpl
3299 and self.params.get('max_downloads') != 1):
3300 raise SameFileError(outtmpl)
3301
3302 for url in url_list:
3303 self.__download_wrapper(self.extract_info)(
3304 url, force_generic_extractor=self.params.get('force_generic_extractor', False))
3305
3306 return self._download_retcode
3307
3308 def download_with_info_file(self, info_filename):
3309 with contextlib.closing(fileinput.FileInput(
3310 [info_filename], mode='r',
3311 openhook=fileinput.hook_encoded('utf-8'))) as f:
3312 # FileInput doesn't have a read method, we can't call json.load
3313 info = self.sanitize_info(json.loads('\n'.join(f)), self.params.get('clean_infojson', True))
3314 try:
3315 self.__download_wrapper(self.process_ie_result)(info, download=True)
3316 except (DownloadError, EntryNotInPlaylist, ReExtractInfo) as e:
3317 if not isinstance(e, EntryNotInPlaylist):
3318 self.to_stderr('\r')
3319 webpage_url = info.get('webpage_url')
3320 if webpage_url is not None:
3321 self.report_warning(f'The info failed to download: {e}; trying with URL {webpage_url}')
3322 return self.download([webpage_url])
3323 else:
3324 raise
3325 return self._download_retcode
3326
3327 @staticmethod
3328 def sanitize_info(info_dict, remove_private_keys=False):
3329 ''' Sanitize the infodict for converting to json '''
3330 if info_dict is None:
3331 return info_dict
3332 info_dict.setdefault('epoch', int(time.time()))
3333 info_dict.setdefault('_type', 'video')
3334
3335 if remove_private_keys:
3336 reject = lambda k, v: v is None or k.startswith('__') or k in {
3337 'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries',
3338 'entries', 'filepath', '_filename', 'infojson_filename', 'original_url', 'playlist_autonumber',
3339 }
3340 else:
3341 reject = lambda k, v: False
3342
3343 def filter_fn(obj):
3344 if isinstance(obj, dict):
3345 return {k: filter_fn(v) for k, v in obj.items() if not reject(k, v)}
3346 elif isinstance(obj, (list, tuple, set, LazyList)):
3347 return list(map(filter_fn, obj))
3348 elif obj is None or isinstance(obj, (str, int, float, bool)):
3349 return obj
3350 else:
3351 return repr(obj)
3352
3353 return filter_fn(info_dict)
3354
3355 @staticmethod
3356 def filter_requested_info(info_dict, actually_filter=True):
3357 ''' Alias of sanitize_info for backward compatibility '''
3358 return YoutubeDL.sanitize_info(info_dict, actually_filter)
3359
3360 def _delete_downloaded_files(self, *files_to_delete, info={}, msg=None):
3361 for filename in set(filter(None, files_to_delete)):
3362 if msg:
3363 self.to_screen(msg % filename)
3364 try:
3365 os.remove(filename)
3366 except OSError:
3367 self.report_warning(f'Unable to delete file {filename}')
3368 if filename in info.get('__files_to_move', []): # NB: Delete even if None
3369 del info['__files_to_move'][filename]
3370
3371 @staticmethod
3372 def post_extract(info_dict):
3373 def actual_post_extract(info_dict):
3374 if info_dict.get('_type') in ('playlist', 'multi_video'):
3375 for video_dict in info_dict.get('entries', {}):
3376 actual_post_extract(video_dict or {})
3377 return
3378
3379 post_extractor = info_dict.pop('__post_extractor', None) or (lambda: {})
3380 info_dict.update(post_extractor())
3381
3382 actual_post_extract(info_dict or {})
3383
3384 def run_pp(self, pp, infodict):
3385 files_to_delete = []
3386 if '__files_to_move' not in infodict:
3387 infodict['__files_to_move'] = {}
3388 try:
3389 files_to_delete, infodict = pp.run(infodict)
3390 except PostProcessingError as e:
3391 # Must be True and not 'only_download'
3392 if self.params.get('ignoreerrors') is True:
3393 self.report_error(e)
3394 return infodict
3395 raise
3396
3397 if not files_to_delete:
3398 return infodict
3399 if self.params.get('keepvideo', False):
3400 for f in files_to_delete:
3401 infodict['__files_to_move'].setdefault(f, '')
3402 else:
3403 self._delete_downloaded_files(
3404 *files_to_delete, info=infodict, msg='Deleting original file %s (pass -k to keep)')
3405 return infodict
3406
3407 def run_all_pps(self, key, info, *, additional_pps=None):
3408 self._forceprint(key, info)
3409 for pp in (additional_pps or []) + self._pps[key]:
3410 info = self.run_pp(pp, info)
3411 return info
3412
3413 def pre_process(self, ie_info, key='pre_process', files_to_move=None):
3414 info = dict(ie_info)
3415 info['__files_to_move'] = files_to_move or {}
3416 try:
3417 info = self.run_all_pps(key, info)
3418 except PostProcessingError as err:
3419 msg = f'Preprocessing: {err}'
3420 info.setdefault('__pending_error', msg)
3421 self.report_error(msg, is_error=False)
3422 return info, info.pop('__files_to_move', None)
3423
3424 def post_process(self, filename, info, files_to_move=None):
3425 """Run all the postprocessors on the given file."""
3426 info['filepath'] = filename
3427 info['__files_to_move'] = files_to_move or {}
3428 info = self.run_all_pps('post_process', info, additional_pps=info.get('__postprocessors'))
3429 info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
3430 del info['__files_to_move']
3431 return self.run_all_pps('after_move', info)
3432
3433 def _make_archive_id(self, info_dict):
3434 video_id = info_dict.get('id')
3435 if not video_id:
3436 return
3437 # Future-proof against any change in case
3438 # and backwards compatibility with prior versions
3439 extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist
3440 if extractor is None:
3441 url = str_or_none(info_dict.get('url'))
3442 if not url:
3443 return
3444 # Try to find matching extractor for the URL and take its ie_key
3445 for ie_key, ie in self._ies.items():
3446 if ie.suitable(url):
3447 extractor = ie_key
3448 break
3449 else:
3450 return
3451 return f'{extractor.lower()} {video_id}'
3452
3453 def in_download_archive(self, info_dict):
3454 fn = self.params.get('download_archive')
3455 if fn is None:
3456 return False
3457
3458 vid_id = self._make_archive_id(info_dict)
3459 if not vid_id:
3460 return False # Incomplete video information
3461
3462 return vid_id in self.archive
3463
3464 def record_download_archive(self, info_dict):
3465 fn = self.params.get('download_archive')
3466 if fn is None:
3467 return
3468 vid_id = self._make_archive_id(info_dict)
3469 assert vid_id
3470 self.write_debug(f'Adding to archive: {vid_id}')
3471 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
3472 archive_file.write(vid_id + '\n')
3473 self.archive.add(vid_id)
3474
3475 @staticmethod
3476 def format_resolution(format, default='unknown'):
3477 if format.get('vcodec') == 'none' and format.get('acodec') != 'none':
3478 return 'audio only'
3479 if format.get('resolution') is not None:
3480 return format['resolution']
3481 if format.get('width') and format.get('height'):
3482 return '%dx%d' % (format['width'], format['height'])
3483 elif format.get('height'):
3484 return '%sp' % format['height']
3485 elif format.get('width'):
3486 return '%dx?' % format['width']
3487 return default
3488
3489 def _list_format_headers(self, *headers):
3490 if self.params.get('listformats_table', True) is not False:
3491 return [self._format_out(header, self.Styles.HEADERS) for header in headers]
3492 return headers
3493
3494 def _format_note(self, fdict):
3495 res = ''
3496 if fdict.get('ext') in ['f4f', 'f4m']:
3497 res += '(unsupported)'
3498 if fdict.get('language'):
3499 if res:
3500 res += ' '
3501 res += '[%s]' % fdict['language']
3502 if fdict.get('format_note') is not None:
3503 if res:
3504 res += ' '
3505 res += fdict['format_note']
3506 if fdict.get('tbr') is not None:
3507 if res:
3508 res += ', '
3509 res += '%4dk' % fdict['tbr']
3510 if fdict.get('container') is not None:
3511 if res:
3512 res += ', '
3513 res += '%s container' % fdict['container']
3514 if (fdict.get('vcodec') is not None
3515 and fdict.get('vcodec') != 'none'):
3516 if res:
3517 res += ', '
3518 res += fdict['vcodec']
3519 if fdict.get('vbr') is not None:
3520 res += '@'
3521 elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
3522 res += 'video@'
3523 if fdict.get('vbr') is not None:
3524 res += '%4dk' % fdict['vbr']
3525 if fdict.get('fps') is not None:
3526 if res:
3527 res += ', '
3528 res += '%sfps' % fdict['fps']
3529 if fdict.get('acodec') is not None:
3530 if res:
3531 res += ', '
3532 if fdict['acodec'] == 'none':
3533 res += 'video only'
3534 else:
3535 res += '%-5s' % fdict['acodec']
3536 elif fdict.get('abr') is not None:
3537 if res:
3538 res += ', '
3539 res += 'audio'
3540 if fdict.get('abr') is not None:
3541 res += '@%3dk' % fdict['abr']
3542 if fdict.get('asr') is not None:
3543 res += ' (%5dHz)' % fdict['asr']
3544 if fdict.get('filesize') is not None:
3545 if res:
3546 res += ', '
3547 res += format_bytes(fdict['filesize'])
3548 elif fdict.get('filesize_approx') is not None:
3549 if res:
3550 res += ', '
3551 res += '~' + format_bytes(fdict['filesize_approx'])
3552 return res
3553
3554 def render_formats_table(self, info_dict):
3555 if not info_dict.get('formats') and not info_dict.get('url'):
3556 return None
3557
3558 formats = info_dict.get('formats', [info_dict])
3559 if not self.params.get('listformats_table', True) is not False:
3560 table = [
3561 [
3562 format_field(f, 'format_id'),
3563 format_field(f, 'ext'),
3564 self.format_resolution(f),
3565 self._format_note(f)
3566 ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
3567 return render_table(['format code', 'extension', 'resolution', 'note'], table, extra_gap=1)
3568
3569 def simplified_codec(f, field):
3570 assert field in ('acodec', 'vcodec')
3571 codec = f.get(field, 'unknown')
3572 if not codec:
3573 return 'unknown'
3574 elif codec != 'none':
3575 return '.'.join(codec.split('.')[:4])
3576
3577 if field == 'vcodec' and f.get('acodec') == 'none':
3578 return 'images'
3579 elif field == 'acodec' and f.get('vcodec') == 'none':
3580 return ''
3581 return self._format_out('audio only' if field == 'vcodec' else 'video only',
3582 self.Styles.SUPPRESS)
3583
3584 delim = self._format_out('\u2502', self.Styles.DELIM, '|', test_encoding=True)
3585 table = [
3586 [
3587 self._format_out(format_field(f, 'format_id'), self.Styles.ID),
3588 format_field(f, 'ext'),
3589 format_field(f, func=self.format_resolution, ignore=('audio only', 'images')),
3590 format_field(f, 'fps', '\t%d', func=round),
3591 format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''),
3592 delim,
3593 format_field(f, 'filesize', ' \t%s', func=format_bytes) + format_field(f, 'filesize_approx', '~\t%s', func=format_bytes),
3594 format_field(f, 'tbr', '\t%dk', func=round),
3595 shorten_protocol_name(f.get('protocol', '')),
3596 delim,
3597 simplified_codec(f, 'vcodec'),
3598 format_field(f, 'vbr', '\t%dk', func=round),
3599 simplified_codec(f, 'acodec'),
3600 format_field(f, 'abr', '\t%dk', func=round),
3601 format_field(f, 'asr', '\t%s', func=format_decimal_suffix),
3602 join_nonempty(
3603 self._format_out('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None,
3604 format_field(f, 'language', '[%s]'),
3605 join_nonempty(format_field(f, 'format_note'),
3606 format_field(f, 'container', ignore=(None, f.get('ext'))),
3607 delim=', '),
3608 delim=' '),
3609 ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
3610 header_line = self._list_format_headers(
3611 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', delim, '\tFILESIZE', '\tTBR', 'PROTO',
3612 delim, 'VCODEC', '\tVBR', 'ACODEC', '\tABR', '\tASR', 'MORE INFO')
3613
3614 return render_table(
3615 header_line, table, hide_empty=True,
3616 delim=self._format_out('\u2500', self.Styles.DELIM, '-', test_encoding=True))
3617
3618 def render_thumbnails_table(self, info_dict):
3619 thumbnails = list(info_dict.get('thumbnails') or [])
3620 if not thumbnails:
3621 return None
3622 return render_table(
3623 self._list_format_headers('ID', 'Width', 'Height', 'URL'),
3624 [[t.get('id'), t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])
3625
3626 def render_subtitles_table(self, video_id, subtitles):
3627 def _row(lang, formats):
3628 exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats)))
3629 if len(set(names)) == 1:
3630 names = [] if names[0] == 'unknown' else names[:1]
3631 return [lang, ', '.join(names), ', '.join(exts)]
3632
3633 if not subtitles:
3634 return None
3635 return render_table(
3636 self._list_format_headers('Language', 'Name', 'Formats'),
3637 [_row(lang, formats) for lang, formats in subtitles.items()],
3638 hide_empty=True)
3639
3640 def __list_table(self, video_id, name, func, *args):
3641 table = func(*args)
3642 if not table:
3643 self.to_screen(f'{video_id} has no {name}')
3644 return
3645 self.to_screen(f'[info] Available {name} for {video_id}:')
3646 self.to_stdout(table)
3647
3648 def list_formats(self, info_dict):
3649 self.__list_table(info_dict['id'], 'formats', self.render_formats_table, info_dict)
3650
3651 def list_thumbnails(self, info_dict):
3652 self.__list_table(info_dict['id'], 'thumbnails', self.render_thumbnails_table, info_dict)
3653
3654 def list_subtitles(self, video_id, subtitles, name='subtitles'):
3655 self.__list_table(video_id, name, self.render_subtitles_table, video_id, subtitles)
3656
3657 def urlopen(self, req):
3658 """ Start an HTTP download """
3659 if isinstance(req, str):
3660 req = sanitized_Request(req)
3661 return self._opener.open(req, timeout=self._socket_timeout)
3662
3663 def print_debug_header(self):
3664 if not self.params.get('verbose'):
3665 return
3666
3667 # These imports can be slow. So import them only as needed
3668 from .extractor.extractors import _LAZY_LOADER
3669 from .extractor.extractors import _PLUGIN_CLASSES as plugin_extractors
3670
3671 def get_encoding(stream):
3672 ret = str(getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__))
3673 if not supports_terminal_sequences(stream):
3674 from .utils import WINDOWS_VT_MODE # Must be imported locally
3675 ret += ' (No VT)' if WINDOWS_VT_MODE is False else ' (No ANSI)'
3676 return ret
3677
3678 encoding_str = 'Encodings: locale %s, fs %s, pref %s, %s' % (
3679 locale.getpreferredencoding(),
3680 sys.getfilesystemencoding(),
3681 self.get_encoding(),
3682 ', '.join(
3683 f'{key} {get_encoding(stream)}' for key, stream in self._out_files.items_
3684 if stream is not None and key != 'console')
3685 )
3686
3687 logger = self.params.get('logger')
3688 if logger:
3689 write_debug = lambda msg: logger.debug(f'[debug] {msg}')
3690 write_debug(encoding_str)
3691 else:
3692 write_string(f'[debug] {encoding_str}\n', encoding=None)
3693 write_debug = lambda msg: self._write_string(f'[debug] {msg}\n')
3694
3695 source = detect_variant()
3696 write_debug(join_nonempty(
3697 'yt-dlp version', __version__,
3698 f'[{RELEASE_GIT_HEAD}]' if RELEASE_GIT_HEAD else '',
3699 '' if source == 'unknown' else f'({source})',
3700 delim=' '))
3701 if not _LAZY_LOADER:
3702 if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'):
3703 write_debug('Lazy loading extractors is forcibly disabled')
3704 else:
3705 write_debug('Lazy loading extractors is disabled')
3706 if plugin_extractors or plugin_postprocessors:
3707 write_debug('Plugins: %s' % [
3708 '%s%s' % (klass.__name__, '' if klass.__name__ == name else f' as {name}')
3709 for name, klass in itertools.chain(plugin_extractors.items(), plugin_postprocessors.items())])
3710 if self.params['compat_opts']:
3711 write_debug('Compatibility options: %s' % ', '.join(self.params['compat_opts']))
3712
3713 if source == 'source':
3714 try:
3715 stdout, _, _ = Popen.run(
3716 ['git', 'rev-parse', '--short', 'HEAD'],
3717 text=True, cwd=os.path.dirname(os.path.abspath(__file__)),
3718 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3719 if re.fullmatch('[0-9a-f]+', stdout.strip()):
3720 write_debug(f'Git HEAD: {stdout.strip()}')
3721 except Exception:
3722 with contextlib.suppress(Exception):
3723 sys.exc_clear()
3724
3725 write_debug(system_identifier())
3726
3727 exe_versions, ffmpeg_features = FFmpegPostProcessor.get_versions_and_features(self)
3728 ffmpeg_features = {key for key, val in ffmpeg_features.items() if val}
3729 if ffmpeg_features:
3730 exe_versions['ffmpeg'] += ' (%s)' % ','.join(sorted(ffmpeg_features))
3731
3732 exe_versions['rtmpdump'] = rtmpdump_version()
3733 exe_versions['phantomjs'] = PhantomJSwrapper._version()
3734 exe_str = ', '.join(
3735 f'{exe} {v}' for exe, v in sorted(exe_versions.items()) if v
3736 ) or 'none'
3737 write_debug('exe versions: %s' % exe_str)
3738
3739 from .compat.compat_utils import get_package_info
3740 from .dependencies import available_dependencies
3741
3742 write_debug('Optional libraries: %s' % (', '.join(sorted({
3743 join_nonempty(*get_package_info(m)) for m in available_dependencies.values()
3744 })) or 'none'))
3745
3746 self._setup_opener()
3747 proxy_map = {}
3748 for handler in self._opener.handlers:
3749 if hasattr(handler, 'proxies'):
3750 proxy_map.update(handler.proxies)
3751 write_debug(f'Proxy map: {proxy_map}')
3752
3753 # Not implemented
3754 if False and self.params.get('call_home'):
3755 ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode()
3756 write_debug('Public IP address: %s' % ipaddr)
3757 latest_version = self.urlopen(
3758 'https://yt-dl.org/latest/version').read().decode()
3759 if version_tuple(latest_version) > version_tuple(__version__):
3760 self.report_warning(
3761 'You are using an outdated version (newest version: %s)! '
3762 'See https://yt-dl.org/update if you need help updating.' %
3763 latest_version)
3764
3765 def _setup_opener(self):
3766 if hasattr(self, '_opener'):
3767 return
3768 timeout_val = self.params.get('socket_timeout')
3769 self._socket_timeout = 20 if timeout_val is None else float(timeout_val)
3770
3771 opts_cookiesfrombrowser = self.params.get('cookiesfrombrowser')
3772 opts_cookiefile = self.params.get('cookiefile')
3773 opts_proxy = self.params.get('proxy')
3774
3775 self.cookiejar = load_cookies(opts_cookiefile, opts_cookiesfrombrowser, self)
3776
3777 cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
3778 if opts_proxy is not None:
3779 if opts_proxy == '':
3780 proxies = {}
3781 else:
3782 proxies = {'http': opts_proxy, 'https': opts_proxy}
3783 else:
3784 proxies = urllib.request.getproxies()
3785 # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805)
3786 if 'http' in proxies and 'https' not in proxies:
3787 proxies['https'] = proxies['http']
3788 proxy_handler = PerRequestProxyHandler(proxies)
3789
3790 debuglevel = 1 if self.params.get('debug_printtraffic') else 0
3791 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
3792 ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
3793 redirect_handler = YoutubeDLRedirectHandler()
3794 data_handler = urllib.request.DataHandler()
3795
3796 # When passing our own FileHandler instance, build_opener won't add the
3797 # default FileHandler and allows us to disable the file protocol, which
3798 # can be used for malicious purposes (see
3799 # https://github.com/ytdl-org/youtube-dl/issues/8227)
3800 file_handler = urllib.request.FileHandler()
3801
3802 def file_open(*args, **kwargs):
3803 raise urllib.error.URLError('file:// scheme is explicitly disabled in yt-dlp for security reasons')
3804 file_handler.file_open = file_open
3805
3806 opener = urllib.request.build_opener(
3807 proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)
3808
3809 # Delete the default user-agent header, which would otherwise apply in
3810 # cases where our custom HTTP handler doesn't come into play
3811 # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
3812 opener.addheaders = []
3813 self._opener = opener
3814
3815 def encode(self, s):
3816 if isinstance(s, bytes):
3817 return s # Already encoded
3818
3819 try:
3820 return s.encode(self.get_encoding())
3821 except UnicodeEncodeError as err:
3822 err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
3823 raise
3824
3825 def get_encoding(self):
3826 encoding = self.params.get('encoding')
3827 if encoding is None:
3828 encoding = preferredencoding()
3829 return encoding
3830
3831 def _write_info_json(self, label, ie_result, infofn, overwrite=None):
3832 ''' Write infojson and returns True = written, 'exists' = Already exists, False = skip, None = error '''
3833 if overwrite is None:
3834 overwrite = self.params.get('overwrites', True)
3835 if not self.params.get('writeinfojson'):
3836 return False
3837 elif not infofn:
3838 self.write_debug(f'Skipping writing {label} infojson')
3839 return False
3840 elif not self._ensure_dir_exists(infofn):
3841 return None
3842 elif not overwrite and os.path.exists(infofn):
3843 self.to_screen(f'[info] {label.title()} metadata is already present')
3844 return 'exists'
3845
3846 self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}')
3847 try:
3848 write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn)
3849 return True
3850 except OSError:
3851 self.report_error(f'Cannot write {label} metadata to JSON file {infofn}')
3852 return None
3853
3854 def _write_description(self, label, ie_result, descfn):
3855 ''' Write description and returns True = written, False = skip, None = error '''
3856 if not self.params.get('writedescription'):
3857 return False
3858 elif not descfn:
3859 self.write_debug(f'Skipping writing {label} description')
3860 return False
3861 elif not self._ensure_dir_exists(descfn):
3862 return None
3863 elif not self.params.get('overwrites', True) and os.path.exists(descfn):
3864 self.to_screen(f'[info] {label.title()} description is already present')
3865 elif ie_result.get('description') is None:
3866 self.report_warning(f'There\'s no {label} description to write')
3867 return False
3868 else:
3869 try:
3870 self.to_screen(f'[info] Writing {label} description to: {descfn}')
3871 with open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
3872 descfile.write(ie_result['description'])
3873 except OSError:
3874 self.report_error(f'Cannot write {label} description file {descfn}')
3875 return None
3876 return True
3877
3878 def _write_subtitles(self, info_dict, filename):
3879 ''' Write subtitles to file and return list of (sub_filename, final_sub_filename); or None if error'''
3880 ret = []
3881 subtitles = info_dict.get('requested_subtitles')
3882 if not subtitles or not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')):
3883 # subtitles download errors are already managed as troubles in relevant IE
3884 # that way it will silently go on when used with unsupporting IE
3885 return ret
3886
3887 sub_filename_base = self.prepare_filename(info_dict, 'subtitle')
3888 if not sub_filename_base:
3889 self.to_screen('[info] Skipping writing video subtitles')
3890 return ret
3891 for sub_lang, sub_info in subtitles.items():
3892 sub_format = sub_info['ext']
3893 sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
3894 sub_filename_final = subtitles_filename(sub_filename_base, sub_lang, sub_format, info_dict.get('ext'))
3895 existing_sub = self.existing_file((sub_filename_final, sub_filename))
3896 if existing_sub:
3897 self.to_screen(f'[info] Video subtitle {sub_lang}.{sub_format} is already present')
3898 sub_info['filepath'] = existing_sub
3899 ret.append((existing_sub, sub_filename_final))
3900 continue
3901
3902 self.to_screen(f'[info] Writing video subtitles to: {sub_filename}')
3903 if sub_info.get('data') is not None:
3904 try:
3905 # Use newline='' to prevent conversion of newline characters
3906 # See https://github.com/ytdl-org/youtube-dl/issues/10268
3907 with open(sub_filename, 'w', encoding='utf-8', newline='') as subfile:
3908 subfile.write(sub_info['data'])
3909 sub_info['filepath'] = sub_filename
3910 ret.append((sub_filename, sub_filename_final))
3911 continue
3912 except OSError:
3913 self.report_error(f'Cannot write video subtitles file {sub_filename}')
3914 return None
3915
3916 try:
3917 sub_copy = sub_info.copy()
3918 sub_copy.setdefault('http_headers', info_dict.get('http_headers'))
3919 self.dl(sub_filename, sub_copy, subtitle=True)
3920 sub_info['filepath'] = sub_filename
3921 ret.append((sub_filename, sub_filename_final))
3922 except (DownloadError, ExtractorError, IOError, OSError, ValueError) + network_exceptions as err:
3923 msg = f'Unable to download video subtitles for {sub_lang!r}: {err}'
3924 if self.params.get('ignoreerrors') is not True: # False or 'only_download'
3925 if not self.params.get('ignoreerrors'):
3926 self.report_error(msg)
3927 raise DownloadError(msg)
3928 self.report_warning(msg)
3929 return ret
3930
3931 def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None):
3932 ''' Write thumbnails to file and return list of (thumb_filename, final_thumb_filename) '''
3933 write_all = self.params.get('write_all_thumbnails', False)
3934 thumbnails, ret = [], []
3935 if write_all or self.params.get('writethumbnail', False):
3936 thumbnails = info_dict.get('thumbnails') or []
3937 multiple = write_all and len(thumbnails) > 1
3938
3939 if thumb_filename_base is None:
3940 thumb_filename_base = filename
3941 if thumbnails and not thumb_filename_base:
3942 self.write_debug(f'Skipping writing {label} thumbnail')
3943 return ret
3944
3945 for idx, t in list(enumerate(thumbnails))[::-1]:
3946 thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg')
3947 thumb_display_id = f'{label} thumbnail {t["id"]}'
3948 thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext'))
3949 thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext'))
3950
3951 existing_thumb = self.existing_file((thumb_filename_final, thumb_filename))
3952 if existing_thumb:
3953 self.to_screen('[info] %s is already present' % (
3954 thumb_display_id if multiple else f'{label} thumbnail').capitalize())
3955 t['filepath'] = existing_thumb
3956 ret.append((existing_thumb, thumb_filename_final))
3957 else:
3958 self.to_screen(f'[info] Downloading {thumb_display_id} ...')
3959 try:
3960 uf = self.urlopen(sanitized_Request(t['url'], headers=t.get('http_headers', {})))
3961 self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}')
3962 with open(encodeFilename(thumb_filename), 'wb') as thumbf:
3963 shutil.copyfileobj(uf, thumbf)
3964 ret.append((thumb_filename, thumb_filename_final))
3965 t['filepath'] = thumb_filename
3966 except network_exceptions as err:
3967 thumbnails.pop(idx)
3968 self.report_warning(f'Unable to download {thumb_display_id}: {err}')
3969 if ret and not write_all:
3970 break
3971 return ret