import collections
import contextlib
import datetime
import errno
import fileinput
import functools
import io
import itertools
import json
import locale
import operator
import os
import random
import re
import shutil
import subprocess
import sys
import tempfile
import time
import tokenize
import traceback
import unicodedata
import urllib.request
from string import ascii_letters

from .cache import Cache
from .compat import HAS_LEGACY as compat_has_legacy
from .compat import compat_os_name, compat_shlex_quote
from .cookies import load_cookies
from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
from .downloader.rtmp import rtmpdump_version
from .extractor import gen_extractor_classes, get_info_extractor
from .extractor.openload import PhantomJSwrapper
from .minicurses import format_text
from .postprocessor import _PLUGIN_CLASSES as plugin_postprocessors
from .postprocessor import (
    EmbedThumbnailPP,
    FFmpegFixupDuplicateMoovPP,
    FFmpegFixupDurationPP,
    FFmpegFixupM3u8PP,
    FFmpegFixupM4aPP,
    FFmpegFixupStretchedPP,
    FFmpegFixupTimestampPP,
    FFmpegMergerPP,
    FFmpegPostProcessor,
    MoveFilesAfterDownloadPP,
    get_postprocessor,
)
from .update import detect_variant
from .utils import (
    DEFAULT_OUTTMPL,
    IDENTITY,
    LINK_TEMPLATES,
    NO_DEFAULT,
    NUMBER_RE,
    OUTTMPL_TYPES,
    POSTPROCESS_WHEN,
    STR_FORMAT_RE_TMPL,
    STR_FORMAT_TYPES,
    ContentTooShortError,
    DateRange,
    DownloadCancelled,
    DownloadError,
    EntryNotInPlaylist,
    ExistingVideoReached,
    ExtractorError,
    GeoRestrictedError,
    HEADRequest,
    ISO3166Utils,
    LazyList,
    MaxDownloadsReached,
    Namespace,
    PagedList,
    PerRequestProxyHandler,
    PlaylistEntries,
    Popen,
    PostProcessingError,
    ReExtractInfo,
    RejectedVideoReached,
    SameFileError,
    UnavailableVideoError,
    YoutubeDLCookieProcessor,
    YoutubeDLHandler,
    YoutubeDLRedirectHandler,
    age_restricted,
    args_to_str,
    date_from_str,
    determine_ext,
    determine_protocol,
    encode_compat_str,
    encodeFilename,
    error_to_compat_str,
    escapeHTML,
    expand_path,
    filter_dict,
    float_or_none,
    format_bytes,
    format_decimal_suffix,
    format_field,
    formatSeconds,
    get_domain,
    int_or_none,
    iri_to_uri,
    join_nonempty,
    locked_file,
    make_dir,
    make_HTTPS_handler,
    merge_headers,
    network_exceptions,
    number_of_digits,
    orderedSet,
    parse_filesize,
    preferredencoding,
    prepend_extension,
    register_socks_protocols,
    remove_terminal_sequences,
    render_table,
    replace_extension,
    sanitize_filename,
    sanitize_path,
    sanitize_url,
    sanitized_Request,
    std_headers,
    str_or_none,
    strftime_or_none,
    subtitles_filename,
    supports_terminal_sequences,
    system_identifier,
    timetuple_from_msec,
    to_high_limit_path,
    traverse_obj,
    try_get,
    url_basename,
    variadic,
    version_tuple,
    windows_enable_vt_mode,
    write_json_file,
    write_string,
)
from .version import RELEASE_GIT_HEAD, __version__

if compat_os_name == 'nt':
    import ctypes


class YoutubeDL:
    """YoutubeDL class.

    YoutubeDL objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. Since, given a video URL, the downloader doesn't know how
    to extract all the needed information (a task that InfoExtractors
    do), it has to pass the URL to one of them.

    For this, YoutubeDL objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    YoutubeDL processes the extracted information, possibly using a File
    Downloader to download the video.

    YoutubeDL objects accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The YoutubeDL also
    registers itself as the downloader in charge of the InfoExtractors
    that are added to it, so this is a "mutual registration".

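    A minimal usage sketch (the URL and option values here are only
    illustrative):

        from yt_dlp import YoutubeDL

        with YoutubeDL({'format': 'best', 'outtmpl': '%(title)s.%(ext)s'}) as ydl:
            ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])
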
    Available options:

    username:          Username for authentication purposes.
    password:          Password for authentication purposes.
    videopassword:     Password for accessing a video.
    ap_mso:            Adobe Pass multiple-system operator identifier.
    ap_username:       Multiple-system operator account username.
    ap_password:       Multiple-system operator account password.
    usenetrc:          Use netrc for authentication instead.
    verbose:           Print additional info to stdout.
    quiet:             Do not print messages to stdout.
    no_warnings:       Do not print out anything for warnings.
    forceprint:        A dict with keys WHEN mapped to a list of templates to
                       print to stdout. The allowed keys are 'video' or any of the
                       items in utils.POSTPROCESS_WHEN.
                       For compatibility, a single list is also accepted
    print_to_file:     A dict with keys WHEN (same as forceprint) mapped to
                       a list of tuples with (template, filename)
    forcejson:         Force printing info_dict as JSON.
    dump_single_json:  Force printing the info_dict of the whole playlist
                       (or video) as a single JSON line.
    force_write_download_archive: Force writing download archive regardless
                       of 'skip_download' or 'simulate'.
    simulate:          Do not download the video files. If unset (or None),
                       simulate only if listsubtitles, listformats or list_thumbnails is used
    format:            Video format code. See "FORMAT SELECTION" for more details.
                       You can also pass a function. The function takes 'ctx' as
                       argument and returns the formats to download.
                       See "build_format_selector" for an implementation
    allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded.
    ignore_no_formats_error: Ignore "No video formats" error. Useful for
                       extracting metadata even if the video is not actually
                       available for download (experimental)
    format_sort:       A list of fields by which to sort the video formats.
                       See "Sorting Formats" for more details.
    format_sort_force: Force the given format_sort. See "Sorting Formats"
                       for more details.
    prefer_free_formats: Whether to prefer video formats with free containers
                       over non-free ones of same quality.
    allow_multiple_video_streams: Allow multiple video streams to be merged
                       into a single file
    allow_multiple_audio_streams: Allow multiple audio streams to be merged
                       into a single file
    check_formats:     Whether to test if the formats are downloadable.
                       Can be True (check all), False (check none),
                       'selected' (check selected formats),
                       or None (check only if requested by extractor)
    paths:             Dictionary of output paths. The allowed keys are 'home',
                       'temp' and the keys of OUTTMPL_TYPES (in utils.py)
    outtmpl:           Dictionary of templates for output names. Allowed keys
                       are 'default' and the keys of OUTTMPL_TYPES (in utils.py).
                       For compatibility with youtube-dl, a single string can also be used
    outtmpl_na_placeholder: Placeholder for unavailable meta fields.
    restrictfilenames: Do not allow "&" and spaces in file names
    trim_file_name:    Limit length of filename (extension excluded)
    windowsfilenames:  Force the filenames to be windows compatible
    ignoreerrors:      Do not stop on download/postprocessing errors.
                       Can be 'only_download' to ignore only download errors.
                       Default is 'only_download' for CLI, but False for API
    skip_playlist_after_errors: Number of allowed failures until the rest of
                       the playlist is skipped
    force_generic_extractor: Force downloader to use the generic extractor
    overwrites:        Overwrite all video and metadata files if True,
                       overwrite only non-video files if None
                       and don't overwrite any file if False
                       For compatibility with youtube-dl,
                       "nooverwrites" may also be used instead
    playlist_items:    Specific indices of playlist to download.
    playlistrandom:    Download playlist items in random order.
    lazy_playlist:     Process playlist entries as they are received.
    matchtitle:        Download only matching titles.
    rejecttitle:       Reject downloads for matching titles.
    logger:            Log messages to a logging.Logger instance.
    logtostderr:       Log messages to stderr instead of stdout.
    consoletitle:      Display progress in console window's titlebar.
    writedescription:  Write the video description to a .description file
    writeinfojson:     Write the video metadata to a .info.json file
    clean_infojson:    Remove private fields from the infojson
    getcomments:       Extract video comments. This will not be written to disk
                       unless writeinfojson is also given
    writeannotations:  Write the video annotations to a .annotations.xml file
    writethumbnail:    Write the thumbnail image to a file
    allow_playlist_files: Whether to write playlists' description, infojson etc.
                       also to disk when using the 'write*' options
    write_all_thumbnails: Write all thumbnail formats to files
    writelink:         Write an internet shortcut file, depending on the
                       current platform (.url/.webloc/.desktop)
    writeurllink:      Write a Windows internet shortcut file (.url)
    writewebloclink:   Write a macOS internet shortcut file (.webloc)
    writedesktoplink:  Write a Linux internet shortcut file (.desktop)
    writesubtitles:    Write the video subtitles to a file
    writeautomaticsub: Write the automatically generated subtitles to a file
    listsubtitles:     Lists all available subtitles for the video
    subtitlesformat:   The format code for subtitles
    subtitleslangs:    List of languages of the subtitles to download (can be regex).
                       The list may contain "all" to refer to all the available
                       subtitles. The language can be prefixed with a "-" to
                       exclude it from the requested languages. Eg: ['all', '-live_chat']
    keepvideo:         Keep the video file after post-processing
    daterange:         A DateRange object, download only if the upload_date is in the range.
    skip_download:     Skip the actual download of the video file
    cachedir:          Location of the cache files in the filesystem.
                       False to disable filesystem cache.
    noplaylist:        Download single video instead of a playlist if in doubt.
    age_limit:         An integer representing the user's age in years.
                       Unsuitable videos for the given age are skipped.
    min_views:         An integer representing the minimum view count the video
                       must have in order to not be skipped.
                       Videos without view count information are always
                       downloaded. None for no limit.
    max_views:         An integer representing the maximum view count.
                       Videos that are more popular than that are not
                       downloaded.
                       Videos without view count information are always
                       downloaded. None for no limit.
    download_archive:  File name of a file where all downloads are recorded.
                       Videos already present in the file are not downloaded
                       again.
    break_on_existing: Stop the download process after attempting to download a
                       file that is in the archive.
    break_on_reject:   Stop the download process when encountering a video that
                       has been filtered out.
    break_per_url:     Whether break_on_reject and break_on_existing
                       should act on each input URL as opposed to for the entire queue
    cookiefile:        File name or text stream from where cookies should be read and dumped to
    cookiesfrombrowser: A tuple containing the name of the browser, the profile
                       name/path from where cookies are loaded, and the name of the
                       keyring. Eg: ('chrome', ) or ('vivaldi', 'default', 'BASICTEXT')
    legacyserverconnect: Explicitly allow HTTPS connection to servers that do not
                       support RFC 5746 secure renegotiation
    nocheckcertificate: Do not verify SSL certificates
    client_certificate: Path to client certificate file in PEM format. May include the private key
    client_certificate_key: Path to private key file for client certificate
    client_certificate_password: Password for client certificate private key, if encrypted.
                       If not provided and the key is encrypted, yt-dlp will ask interactively
    prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
                       At the moment, this is only supported by YouTube.
    http_headers:      A dictionary of custom headers to be used for all requests
    proxy:             URL of the proxy server to use
    geo_verification_proxy: URL of the proxy to use for IP address verification
                       on geo-restricted sites.
    socket_timeout:    Time to wait for unresponsive hosts, in seconds
    bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
    debug_printtraffic:Print out sent and received HTTP traffic
    default_search:    Prepend this string if an input url is not valid.
                       'auto' for elaborate guessing
    encoding:          Use this encoding instead of the system-specified.
    extract_flat:      Do not resolve URLs, return the immediate result.
                       Pass in 'in_playlist' to only show this behavior for
                       playlist items.
    wait_for_video:    If given, wait for scheduled streams to become available.
                       The value should be a tuple containing the range
                       (min_secs, max_secs) to wait between retries
    postprocessors:    A list of dictionaries, each with an entry
                       * key:  The name of the postprocessor. See
                               yt_dlp/postprocessor/__init__.py for a list.
                       * when: When to run the postprocessor. Allowed values are
                               the entries of utils.POSTPROCESS_WHEN
                               Assumed to be 'post_process' if not given
    progress_hooks:    A list of functions that get called on download
                       progress, with a dictionary with the entries
                       * status: One of "downloading", "error", or "finished".
                                 Check this first and ignore unknown values.
                       * info_dict: The extracted info_dict

                       If status is one of "downloading", or "finished", the
                       following properties may also be present:
                       * filename: The final filename (always present)
                       * tmpfilename: The filename we're currently writing to
                       * downloaded_bytes: Bytes on disk
                       * total_bytes: Size of the whole file, None if unknown
                       * total_bytes_estimate: Guess of the eventual file size,
                                               None if unavailable.
                       * elapsed: The number of seconds since download started.
                       * eta: The estimated time in seconds, None if unknown
                       * speed: The download speed in bytes/second, None if
                                unknown
                       * fragment_index: The counter of the currently
                                         downloaded video fragment.
                       * fragment_count: The number of fragments (= individual
                                         files that will be merged)

                       Progress hooks are guaranteed to be called at least once
                       (with status "finished") if the download is successful
                       (see the sketch after this list).
    postprocessor_hooks: A list of functions that get called on postprocessing
                       progress, with a dictionary with the entries
                       * status: One of "started", "processing", or "finished".
                                 Check this first and ignore unknown values.
                       * postprocessor: Name of the postprocessor
                       * info_dict: The extracted info_dict

                       Progress hooks are guaranteed to be called at least twice
                       (with status "started" and "finished") if the processing is successful.
    merge_output_format: Extension to use when merging formats.
    final_ext:         Expected final extension; used to detect when the file was
                       already downloaded and converted
    fixup:             Automatically correct known faults of the file.
                       One of:
                       - "never": do nothing
                       - "warn": only emit a warning
                       - "detect_or_warn": check whether we can do anything
                         about it, warn otherwise (default)
    source_address:    Client-side IP address to bind to.
    sleep_interval_requests: Number of seconds to sleep between requests
                       during extraction
    sleep_interval:    Number of seconds to sleep before each download when
                       used alone or a lower bound of a range for randomized
                       sleep before each download (minimum possible number
                       of seconds to sleep) when used along with
                       max_sleep_interval.
    max_sleep_interval:Upper bound of a range for randomized sleep before each
                       download (maximum possible number of seconds to sleep).
                       Must only be used along with sleep_interval.
                       Actual sleep time will be a random float from range
                       [sleep_interval; max_sleep_interval].
    sleep_interval_subtitles: Number of seconds to sleep before each subtitle download
    listformats:       Print an overview of available video formats and exit.
    list_thumbnails:   Print a table of all thumbnails and exit.
    match_filter:      A function that gets called for every video with the signature
                       (info_dict, *, incomplete: bool) -> Optional[str]
                       For backward compatibility with youtube-dl, the signature
                       (info_dict) -> Optional[str] is also allowed.
                       - If it returns a message, the video is ignored.
                       - If it returns None, the video is downloaded.
                       - If it returns utils.NO_DEFAULT, the user is interactively
                         asked whether to download the video.
                       match_filter_func in utils.py is one example for this;
                       see also the sketch after this list.
    no_color:          Do not emit color codes in output.
    geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
                       HTTP header
    geo_bypass_country:
                       Two-letter ISO 3166-1 alpha-2 country code that will be used for
                       explicit geographic restriction bypassing via faking
                       X-Forwarded-For HTTP header
    geo_bypass_ip_block:
                       IP range in CIDR notation that will be used similarly to
                       geo_bypass_country
    external_downloader: A dictionary of protocol keys and the executable of the
                       external downloader to use for it. The allowed protocols
                       are default|http|ftp|m3u8|dash|rtsp|rtmp|mms.
                       Set the value to 'native' to use the native downloader
    compat_opts:       Compatibility options. See "Differences in default behavior".
                       The following options do not work when used through the API:
                       filename, abort-on-error, multistreams, no-live-chat, format-sort,
                       no-clean-infojson, no-playlist-metafiles, no-keep-subs, no-attach-info-json.
                       Refer to __init__.py for their implementation
    progress_template: Dictionary of templates for progress outputs.
                       Allowed keys are 'download', 'postprocess',
                       'download-title' (console title) and 'postprocess-title'.
                       The template is mapped on a dictionary with keys 'progress' and 'info'
    retry_sleep_functions: Dictionary of functions that take the number of attempts
                       as argument and return the time to sleep in seconds.
                       Allowed keys are 'http', 'fragment', 'file_access'
    download_ranges:   A function that gets called for every video with the signature
                       (info_dict, *, ydl) -> Iterable[Section].
                       Only the returned sections will be downloaded. Each Section contains:
                       * start_time: Start time of the section in seconds
                       * end_time: End time of the section in seconds
                       * title: Section title (Optional)
                       * index: Section number (Optional)

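    As an illustration of the callable options above (match_filter and
    progress_hooks), a minimal sketch with made-up values:

        def longer_than_a_minute(info_dict, *, incomplete):
            duration = info_dict.get('duration')
            if duration is not None and duration < 60:
                return 'Skipping short video'  # a message means "skip"
            return None  # None means "download"

        def on_progress(progress):
            if progress['status'] == 'finished':
                print('Done downloading', progress['filename'])

        ydl = YoutubeDL({
            'match_filter': longer_than_a_minute,
            'progress_hooks': [on_progress],
        })
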
    The following parameters are not used by YoutubeDL itself; they are used by
    the downloader (see yt_dlp/downloader/common.py):
    nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize,
    max_filesize, test, noresizebuffer, retries, file_access_retries, fragment_retries,
    continuedl, noprogress, xattr_set_filesize, hls_use_mpegts, http_chunk_size,
    external_downloader_args, concurrent_fragment_downloads.

    The following options are used by the post processors:
    ffmpeg_location:   Location of the ffmpeg/avconv binary; either the path
                       to the binary or its containing directory.
    postprocessor_args: A dictionary of postprocessor/executable keys (in lower case)
                       and a list of additional command-line arguments for the
                       postprocessor/executable. The dict can also have "PP+EXE" keys
                       which are used when the given exe is used by the given PP.
                       Use 'default' as the name for arguments to be passed to all PP
                       For compatibility with youtube-dl, a single list of args
                       can also be used

    The following options are used by the extractors:
    extractor_retries: Number of times to retry for known errors
    dynamic_mpd:       Whether to process dynamic DASH manifests (default: True)
    hls_split_discontinuity: Split HLS playlists to different formats at
                       discontinuities such as ad breaks (default: False)
    extractor_args:    A dictionary of arguments to be passed to the extractors.
                       See "EXTRACTOR ARGUMENTS" for details.
                       Eg: {'youtube': {'skip': ['dash', 'hls']}}
    mark_watched:      Mark videos watched (even with --simulate). Only for YouTube

    The following options are deprecated and may be removed in the future:

    playliststart:     - Use playlist_items
                       Playlist item to start at.
    playlistend:       - Use playlist_items
                       Playlist item to end at.
    playlistreverse:   - Use playlist_items
                       Download playlist items in reverse order.
    forceurl:          - Use forceprint
                       Force printing final URL.
    forcetitle:        - Use forceprint
                       Force printing title.
    forceid:           - Use forceprint
                       Force printing ID.
    forcethumbnail:    - Use forceprint
                       Force printing thumbnail URL.
    forcedescription:  - Use forceprint
                       Force printing description.
    forcefilename:     - Use forceprint
                       Force printing final filename.
    forceduration:     - Use forceprint
                       Force printing duration.
    allsubtitles:      - Use subtitleslangs = ['all']
                       Downloads all the subtitles of the video
                       (requires writesubtitles or writeautomaticsub)
    include_ads:       - Doesn't work
                       Download ads as well
    call_home:         - Not implemented
                       Boolean, true iff we are allowed to contact the
                       yt-dlp servers for debugging.
    post_hooks:        - Register a custom postprocessor
                       A list of functions that get called as the final step
                       for each video file, after all postprocessors have been
                       called. The filename will be passed as the only argument.
    hls_prefer_native: - Use external_downloader = {'m3u8': 'native'} or {'m3u8': 'ffmpeg'}.
                       Use the native HLS downloader instead of ffmpeg/avconv
                       if True, otherwise use ffmpeg/avconv if False, otherwise
                       use downloader suggested by extractor if None.
    prefer_ffmpeg:     - avconv support is deprecated
                       If False, use avconv instead of ffmpeg if both are available,
                       otherwise prefer ffmpeg.
    youtube_include_dash_manifest: - Use extractor_args
                       If True (default), DASH manifests and related
                       data will be downloaded and processed by extractor.
                       You can reduce network I/O by disabling it if you don't
                       care about DASH. (only for youtube)
    youtube_include_hls_manifest: - Use extractor_args
                       If True (default), HLS manifests and related
                       data will be downloaded and processed by extractor.
                       You can reduce network I/O by disabling it if you don't
                       care about HLS. (only for youtube)
    """

    _NUMERIC_FIELDS = {
        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
        'timestamp', 'release_timestamp',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
    }

    _format_fields = {
        # NB: Keep in sync with the docstring of extractor/common.py
        'url', 'manifest_url', 'manifest_stream_number', 'ext', 'format', 'format_id', 'format_note',
        'width', 'height', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr',
        'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx',
        'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start',
        'preference', 'language', 'language_preference', 'quality', 'source_preference',
        'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'downloader_options',
        'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time'
    }
    _format_selection_exts = {
        'audio': {'m4a', 'mp3', 'ogg', 'aac'},
        'video': {'mp4', 'flv', 'webm', '3gp'},
        'storyboards': {'mhtml'},
    }

    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options.
        @param auto_init    Whether to load the default extractors and print header (if verbose).
                            Set to 'no_verbose_header' to not print the header
        """
        if params is None:
            params = {}
        self.params = params
        self._ies = {}
        self._ies_instances = {}
        self._pps = {k: [] for k in POSTPROCESS_WHEN}
        self._printed_messages = set()
        self._first_webpage_request = True
        self._post_hooks = []
        self._progress_hooks = []
        self._postprocessor_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        self._num_videos = 0
        self._playlist_level = 0
        self._playlist_urls = set()
        self.cache = Cache(self)

        windows_enable_vt_mode()
        stdout = sys.stderr if self.params.get('logtostderr') else sys.stdout
        self._out_files = Namespace(
            out=stdout,
            error=sys.stderr,
            screen=sys.stderr if self.params.get('quiet') else stdout,
            console=None if compat_os_name == 'nt' else next(
                filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None)
        )
        self._allow_colors = Namespace(**{
            type_: not self.params.get('no_color') and supports_terminal_sequences(stream)
            for type_, stream in self._out_files.items_ if type_ != 'console'
        })

        MIN_SUPPORTED, MIN_RECOMMENDED = (3, 6), (3, 7)
        current_version = sys.version_info[:2]
        if current_version < MIN_RECOMMENDED:
            msg = ('Support for Python version %d.%d has been deprecated. '
                   'See https://github.com/yt-dlp/yt-dlp/issues/3764 for more details. '
                   'You will receive only one more update on this version')
            if current_version < MIN_SUPPORTED:
                msg = 'Python version %d.%d is no longer supported'
            self.deprecation_warning(
                f'{msg}! Please update to Python %d.%d or above' % (*current_version, *MIN_RECOMMENDED))

        if self.params.get('allow_unplayable_formats'):
            self.report_warning(
                f'You have asked for {self._format_err("UNPLAYABLE", self.Styles.EMPHASIS)} formats to be listed/downloaded. '
                'This is a developer option intended for debugging. \n'
                '         If you experience any issues while using this option, '
                f'{self._format_err("DO NOT", self.Styles.ERROR)} open a bug report')

        def check_deprecated(param, option, suggestion):
            if self.params.get(param) is not None:
                self.report_warning(f'{option} is deprecated. Use {suggestion} instead')
                return True
            return False

        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
        check_deprecated('useid', '--id', '-o "%(id)s.%(ext)s"')

        for msg in self.params.get('_warnings', []):
            self.report_warning(msg)
        for msg in self.params.get('_deprecation_warnings', []):
            self.deprecation_warning(msg)

        self.params['compat_opts'] = set(self.params.get('compat_opts', ()))
        if not compat_has_legacy:
            self.params['compat_opts'].add('no-compat-legacy')
        if 'list-formats' in self.params['compat_opts']:
            self.params['listformats_table'] = False

        if 'overwrites' not in self.params and self.params.get('nooverwrites') is not None:
            # nooverwrites was unnecessarily changed to overwrites
            # in 0c3d0f51778b153f65c21906031c2e091fcfb641
            # This ensures compatibility with both keys
            self.params['overwrites'] = not self.params['nooverwrites']
        elif self.params.get('overwrites') is None:
            self.params.pop('overwrites', None)
        else:
            self.params['nooverwrites'] = not self.params['overwrites']

        self.params.setdefault('forceprint', {})
        self.params.setdefault('print_to_file', {})

        # Compatibility with older syntax
        if not isinstance(params['forceprint'], dict):
            self.params['forceprint'] = {'video': params['forceprint']}

        if self.params.get('bidi_workaround', False):
            try:
                import pty
                master, slave = pty.openpty()
                width = shutil.get_terminal_size().columns
                width_args = [] if width is None else ['-w', str(width)]
                sp_kwargs = {'stdin': subprocess.PIPE, 'stdout': slave, 'stderr': self._out_files.error}
                try:
                    self._output_process = Popen(['bidiv'] + width_args, **sp_kwargs)
                except OSError:
                    self._output_process = Popen(['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning(
                        'Could not find fribidi executable, ignoring --bidi-workaround. '
                        'Make sure that fribidi is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if auto_init:
            if auto_init != 'no_verbose_header':
                self.print_debug_header()
            self.add_default_info_extractors()

        if (sys.platform != 'win32'
                and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
                and not self.params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        self._parse_outtmpl()

        # Creating format selector here allows us to catch syntax errors before the extraction
        self.format_selector = (
            self.params.get('format') if self.params.get('format') in (None, '-')
            else self.params['format'] if callable(self.params['format'])
            else self.build_format_selector(self.params['format']))

        # Set http_headers defaults according to std_headers
        self.params['http_headers'] = merge_headers(std_headers, self.params.get('http_headers', {}))

        hooks = {
            'post_hooks': self.add_post_hook,
            'progress_hooks': self.add_progress_hook,
            'postprocessor_hooks': self.add_postprocessor_hook,
        }
        for opt, fn in hooks.items():
            for ph in self.params.get(opt, []):
                fn(ph)

        for pp_def_raw in self.params.get('postprocessors', []):
            pp_def = dict(pp_def_raw)
            when = pp_def.pop('when', 'post_process')
            self.add_post_processor(
                get_postprocessor(pp_def.pop('key'))(self, **pp_def),
                when=when)

        self._setup_opener()
        register_socks_protocols()

        def preload_download_archive(fn):
            """Preload the archive, if any is specified"""
            if fn is None:
                return False
            self.write_debug(f'Loading archive file {fn!r}')
            try:
                with locked_file(fn, 'r', encoding='utf-8') as archive_file:
                    for line in archive_file:
                        self.archive.add(line.strip())
            except OSError as ioe:
                if ioe.errno != errno.ENOENT:
                    raise
                return False
            return True

        self.archive = set()
        preload_download_archive(self.params.get('download_archive'))
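        # Each archive line identifies one video as '<extractor> <id>',
        # e.g. 'youtube BaW_jenozKc' (illustrative ID)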

    def warn_if_short_id(self, argv):
        # short YouTube ID starting with dash?
        idxs = [
            i for i, a in enumerate(argv)
            if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
        if idxs:
            correct_argv = (
                ['yt-dlp']
                + [a for i, a in enumerate(argv) if i not in idxs]
                + ['--'] + [argv[i] for i in idxs]
            )
            self.report_warning(
                'Long argument string detected. '
                'Use -- to separate parameters and URLs, like this:\n%s' %
                args_to_str(correct_argv))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        ie_key = ie.ie_key()
        self._ies[ie_key] = ie
        if not isinstance(ie, type):
            self._ies_instances[ie_key] = ie
            ie.set_downloader(self)

    def _get_info_extractor_class(self, ie_key):
        ie = self._ies.get(ie_key)
        if ie is None:
            ie = get_info_extractor(ie_key)
            self.add_info_extractor(ie)
        return ie

    def get_info_extractor(self, ie_key):
        """
        Get an instance of an IE with name ie_key, it will try to get one from
        the _ies list, if there's no instance it will create a new one and add
        it to the extractor list.
        """
        ie = self._ies_instances.get(ie_key)
        if ie is None:
            ie = get_info_extractor(ie_key)()
            self.add_info_extractor(ie)
        return ie

    def add_default_info_extractors(self):
        """
        Add the InfoExtractors returned by gen_extractors to the end of the list
        """
        for ie in gen_extractor_classes():
            self.add_info_extractor(ie)

    def add_post_processor(self, pp, when='post_process'):
        """Add a PostProcessor object to the end of the chain."""
        assert when in POSTPROCESS_WHEN, f'Invalid when={when}'
        self._pps[when].append(pp)
        pp.set_downloader(self)

    def add_post_hook(self, ph):
        """Add the post hook"""
        self._post_hooks.append(ph)

    def add_progress_hook(self, ph):
        """Add the download progress hook"""
        self._progress_hooks.append(ph)

    def add_postprocessor_hook(self, ph):
        """Add the postprocessing progress hook"""
        self._postprocessor_hooks.append(ph)
        for pps in self._pps.values():
            for pp in pps:
                pp.add_progress_hook(ph)

    def _bidi_workaround(self, message):
        if not hasattr(self, '_output_channel'):
            return message

        assert hasattr(self, '_output_process')
        assert isinstance(message, str)
        line_count = message.count('\n') + 1
        self._output_process.stdin.write((message + '\n').encode())
        self._output_process.stdin.flush()
        res = ''.join(self._output_channel.readline().decode()
                      for _ in range(line_count))
        return res[:-len('\n')]

    def _write_string(self, message, out=None, only_once=False):
        if only_once:
            if message in self._printed_messages:
                return
            self._printed_messages.add(message)
        write_string(message, out=out, encoding=self.params.get('encoding'))

    def to_stdout(self, message, skip_eol=False, quiet=None):
        """Print message to stdout"""
        if quiet is not None:
            self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument quiet. Use "YoutubeDL.to_screen" instead')
        if skip_eol is not False:
            self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument skip_eol. Use "YoutubeDL.to_screen" instead')
        self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.out)

    def to_screen(self, message, skip_eol=False, quiet=None):
        """Print message to screen if not in quiet mode"""
        if self.params.get('logger'):
            self.params['logger'].debug(message)
            return
        if (self.params.get('quiet') if quiet is None else quiet) and not self.params.get('verbose'):
            return
        self._write_string(
            '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')),
            self._out_files.screen)

    def to_stderr(self, message, only_once=False):
        """Print message to stderr"""
        assert isinstance(message, str)
        if self.params.get('logger'):
            self.params['logger'].error(message)
        else:
            self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.error, only_once=only_once)

    def _send_console_code(self, code):
        if compat_os_name == 'nt' or not self._out_files.console:
            return
        self._write_string(code, self._out_files.console)

    def to_console_title(self, message):
        if not self.params.get('consoletitle', False):
            return
        message = remove_terminal_sequences(message)
        if compat_os_name == 'nt':
            if ctypes.windll.kernel32.GetConsoleWindow():
                # c_wchar_p() might not be necessary if `message` is
                # already of type unicode()
                ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        else:
            self._send_console_code(f'\033]0;{message}\007')

    def save_console_title(self):
        if not self.params.get('consoletitle') or self.params.get('simulate'):
            return
        self._send_console_code('\033[22;0t')  # Save the title on stack

    def restore_console_title(self):
        if not self.params.get('consoletitle') or self.params.get('simulate'):
            return
        self._send_console_code('\033[23;0t')  # Restore the title from stack

    def __enter__(self):
        self.save_console_title()
        return self

    def __exit__(self, *args):
        self.restore_console_title()

        if self.params.get('cookiefile') is not None:
            self.cookiejar.save(ignore_discard=True, ignore_expires=True)

    def trouble(self, message=None, tb=None, is_error=True):
        """Determine action to take when a download problem appears.

        Depending on whether the downloader has been configured to ignore
        download errors, this method may throw an exception or simply
        print the message after errors are found.

        @param tb          If given, is additional traceback information
        @param is_error    Whether to raise error according to ignoreerrors
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            if tb:
                self.to_stderr(tb)
        if not is_error:
            return
        if not self.params.get('ignoreerrors'):
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        self._download_retcode = 1

    Styles = Namespace(
        HEADERS='yellow',
        EMPHASIS='light blue',
        FILENAME='green',
        ID='green',
        DELIM='blue',
        ERROR='red',
        WARNING='yellow',
        SUPPRESS='light black',
    )
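    # These styles are consumed by the _format_* helpers below,
    # e.g. self._format_err('WARNING:', self.Styles.WARNING)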

    def _format_text(self, handle, allow_colors, text, f, fallback=None, *, test_encoding=False):
        text = str(text)
        if test_encoding:
            original_text = text
            # handle.encoding can be None. See https://github.com/yt-dlp/yt-dlp/issues/2711
            encoding = self.params.get('encoding') or getattr(handle, 'encoding', None) or 'ascii'
            text = text.encode(encoding, 'ignore').decode(encoding)
            if fallback is not None and text != original_text:
                text = fallback
        return format_text(text, f) if allow_colors else text if fallback is None else fallback

    def _format_out(self, *args, **kwargs):
        return self._format_text(self._out_files.out, self._allow_colors.out, *args, **kwargs)

    def _format_screen(self, *args, **kwargs):
        return self._format_text(self._out_files.screen, self._allow_colors.screen, *args, **kwargs)

    def _format_err(self, *args, **kwargs):
        return self._format_text(self._out_files.error, self._allow_colors.error, *args, **kwargs)

    def report_warning(self, message, only_once=False):
        '''
        Print the message to stderr; it will be prefixed with 'WARNING:'.
        If stderr is a tty file, the 'WARNING:' will be colored.
        '''
        if self.params.get('logger') is not None:
            self.params['logger'].warning(message)
        else:
            if self.params.get('no_warnings'):
                return
            self.to_stderr(f'{self._format_err("WARNING:", self.Styles.WARNING)} {message}', only_once)

    def deprecation_warning(self, message):
        if self.params.get('logger') is not None:
            self.params['logger'].warning(f'DeprecationWarning: {message}')
        else:
            self.to_stderr(f'{self._format_err("DeprecationWarning:", self.Styles.ERROR)} {message}', True)

    def report_error(self, message, *args, **kwargs):
        '''
        Do the same as trouble, but prefixes the message with 'ERROR:', colored
        in red if stderr is a tty file.
        '''
        self.trouble(f'{self._format_err("ERROR:", self.Styles.ERROR)} {message}', *args, **kwargs)

    def write_debug(self, message, only_once=False):
        '''Log debug message or Print message to stderr'''
        if not self.params.get('verbose', False):
            return
        message = f'[debug] {message}'
        if self.params.get('logger'):
            self.params['logger'].debug(message)
        else:
            self.to_stderr(message, only_once)

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen('[download] %s has already been downloaded' % file_name)
        except UnicodeEncodeError:
            self.to_screen('[download] The file has already been downloaded')

    def report_file_delete(self, file_name):
        """Report that existing file will be deleted."""
        try:
            self.to_screen('Deleting existing file %s' % file_name)
        except UnicodeEncodeError:
            self.to_screen('Deleting existing file')

    def raise_no_formats(self, info, forced=False, *, msg=None):
        has_drm = info.get('_has_drm')
        ignored, expected = self.params.get('ignore_no_formats_error'), bool(msg)
        msg = msg or has_drm and 'This video is DRM protected' or 'No video formats found!'
        if forced or not ignored:
            raise ExtractorError(msg, video_id=info['id'], ie=info['extractor'],
                                 expected=has_drm or ignored or expected)
        else:
            self.report_warning(msg)

    def parse_outtmpl(self):
        self.deprecation_warning('"YoutubeDL.parse_outtmpl" is deprecated and may be removed in a future version')
        self._parse_outtmpl()
        return self.params['outtmpl']

    def _parse_outtmpl(self):
        sanitize = IDENTITY
        if self.params.get('restrictfilenames'):  # Remove spaces in the default template
            sanitize = lambda x: x.replace(' - ', ' ').replace(' ', '-')

        outtmpl = self.params.setdefault('outtmpl', {})
        if not isinstance(outtmpl, dict):
            self.params['outtmpl'] = outtmpl = {'default': outtmpl}
        outtmpl.update({k: sanitize(v) for k, v in DEFAULT_OUTTMPL.items() if outtmpl.get(k) is None})

    def get_output_path(self, dir_type='', filename=None):
        paths = self.params.get('paths', {})
        assert isinstance(paths, dict)
        path = os.path.join(
            expand_path(paths.get('home', '').strip()),
            expand_path(paths.get(dir_type, '').strip()) if dir_type else '',
            filename or '')
        return sanitize_path(path, force=self.params.get('windowsfilenames'))
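        # For instance, with params {'paths': {'home': '~/Videos', 'temp': 'tmp'}}
        # (illustrative values), get_output_path('temp', 'x.mp4') joins the
        # expanded '~/Videos' with 'tmp' and 'x.mp4'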

    @staticmethod
    def _outtmpl_expandpath(outtmpl):
        # expand_path translates '%%' into '%' and '$$' into '$'
        # correspondingly that is not what we want since we need to keep
        # '%%' intact for template dict substitution step. Working around
        # with boundary-alike separator hack.
        sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
        outtmpl = outtmpl.replace('%%', f'%{sep}%').replace('$$', f'${sep}$')

        # outtmpl should be expand_path'ed before template dict substitution
        # because meta fields may contain env variables we don't want to
        # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
        # title "Hello $PATH", we don't want `$PATH` to be expanded.
        return expand_path(outtmpl).replace(sep, '')

    @staticmethod
    def escape_outtmpl(outtmpl):
        ''' Escape any remaining strings like %s, %abc% etc. '''
        return re.sub(
            STR_FORMAT_RE_TMPL.format('', '(?![%(\0])'),
            lambda mobj: ('' if mobj.group('has_key') else '%') + mobj.group(0),
            outtmpl)
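        # A sketch: escape_outtmpl('100% of %(title)s') should yield
        # '100%% of %(title)s' - the bare '%' is doubled while the valid
        # template key '%(title)s' is left untouched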

    @classmethod
    def validate_outtmpl(cls, outtmpl):
        ''' @return None or Exception object '''
        outtmpl = re.sub(
            STR_FORMAT_RE_TMPL.format('[^)]*', '[ljhqBUDS]'),
            lambda mobj: f'{mobj.group(0)[:-1]}s',
            cls._outtmpl_expandpath(outtmpl))
        try:
            cls.escape_outtmpl(outtmpl) % collections.defaultdict(int)
            return None
        except ValueError as err:
            return err
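        # A sketch of the intended use:
        #     if err := YoutubeDL.validate_outtmpl('%(title)s.%(ext)s'):
        #         raise err  # a well-formed template returns None instead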

    @staticmethod
    def _copy_infodict(info_dict):
        info_dict = dict(info_dict)
        info_dict.pop('__postprocessors', None)
        info_dict.pop('__pending_error', None)
        return info_dict

    def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False):
        """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict
        @param sanitize    Whether to sanitize the output as a filename.
                           For backward compatibility, a function can also be passed
        """

        info_dict.setdefault('epoch', int(time.time()))  # keep epoch consistent once set

        info_dict = self._copy_infodict(info_dict)
        info_dict['duration_string'] = (  # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
            formatSeconds(info_dict['duration'], '-' if sanitize else ':')
            if info_dict.get('duration', None) is not None
            else None)
        info_dict['autonumber'] = int(self.params.get('autonumber_start', 1) - 1 + self._num_downloads)
        info_dict['video_autonumber'] = self._num_videos
        if info_dict.get('resolution') is None:
            info_dict['resolution'] = self.format_resolution(info_dict, default=None)

        # For fields playlist_index, playlist_autonumber and autonumber convert all occurrences
        # of %(field)s to %(field)0Nd for backward compatibility
        field_size_compat_map = {
            'playlist_index': number_of_digits(info_dict.get('__last_playlist_index') or 0),
            'playlist_autonumber': number_of_digits(info_dict.get('n_entries') or 0),
            'autonumber': self.params.get('autonumber_size') or 5,
        }

        TMPL_DICT = {}
        EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljhqBUDS]'))
        MATH_FUNCTIONS = {
            '+': float.__add__,
            '-': float.__sub__,
        }
        # Field is of the form key1.key2...
        # where keys (except first) can be string, int or slice
        FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)')
        MATH_FIELD_RE = rf'(?:{FIELD_RE}|-?{NUMBER_RE})'
        MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys()))
        INTERNAL_FORMAT_RE = re.compile(rf'''(?x)
            (?P<negate>-)?
            (?P<fields>{FIELD_RE})
            (?P<maths>(?:{MATH_OPERATORS_RE}{MATH_FIELD_RE})*)
            (?:>(?P<strf_format>.+?))?
            (?P<remaining>
                (?P<alternate>(?<!\\),[^|&)]+)?
                (?:&(?P<replacement>.*?))?
                (?:\|(?P<default>.*?))?
            )$''')
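        # e.g. for the template field 'playlist_index+10', fields is
        # 'playlist_index' and maths is '+10'; for 'title&Got %s|untitled',
        # replacement is 'Got %s' and default is 'untitled' (illustrative)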

        def _traverse_infodict(k):
            k = k.split('.')
            if k[0] == '':
                k.pop(0)
            return traverse_obj(info_dict, k, is_user_input=True, traverse_string=True)

        def get_value(mdict):
            # Object traversal
            value = _traverse_infodict(mdict['fields'])
            # Negative
            if mdict['negate']:
                value = float_or_none(value)
                if value is not None:
                    value *= -1
            # Do maths
            offset_key = mdict['maths']
            if offset_key:
                value = float_or_none(value)
                operator = None
                while offset_key:
                    item = re.match(
                        MATH_FIELD_RE if operator else MATH_OPERATORS_RE,
                        offset_key).group(0)
                    offset_key = offset_key[len(item):]
                    if operator is None:
                        operator = MATH_FUNCTIONS[item]
                        continue
                    item, multiplier = (item[1:], -1) if item[0] == '-' else (item, 1)
                    offset = float_or_none(item)
                    if offset is None:
                        offset = float_or_none(_traverse_infodict(item))
                    try:
                        value = operator(value, multiplier * offset)
                    except (TypeError, ZeroDivisionError):
                        return None
                    operator = None
            # Datetime formatting
            if mdict['strf_format']:
                value = strftime_or_none(value, mdict['strf_format'].replace('\\,', ','))

            return value

        na = self.params.get('outtmpl_na_placeholder', 'NA')

        def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')):
            return sanitize_filename(str(value), restricted=restricted, is_id=(
                bool(re.search(r'(^|[_.])id(\.|$)', key))
                if 'filename-sanitization' in self.params['compat_opts']
                else NO_DEFAULT))

        sanitizer = sanitize if callable(sanitize) else filename_sanitizer
        sanitize = bool(sanitize)

        def _dumpjson_default(obj):
            if isinstance(obj, (set, LazyList)):
                return list(obj)
            return repr(obj)

        def create_key(outer_mobj):
            if not outer_mobj.group('has_key'):
                return outer_mobj.group(0)
            key = outer_mobj.group('key')
            mobj = re.match(INTERNAL_FORMAT_RE, key)
            initial_field = mobj.group('fields') if mobj else ''
            value, replacement, default = None, None, na
            while mobj:
                mobj = mobj.groupdict()
                default = mobj['default'] if mobj['default'] is not None else default
                value = get_value(mobj)
                replacement = mobj['replacement']
                if value is None and mobj['alternate']:
                    mobj = re.match(INTERNAL_FORMAT_RE, mobj['remaining'][1:])
                else:
                    break

            fmt = outer_mobj.group('format')
            if fmt == 's' and value is not None and key in field_size_compat_map.keys():
                fmt = f'0{field_size_compat_map[key]:d}d'

            value = default if value is None else value if replacement is None else replacement

            flags = outer_mobj.group('conversion') or ''
            str_fmt = f'{fmt[:-1]}s'
            if fmt[-1] == 'l':  # list
                delim = '\n' if '#' in flags else ', '
                value, fmt = delim.join(map(str, variadic(value, allowed_types=(str, bytes)))), str_fmt
            elif fmt[-1] == 'j':  # json
                value, fmt = json.dumps(value, default=_dumpjson_default, indent=4 if '#' in flags else None), str_fmt
            elif fmt[-1] == 'h':  # html
                value, fmt = escapeHTML(value), str_fmt
            elif fmt[-1] == 'q':  # quoted
                value = map(str, variadic(value) if '#' in flags else [value])
                value, fmt = ' '.join(map(compat_shlex_quote, value)), str_fmt
            elif fmt[-1] == 'B':  # bytes
                value = f'%{str_fmt}'.encode() % str(value).encode()
                value, fmt = value.decode('utf-8', 'ignore'), 's'
            elif fmt[-1] == 'U':  # unicode normalized
                value, fmt = unicodedata.normalize(
                    # "+" = compatibility equivalence, "#" = NFD
                    'NF%s%s' % ('K' if '+' in flags else '', 'D' if '#' in flags else 'C'),
                    value), str_fmt
            elif fmt[-1] == 'D':  # decimal suffix
                num_fmt, fmt = fmt[:-1].replace('#', ''), 's'
                value = format_decimal_suffix(value, f'%{num_fmt}f%s' if num_fmt else '%d%s',
                                              factor=1024 if '#' in flags else 1000)
            elif fmt[-1] == 'S':  # filename sanitization
                value, fmt = filename_sanitizer(initial_field, value, restricted='#' in flags), str_fmt
            elif fmt[-1] == 'c':
                if value:
                    value = str(value)[0]
                else:
                    fmt = str_fmt
            elif fmt[-1] not in 'rs':  # numeric
                value = float_or_none(value)
                if value is None:
                    value, fmt = default, 's'

            if sanitize:
                if fmt[-1] == 'r':
                    # If value is an object, sanitize might convert it to a string
                    # So we convert it to repr first
                    value, fmt = repr(value), str_fmt
                if fmt[-1] in 'csr':
                    value = sanitizer(initial_field, value)

            key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format'))
            TMPL_DICT[key] = value
            return '{prefix}%({key}){fmt}'.format(key=key, fmt=fmt, prefix=outer_mobj.group('prefix'))

        return EXTERNAL_FORMAT_RE.sub(create_key, outtmpl), TMPL_DICT

    def evaluate_outtmpl(self, outtmpl, info_dict, *args, **kwargs):
        outtmpl, info_dict = self.prepare_outtmpl(outtmpl, info_dict, *args, **kwargs)
        return self.escape_outtmpl(outtmpl) % info_dict
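        # A sketch of the round trip (field values are illustrative):
        #     ydl.evaluate_outtmpl('%(title)s [%(id)s].%(ext)s',
        #                          {'title': 'Example', 'id': 'abc123', 'ext': 'mp4'})
        #     # -> 'Example [abc123].mp4'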

    def _prepare_filename(self, info_dict, *, outtmpl=None, tmpl_type=None):
        assert None in (outtmpl, tmpl_type), 'outtmpl and tmpl_type are mutually exclusive'
        if outtmpl is None:
            outtmpl = self.params['outtmpl'].get(tmpl_type or 'default', self.params['outtmpl']['default'])
        try:
            outtmpl = self._outtmpl_expandpath(outtmpl)
            filename = self.evaluate_outtmpl(outtmpl, info_dict, True)
            if not filename:
                return None

            if tmpl_type in ('', 'temp'):
                final_ext, ext = self.params.get('final_ext'), info_dict.get('ext')
                if final_ext and ext and final_ext != ext and filename.endswith(f'.{final_ext}'):
                    filename = replace_extension(filename, ext, final_ext)
            elif tmpl_type:
                force_ext = OUTTMPL_TYPES[tmpl_type]
                if force_ext:
                    filename = replace_extension(filename, force_ext, info_dict.get('ext'))

            # https://github.com/blackjack4494/youtube-dlc/issues/85
            trim_file_name = self.params.get('trim_file_name', False)
            if trim_file_name:
                no_ext, *ext = filename.rsplit('.', 2)
                filename = join_nonempty(no_ext[:trim_file_name], *ext, delim='.')

            return filename
        except ValueError as err:
            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
            return None

    def prepare_filename(self, info_dict, dir_type='', *, outtmpl=None, warn=False):
        """Generate the output filename"""
        if outtmpl:
            assert not dir_type, 'outtmpl and dir_type are mutually exclusive'
            dir_type = None
        filename = self._prepare_filename(info_dict, tmpl_type=dir_type, outtmpl=outtmpl)
        if not filename and dir_type not in ('', 'temp'):
            return ''

        if warn:
            if not self.params.get('paths'):
                pass
            elif filename == '-':
                self.report_warning('--paths is ignored when outputting to stdout', only_once=True)
            elif os.path.isabs(filename):
                self.report_warning('--paths is ignored since an absolute path is given in output template', only_once=True)
        if filename == '-' or not filename:
            return filename

        return self.get_output_path(dir_type, filename)
1299
1300 def _match_entry(self, info_dict, incomplete=False, silent=False):
1301 """ Returns None if the file should be downloaded """
1302
1303 video_title = info_dict.get('title', info_dict.get('id', 'video'))
1304
1305 def check_filter():
1306 if 'title' in info_dict:
1307 # This can happen when we're just evaluating the playlist
1308 title = info_dict['title']
1309 matchtitle = self.params.get('matchtitle', False)
1310 if matchtitle:
1311 if not re.search(matchtitle, title, re.IGNORECASE):
1312 return '"' + title + '" title did not match pattern "' + matchtitle + '"'
1313 rejecttitle = self.params.get('rejecttitle', False)
1314 if rejecttitle:
1315 if re.search(rejecttitle, title, re.IGNORECASE):
1316 return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
1317 date = info_dict.get('upload_date')
1318 if date is not None:
1319 dateRange = self.params.get('daterange', DateRange())
1320 if date not in dateRange:
1321 return f'{date_from_str(date).isoformat()} upload date is not in range {dateRange}'
1322 view_count = info_dict.get('view_count')
1323 if view_count is not None:
1324 min_views = self.params.get('min_views')
1325 if min_views is not None and view_count < min_views:
1326 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
1327 max_views = self.params.get('max_views')
1328 if max_views is not None and view_count > max_views:
1329 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
1330 if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
1331 return 'Skipping "%s" because it is age restricted' % video_title
1332
1333 match_filter = self.params.get('match_filter')
1334 if match_filter is not None:
1335 try:
1336 ret = match_filter(info_dict, incomplete=incomplete)
1337 except TypeError:
1338 # For backward compatibility
1339 ret = None if incomplete else match_filter(info_dict)
1340 if ret is NO_DEFAULT:
1341 while True:
1342 filename = self._format_screen(self.prepare_filename(info_dict), self.Styles.FILENAME)
1343 reply = input(self._format_screen(
1344 f'Download "{filename}"? (Y/n): ', self.Styles.EMPHASIS)).lower().strip()
1345 if reply in {'y', ''}:
1346 return None
1347 elif reply == 'n':
1348 return f'Skipping {video_title}'
1349 elif ret is not None:
1350 return ret
1351 return None
1352
1353 if self.in_download_archive(info_dict):
1354 reason = '%s has already been recorded in the archive' % video_title
1355 break_opt, break_err = 'break_on_existing', ExistingVideoReached
1356 else:
1357 reason = check_filter()
1358 break_opt, break_err = 'break_on_reject', RejectedVideoReached
1359 if reason is not None:
1360 if not silent:
1361 self.to_screen('[download] ' + reason)
1362 if self.params.get(break_opt, False):
1363 raise break_err()
1364 return reason
1365
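# A sketch of a custom match_filter callable for the hook above
# (hypothetical thresholds; the keyword-argument form is the modern
# signature, the positional form is the backward-compatible one):
#
#   def only_short_videos(info_dict, *, incomplete=False):
#       duration = info_dict.get('duration')
#       if incomplete and duration is None:
#           return None  # not enough metadata yet - let it through
#       if duration and duration > 600:
#           return 'Skipping: longer than 10 minutes'
#       return None  # None means "download"
#
#   ydl = YoutubeDL({'match_filter': only_short_videos})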
1366 @staticmethod
1367 def add_extra_info(info_dict, extra_info):
1368 '''Set the keys from extra_info in info_dict if they are missing'''
1369 for key, value in extra_info.items():
1370 info_dict.setdefault(key, value)
1371
1372 def extract_info(self, url, download=True, ie_key=None, extra_info=None,
1373 process=True, force_generic_extractor=False):
1374 """
1375 Return a list with a dictionary for each video extracted.
1376
1377 Arguments:
1378 url -- URL to extract
1379
1380 Keyword arguments:
1381 download -- whether to download videos during extraction
1382 ie_key -- extractor key hint
1383 extra_info -- dictionary containing the extra values to add to each result
1384 process -- whether to resolve all unresolved references (URLs, playlist items),
1385 must be True for download to work.
1386 force_generic_extractor -- force using the generic extractor
1387 """
1388
1389 if extra_info is None:
1390 extra_info = {}
1391
1392 if not ie_key and force_generic_extractor:
1393 ie_key = 'Generic'
1394
1395 if ie_key:
1396 ies = {ie_key: self._get_info_extractor_class(ie_key)}
1397 else:
1398 ies = self._ies
1399
1400 for ie_key, ie in ies.items():
1401 if not ie.suitable(url):
1402 continue
1403
1404 if not ie.working():
1405 self.report_warning('The program functionality for this site has been marked as broken, '
1406 'and will probably not work.')
1407
1408 temp_id = ie.get_temp_id(url)
1409 if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
1410 self.to_screen(f'[{ie_key}] {temp_id}: has already been recorded in the archive')
1411 if self.params.get('break_on_existing', False):
1412 raise ExistingVideoReached()
1413 break
1414 return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process)
1415 else:
1416 self.report_error('no suitable InfoExtractor for URL %s' % url)
1417
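# A minimal usage sketch (hypothetical URL, assumes network access):
#
#   with YoutubeDL({'skip_download': True}) as ydl:
#       info = ydl.extract_info('https://example.com/watch?v=xyz', download=False)
#       print(info.get('title'))
#
# Passing ie_key (e.g. ie_key='Youtube') skips the suitability scan above
# and dispatches straight to the named extractor.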
1418 def _handle_extraction_exceptions(func):
1419 @functools.wraps(func)
1420 def wrapper(self, *args, **kwargs):
1421 while True:
1422 try:
1423 return func(self, *args, **kwargs)
1424 except (DownloadCancelled, LazyList.IndexError, PagedList.IndexError):
1425 raise
1426 except ReExtractInfo as e:
1427 if e.expected:
1428 self.to_screen(f'{e}; Re-extracting data')
1429 else:
1430 self.to_stderr('\r')
1431 self.report_warning(f'{e}; Re-extracting data')
1432 continue
1433 except GeoRestrictedError as e:
1434 msg = e.msg
1435 if e.countries:
1436 msg += '\nThis video is available in %s.' % ', '.join(
1437 map(ISO3166Utils.short2full, e.countries))
1438 msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to work around this.'
1439 self.report_error(msg)
1440 except ExtractorError as e: # An error we somewhat expected
1441 self.report_error(str(e), e.format_traceback())
1442 except Exception as e:
1443 if self.params.get('ignoreerrors'):
1444 self.report_error(str(e), tb=encode_compat_str(traceback.format_exc()))
1445 else:
1446 raise
1447 break
1448 return wrapper
1449
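# _handle_extraction_exceptions is a retry-on-ReExtractInfo loop. A
# stripped-down sketch of the same decorator pattern (illustrative only,
# not used elsewhere in this module):
#
#   def retry_on(exc_type):
#       def decorator(func):
#           @functools.wraps(func)
#           def wrapper(*args, **kwargs):
#               while True:
#                   try:
#                       return func(*args, **kwargs)
#                   except exc_type:
#                       continue  # re-run the wrapped call
#           return wrapper
#       return decorator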
1450 def _wait_for_video(self, ie_result):
1451 if (not self.params.get('wait_for_video')
1452 or ie_result.get('_type', 'video') != 'video'
1453 or ie_result.get('formats') or ie_result.get('url')):
1454 return
1455
1456 format_dur = lambda dur: '%02d:%02d:%02d' % timetuple_from_msec(dur * 1000)[:-1]
1457 last_msg = ''
1458
1459 def progress(msg):
1460 nonlocal last_msg
1461 self.to_screen(msg + ' ' * (len(last_msg) - len(msg)) + '\r', skip_eol=True)
1462 last_msg = msg
1463
1464 min_wait, max_wait = self.params.get('wait_for_video')
1465 diff = try_get(ie_result, lambda x: x['release_timestamp'] - time.time())
1466 if diff is None and ie_result.get('live_status') == 'is_upcoming':
1467 diff = round(random.uniform(min_wait, max_wait) if (max_wait and min_wait) else (max_wait or min_wait), 0)
1468 self.report_warning('Release time of video is not known')
1469 elif (diff or 0) <= 0:
1470 self.report_warning('Video should already be available according to extracted info')
1471 diff = min(max(diff or 0, min_wait or 0), max_wait or float('inf'))
1472 self.to_screen(f'[wait] Waiting for {format_dur(diff)} - Press Ctrl+C to try now')
1473
1474 wait_till = time.time() + diff
1475 try:
1476 while True:
1477 diff = wait_till - time.time()
1478 if diff <= 0:
1479 progress('')
1480 raise ReExtractInfo('[wait] Wait period ended', expected=True)
1481 progress(f'[wait] Remaining time until next attempt: {self._format_screen(format_dur(diff), self.Styles.EMPHASIS)}')
1482 time.sleep(1)
1483 except KeyboardInterrupt:
1484 progress('')
1485 raise ReExtractInfo('[wait] Interrupted by user', expected=True)
1486 except BaseException as e:
1487 if not isinstance(e, ReExtractInfo):
1488 self.to_screen('')
1489 raise
1490
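# Worked example for the countdown above: format_dur(3661) -> '01:01:01',
# since timetuple_from_msec(3661000)[:-1] == (1, 1, 1). With
# --wait-for-video MIN[-MAX], an unknown release time waits a random span
# within the bounds, while a known release time is clamped to [MIN, MAX].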
1491 @_handle_extraction_exceptions
1492 def __extract_info(self, url, ie, download, extra_info, process):
1493 ie_result = ie.extract(url)
1494 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
1495 return
1496 if isinstance(ie_result, list):
1497 # Backwards compatibility: old IE result format
1498 ie_result = {
1499 '_type': 'compat_list',
1500 'entries': ie_result,
1501 }
1502 if extra_info.get('original_url'):
1503 ie_result.setdefault('original_url', extra_info['original_url'])
1504 self.add_default_extra_info(ie_result, ie, url)
1505 if process:
1506 self._wait_for_video(ie_result)
1507 return self.process_ie_result(ie_result, download, extra_info)
1508 else:
1509 return ie_result
1510
1511 def add_default_extra_info(self, ie_result, ie, url):
1512 if url is not None:
1513 self.add_extra_info(ie_result, {
1514 'webpage_url': url,
1515 'original_url': url,
1516 })
1517 webpage_url = ie_result.get('webpage_url')
1518 if webpage_url:
1519 self.add_extra_info(ie_result, {
1520 'webpage_url_basename': url_basename(webpage_url),
1521 'webpage_url_domain': get_domain(webpage_url),
1522 })
1523 if ie is not None:
1524 self.add_extra_info(ie_result, {
1525 'extractor': ie.IE_NAME,
1526 'extractor_key': ie.ie_key(),
1527 })
1528
1529 def process_ie_result(self, ie_result, download=True, extra_info=None):
1530 """
1531 Take the result of the ie (may be modified) and resolve all unresolved
1532 references (URLs, playlist items).
1533
1534 It will also download the videos if 'download' is truthy.
1535 Returns the resolved ie_result.
1536 """
1537 if extra_info is None:
1538 extra_info = {}
1539 result_type = ie_result.get('_type', 'video')
1540
1541 if result_type in ('url', 'url_transparent'):
1542 ie_result['url'] = sanitize_url(ie_result['url'])
1543 if ie_result.get('original_url'):
1544 extra_info.setdefault('original_url', ie_result['original_url'])
1545
1546 extract_flat = self.params.get('extract_flat', False)
1547 if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
1548 or extract_flat is True):
1549 info_copy = ie_result.copy()
1550 ie = try_get(ie_result.get('ie_key'), self.get_info_extractor)
1551 if ie and not ie_result.get('id'):
1552 info_copy['id'] = ie.get_temp_id(ie_result['url'])
1553 self.add_default_extra_info(info_copy, ie, ie_result['url'])
1554 self.add_extra_info(info_copy, extra_info)
1555 info_copy, _ = self.pre_process(info_copy)
1556 self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True)
1557 self._raise_pending_errors(info_copy)
1558 if self.params.get('force_write_download_archive', False):
1559 self.record_download_archive(info_copy)
1560 return ie_result
1561
1562 if result_type == 'video':
1563 self.add_extra_info(ie_result, extra_info)
1564 ie_result = self.process_video_result(ie_result, download=download)
1565 self._raise_pending_errors(ie_result)
1566 additional_urls = (ie_result or {}).get('additional_urls')
1567 if additional_urls:
1568 # TODO: Improve MetadataParserPP to allow setting a list
1569 if isinstance(additional_urls, str):
1570 additional_urls = [additional_urls]
1571 self.to_screen(
1572 '[info] %s: %d additional URL(s) requested' % (ie_result['id'], len(additional_urls)))
1573 self.write_debug('Additional URLs: "%s"' % '", "'.join(additional_urls))
1574 ie_result['additional_entries'] = [
1575 self.extract_info(
1576 url, download, extra_info=extra_info,
1577 force_generic_extractor=self.params.get('force_generic_extractor'))
1578 for url in additional_urls
1579 ]
1580 return ie_result
1581 elif result_type == 'url':
1582 # We have to add extra_info to the results because it may be
1583 # contained in a playlist
1584 return self.extract_info(
1585 ie_result['url'], download,
1586 ie_key=ie_result.get('ie_key'),
1587 extra_info=extra_info)
1588 elif result_type == 'url_transparent':
1589 # Use the information from the embedding page
1590 info = self.extract_info(
1591 ie_result['url'], ie_key=ie_result.get('ie_key'),
1592 extra_info=extra_info, download=False, process=False)
1593
1594 # extract_info may return None when ignoreerrors is enabled and
1595 # extraction failed with an error, don't crash and return early
1596 # in this case
1597 if not info:
1598 return info
1599
1600 exempted_fields = {'_type', 'url', 'ie_key'}
1601 if not ie_result.get('section_end') and ie_result.get('section_start') is None:
1602 # For video clips, the id etc of the clip extractor should be used
1603 exempted_fields |= {'id', 'extractor', 'extractor_key'}
1604
1605 new_result = info.copy()
1606 new_result.update(filter_dict(ie_result, lambda k, v: v is not None and k not in exempted_fields))
1607
1608 # Extracted info may not be a video result (i.e.
1609 # info.get('_type', 'video') != 'video') but rather a url or
1610 # url_transparent. In such cases outer metadata (from ie_result)
1611 # should be propagated to inner one (info). For this to happen
1612 # _type of info should be overridden with url_transparent. This
1613 # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
1614 if new_result.get('_type') == 'url':
1615 new_result['_type'] = 'url_transparent'
1616
1617 return self.process_ie_result(
1618 new_result, download=download, extra_info=extra_info)
1619 elif result_type in ('playlist', 'multi_video'):
1620 # Protect from infinite recursion due to recursively nested playlists
1621 # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
1622 webpage_url = ie_result['webpage_url']
1623 if webpage_url in self._playlist_urls:
1624 self.to_screen(
1625 '[download] Skipping already downloaded playlist: %s'
1626 % (ie_result.get('title') or ie_result.get('id')))
1627 return
1628
1629 self._playlist_level += 1
1630 self._playlist_urls.add(webpage_url)
1631 self._fill_common_fields(ie_result, False)
1632 self._sanitize_thumbnails(ie_result)
1633 try:
1634 return self.__process_playlist(ie_result, download)
1635 finally:
1636 self._playlist_level -= 1
1637 if not self._playlist_level:
1638 self._playlist_urls.clear()
1639 elif result_type == 'compat_list':
1640 self.report_warning(
1641 'Extractor %s returned a compat_list result. '
1642 'It needs to be updated.' % ie_result.get('extractor'))
1643
1644 def _fixup(r):
1645 self.add_extra_info(r, {
1646 'extractor': ie_result['extractor'],
1647 'webpage_url': ie_result['webpage_url'],
1648 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1649 'webpage_url_domain': get_domain(ie_result['webpage_url']),
1650 'extractor_key': ie_result['extractor_key'],
1651 })
1652 return r
1653 ie_result['entries'] = [
1654 self.process_ie_result(_fixup(r), download, extra_info)
1655 for r in ie_result['entries']
1656 ]
1657 return ie_result
1658 else:
1659 raise Exception('Invalid result type: %s' % result_type)
1660
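# The dispatcher above keys off '_type'. Sketches of the result shapes it
# handles (hypothetical values):
#
#   {'_type': 'video', 'id': 'x', 'title': 't', 'formats': [...]}
#   {'_type': 'url', 'url': '...', 'ie_key': 'Youtube'}    # re-extracted
#   {'_type': 'url_transparent', 'url': '...', 'title': 'outer title'}
#   {'_type': 'playlist', 'webpage_url': '...', 'entries': [...]}
#
# For 'url_transparent', metadata from the outer result overrides the inner
# extraction except for the exempted fields computed above.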
1661 def _ensure_dir_exists(self, path):
1662 return make_dir(path, self.report_error)
1663
1664 @staticmethod
1665 def _playlist_infodict(ie_result, **kwargs):
1666 return {
1667 **ie_result,
1668 'playlist': ie_result.get('title') or ie_result.get('id'),
1669 'playlist_id': ie_result.get('id'),
1670 'playlist_title': ie_result.get('title'),
1671 'playlist_uploader': ie_result.get('uploader'),
1672 'playlist_uploader_id': ie_result.get('uploader_id'),
1673 'playlist_index': 0,
1674 **kwargs,
1675 }
1676
1677 def __process_playlist(self, ie_result, download):
1678 """Process each entry in the playlist"""
1679 title = ie_result.get('title') or ie_result.get('id') or '<Untitled>'
1680 self.to_screen(f'[download] Downloading playlist: {title}')
1681
1682 all_entries = PlaylistEntries(self, ie_result)
1683 entries = orderedSet(all_entries.get_requested_items(), lazy=True)
1684
1685 lazy = self.params.get('lazy_playlist')
1686 if lazy:
1687 resolved_entries, n_entries = [], 'N/A'
1688 ie_result['requested_entries'], ie_result['entries'] = None, None
1689 else:
1690 entries = resolved_entries = list(entries)
1691 n_entries = len(resolved_entries)
1692 ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries)) or ([], [])
1693 if not ie_result.get('playlist_count'):
1694 # Better to do this after potentially exhausting entries
1695 ie_result['playlist_count'] = all_entries.get_full_count()
1696
1697 _infojson_written = False
1698 write_playlist_files = self.params.get('allow_playlist_files', True)
1699 if write_playlist_files and self.params.get('list_thumbnails'):
1700 self.list_thumbnails(ie_result)
1701 if write_playlist_files and not self.params.get('simulate'):
1702 ie_copy = self._playlist_infodict(ie_result, n_entries=int_or_none(n_entries))
1703 _infojson_written = self._write_info_json(
1704 'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson'))
1705 if _infojson_written is None:
1706 return
1707 if self._write_description('playlist', ie_result,
1708 self.prepare_filename(ie_copy, 'pl_description')) is None:
1709 return
1710 # TODO: This should be passed to ThumbnailsConvertor if necessary
1711 self._write_thumbnails('playlist', ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail'))
1712
1713 if lazy:
1714 if self.params.get('playlistreverse') or self.params.get('playlistrandom'):
1715 self.report_warning('playlistreverse and playlistrandom are not supported with lazy_playlist', only_once=True)
1716 elif self.params.get('playlistreverse'):
1717 entries.reverse()
1718 elif self.params.get('playlistrandom'):
1719 random.shuffle(entries)
1720
1721 self.to_screen(f'[{ie_result["extractor"]}] Playlist {title}: Downloading {n_entries} videos'
1722 f'{format_field(ie_result, "playlist_count", " of %s")}')
1723
1724 failures = 0
1725 max_failures = self.params.get('skip_playlist_after_errors') or float('inf')
1726 for i, (playlist_index, entry) in enumerate(entries):
1727 if lazy:
1728 resolved_entries.append((playlist_index, entry))
1729
1730 # TODO: Add auto-generated fields
1731 if not entry or self._match_entry(entry, incomplete=True) is not None:
1732 continue
1733
1734 self.to_screen('[download] Downloading video %s of %s' % (
1735 self._format_screen(i + 1, self.Styles.ID), self._format_screen(n_entries, self.Styles.EMPHASIS)))
1736
1737 entry['__x_forwarded_for_ip'] = ie_result.get('__x_forwarded_for_ip')
1738 if not lazy and 'playlist-index' in self.params.get('compat_opts', []):
1739 playlist_index = ie_result['requested_entries'][i]
1740
1741 entry_result = self.__process_iterable_entry(entry, download, {
1742 'n_entries': int_or_none(n_entries),
1743 '__last_playlist_index': max(ie_result['requested_entries'] or (0, 0)),
1744 'playlist_count': ie_result.get('playlist_count'),
1745 'playlist_index': playlist_index,
1746 'playlist_autonumber': i + 1,
1747 'playlist': title,
1748 'playlist_id': ie_result.get('id'),
1749 'playlist_title': ie_result.get('title'),
1750 'playlist_uploader': ie_result.get('uploader'),
1751 'playlist_uploader_id': ie_result.get('uploader_id'),
1752 'extractor': ie_result['extractor'],
1753 'webpage_url': ie_result['webpage_url'],
1754 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1755 'webpage_url_domain': get_domain(ie_result['webpage_url']),
1756 'extractor_key': ie_result['extractor_key'],
1757 })
1758 if not entry_result:
1759 failures += 1
1760 if failures >= max_failures:
1761 self.report_error(
1762 f'Skipping the remaining entries in playlist "{title}" since {failures} items failed extraction')
1763 break
1764 resolved_entries[i] = (playlist_index, entry_result)
1765
1766 # Update with processed data
1767 ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries)) or ([], [])
1768
1769 # Write the updated info to json
1770 if _infojson_written is True and self._write_info_json(
1771 'updated playlist', ie_result,
1772 self.prepare_filename(ie_copy, 'pl_infojson'), overwrite=True) is None:
1773 return
1774
1775 ie_result = self.run_all_pps('playlist', ie_result)
1776 self.to_screen(f'[download] Finished downloading playlist: {title}')
1777 return ie_result
1778
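# Each entry processed above gains playlist-context fields usable in output
# templates, e.g. (hypothetical values):
#
#   '%(playlist_index)03d - %(title)s.%(ext)s'  ->  '007 - Some Episode.mp4'
#
# playlist_autonumber counts the requested items from 1, whereas
# playlist_index is the entry's position within the full playlist.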
1779 @_handle_extraction_exceptions
1780 def __process_iterable_entry(self, entry, download, extra_info):
1781 return self.process_ie_result(
1782 entry, download=download, extra_info=extra_info)
1783
1784 def _build_format_filter(self, filter_spec):
1785 " Returns a function to filter the formats according to the filter_spec "
1786
1787 OPERATORS = {
1788 '<': operator.lt,
1789 '<=': operator.le,
1790 '>': operator.gt,
1791 '>=': operator.ge,
1792 '=': operator.eq,
1793 '!=': operator.ne,
1794 }
1795 operator_rex = re.compile(r'''(?x)\s*
1796 (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)\s*
1797 (?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1798 (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s*
1799 ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1800 m = operator_rex.fullmatch(filter_spec)
1801 if m:
1802 try:
1803 comparison_value = int(m.group('value'))
1804 except ValueError:
1805 comparison_value = parse_filesize(m.group('value'))
1806 if comparison_value is None:
1807 comparison_value = parse_filesize(m.group('value') + 'B')
1808 if comparison_value is None:
1809 raise ValueError(
1810 'Invalid value %r in format specification %r' % (
1811 m.group('value'), filter_spec))
1812 op = OPERATORS[m.group('op')]
1813
1814 if not m:
1815 STR_OPERATORS = {
1816 '=': operator.eq,
1817 '^=': lambda attr, value: attr.startswith(value),
1818 '$=': lambda attr, value: attr.endswith(value),
1819 '*=': lambda attr, value: value in attr,
1820 '~=': lambda attr, value: value.search(attr) is not None
1821 }
1822 str_operator_rex = re.compile(r'''(?x)\s*
1823 (?P<key>[a-zA-Z0-9._-]+)\s*
1824 (?P<negation>!\s*)?(?P<op>%s)\s*(?P<none_inclusive>\?\s*)?
1825 (?P<quote>["'])?
1826 (?P<value>(?(quote)(?:(?!(?P=quote))[^\\]|\\.)+|[\w.-]+))
1827 (?(quote)(?P=quote))\s*
1828 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1829 m = str_operator_rex.fullmatch(filter_spec)
1830 if m:
1831 if m.group('op') == '~=':
1832 comparison_value = re.compile(m.group('value'))
1833 else:
1834 comparison_value = re.sub(r'''\\([\\"'])''', r'\1', m.group('value'))
1835 str_op = STR_OPERATORS[m.group('op')]
1836 if m.group('negation'):
1837 op = lambda attr, value: not str_op(attr, value)
1838 else:
1839 op = str_op
1840
1841 if not m:
1842 raise SyntaxError('Invalid filter specification %r' % filter_spec)
1843
1844 def _filter(f):
1845 actual_value = f.get(m.group('key'))
1846 if actual_value is None:
1847 return m.group('none_inclusive')
1848 return op(actual_value, comparison_value)
1849 return _filter
1850
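# Examples of filter specs the parser above accepts (used inside the
# '[...]' part of a format selector):
#
#   'height<=720'        numeric comparison (OPERATORS)
#   'filesize>100M'      value parsed via parse_filesize
#   'ext=mp4'            string equality (STR_OPERATORS)
#   'format_id!*=dash'   negated substring match
#   'tbr>?1000'          trailing '?' also keeps formats missing the field
#
# A hypothetical direct use of the private helper:
#
#   keep = ydl._build_format_filter('height<=720')
#   smaller = [f for f in formats if keep(f)]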
1851 def _check_formats(self, formats):
1852 for f in formats:
1853 self.to_screen('[info] Testing format %s' % f['format_id'])
1854 path = self.get_output_path('temp')
1855 if not self._ensure_dir_exists(f'{path}/'):
1856 continue
1857 temp_file = tempfile.NamedTemporaryFile(suffix='.tmp', delete=False, dir=path or None)
1858 temp_file.close()
1859 try:
1860 success, _ = self.dl(temp_file.name, f, test=True)
1861 except (DownloadError, OSError, ValueError) + network_exceptions:
1862 success = False
1863 finally:
1864 if os.path.exists(temp_file.name):
1865 try:
1866 os.remove(temp_file.name)
1867 except OSError:
1868 self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
1869 if success:
1870 yield f
1871 else:
1872 self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])
1873
1874 def _default_format_spec(self, info_dict, download=True):
1875
1876 def can_merge():
1877 merger = FFmpegMergerPP(self)
1878 return merger.available and merger.can_merge()
1879
1880 prefer_best = (
1881 not self.params.get('simulate')
1882 and download
1883 and (
1884 not can_merge()
1885 or info_dict.get('is_live') and not self.params.get('live_from_start')
1886 or self.params['outtmpl']['default'] == '-'))
1887 compat = (
1888 prefer_best
1889 or self.params.get('allow_multiple_audio_streams', False)
1890 or 'format-spec' in self.params['compat_opts'])
1891
1892 return (
1893 'best/bestvideo+bestaudio' if prefer_best
1894 else 'bestvideo*+bestaudio/best' if not compat
1895 else 'bestvideo+bestaudio/best')
1896
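# Outcomes of the logic above:
#
#   'best/bestvideo+bestaudio'   when streaming to stdout, downloading a
#                                live stream without --live-from-start,
#                                or no working ffmpeg merger is available
#   'bestvideo*+bestaudio/best'  the normal default
#   'bestvideo+bestaudio/best'   compat mode ('format-spec' compat_opt or
#                                multiple audio streams allowed)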
1897 def build_format_selector(self, format_spec):
1898 def syntax_error(note, start):
1899 message = (
1900 'Invalid format specification: '
1901 '{}\n\t{}\n\t{}^'.format(note, format_spec, ' ' * start[1]))
1902 return SyntaxError(message)
1903
1904 PICKFIRST = 'PICKFIRST'
1905 MERGE = 'MERGE'
1906 SINGLE = 'SINGLE'
1907 GROUP = 'GROUP'
1908 FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
1909
1910 allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False),
1911 'video': self.params.get('allow_multiple_video_streams', False)}
1912
1913 check_formats = self.params.get('check_formats') == 'selected'
1914
1915 def _parse_filter(tokens):
1916 filter_parts = []
1917 for type, string, start, _, _ in tokens:
1918 if type == tokenize.OP and string == ']':
1919 return ''.join(filter_parts)
1920 else:
1921 filter_parts.append(string)
1922
1923 def _remove_unused_ops(tokens):
1924 # Remove operators that we don't use and join them with the surrounding strings
1925 # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1926 ALLOWED_OPS = ('/', '+', ',', '(', ')')
1927 last_string, last_start, last_end, last_line = None, None, None, None
1928 for type, string, start, end, line in tokens:
1929 if type == tokenize.OP and string == '[':
1930 if last_string:
1931 yield tokenize.NAME, last_string, last_start, last_end, last_line
1932 last_string = None
1933 yield type, string, start, end, line
1934 # everything inside brackets will be handled by _parse_filter
1935 for type, string, start, end, line in tokens:
1936 yield type, string, start, end, line
1937 if type == tokenize.OP and string == ']':
1938 break
1939 elif type == tokenize.OP and string in ALLOWED_OPS:
1940 if last_string:
1941 yield tokenize.NAME, last_string, last_start, last_end, last_line
1942 last_string = None
1943 yield type, string, start, end, line
1944 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1945 if not last_string:
1946 last_string = string
1947 last_start = start
1948 last_end = end
1949 else:
1950 last_string += string
1951 if last_string:
1952 yield tokenize.NAME, last_string, last_start, last_end, last_line
1953
1954 def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1955 selectors = []
1956 current_selector = None
1957 for type, string, start, _, _ in tokens:
1958 # skip the ENCODING token that tokenize.tokenize always emits first
1959 if type == getattr(tokenize, 'ENCODING', None):
1960 continue
1961 elif type in [tokenize.NAME, tokenize.NUMBER]:
1962 current_selector = FormatSelector(SINGLE, string, [])
1963 elif type == tokenize.OP:
1964 if string == ')':
1965 if not inside_group:
1966 # ')' will be handled by the parentheses group
1967 tokens.restore_last_token()
1968 break
1969 elif inside_merge and string in ['/', ',']:
1970 tokens.restore_last_token()
1971 break
1972 elif inside_choice and string == ',':
1973 tokens.restore_last_token()
1974 break
1975 elif string == ',':
1976 if not current_selector:
1977 raise syntax_error('"," must follow a format selector', start)
1978 selectors.append(current_selector)
1979 current_selector = None
1980 elif string == '/':
1981 if not current_selector:
1982 raise syntax_error('"/" must follow a format selector', start)
1983 first_choice = current_selector
1984 second_choice = _parse_format_selection(tokens, inside_choice=True)
1985 current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1986 elif string == '[':
1987 if not current_selector:
1988 current_selector = FormatSelector(SINGLE, 'best', [])
1989 format_filter = _parse_filter(tokens)
1990 current_selector.filters.append(format_filter)
1991 elif string == '(':
1992 if current_selector:
1993 raise syntax_error('Unexpected "("', start)
1994 group = _parse_format_selection(tokens, inside_group=True)
1995 current_selector = FormatSelector(GROUP, group, [])
1996 elif string == '+':
1997 if not current_selector:
1998 raise syntax_error('Unexpected "+"', start)
1999 selector_1 = current_selector
2000 selector_2 = _parse_format_selection(tokens, inside_merge=True)
2001 if not selector_2:
2002 raise syntax_error('Expected a selector', start)
2003 current_selector = FormatSelector(MERGE, (selector_1, selector_2), [])
2004 else:
2005 raise syntax_error(f'Operator not recognized: "{string}"', start)
2006 elif type == tokenize.ENDMARKER:
2007 break
2008 if current_selector:
2009 selectors.append(current_selector)
2010 return selectors
2011
2012 def _merge(formats_pair):
2013 format_1, format_2 = formats_pair
2014
2015 formats_info = []
2016 formats_info.extend(format_1.get('requested_formats', (format_1,)))
2017 formats_info.extend(format_2.get('requested_formats', (format_2,)))
2018
2019 if not allow_multiple_streams['video'] or not allow_multiple_streams['audio']:
2020 get_no_more = {'video': False, 'audio': False}
2021 for (i, fmt_info) in enumerate(formats_info):
2022 if fmt_info.get('acodec') == fmt_info.get('vcodec') == 'none':
2023 formats_info.pop(i)
2024 continue
2025 for aud_vid in ['audio', 'video']:
2026 if not allow_multiple_streams[aud_vid] and fmt_info.get(aud_vid[0] + 'codec') != 'none':
2027 if get_no_more[aud_vid]:
2028 formats_info.pop(i)
2029 break
2030 get_no_more[aud_vid] = True
2031
2032 if len(formats_info) == 1:
2033 return formats_info[0]
2034
2035 video_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('vcodec') != 'none']
2036 audio_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('acodec') != 'none']
2037
2038 the_only_video = video_fmts[0] if len(video_fmts) == 1 else None
2039 the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None
2040
2041 output_ext = self.params.get('merge_output_format')
2042 if not output_ext:
2043 if the_only_video:
2044 output_ext = the_only_video['ext']
2045 elif the_only_audio and not video_fmts:
2046 output_ext = the_only_audio['ext']
2047 else:
2048 output_ext = 'mkv'
2049
2050 filtered = lambda *keys: filter(None, (traverse_obj(fmt, *keys) for fmt in formats_info))
2051
2052 new_dict = {
2053 'requested_formats': formats_info,
2054 'format': '+'.join(filtered('format')),
2055 'format_id': '+'.join(filtered('format_id')),
2056 'ext': output_ext,
2057 'protocol': '+'.join(map(determine_protocol, formats_info)),
2058 'language': '+'.join(orderedSet(filtered('language'))) or None,
2059 'format_note': '+'.join(orderedSet(filtered('format_note'))) or None,
2060 'filesize_approx': sum(filtered('filesize', 'filesize_approx')) or None,
2061 'tbr': sum(filtered('tbr', 'vbr', 'abr')),
2062 }
2063
2064 if the_only_video:
2065 new_dict.update({
2066 'width': the_only_video.get('width'),
2067 'height': the_only_video.get('height'),
2068 'resolution': the_only_video.get('resolution') or self.format_resolution(the_only_video),
2069 'fps': the_only_video.get('fps'),
2070 'dynamic_range': the_only_video.get('dynamic_range'),
2071 'vcodec': the_only_video.get('vcodec'),
2072 'vbr': the_only_video.get('vbr'),
2073 'stretched_ratio': the_only_video.get('stretched_ratio'),
2074 })
2075
2076 if the_only_audio:
2077 new_dict.update({
2078 'acodec': the_only_audio.get('acodec'),
2079 'abr': the_only_audio.get('abr'),
2080 'asr': the_only_audio.get('asr'),
2081 })
2082
2083 return new_dict
2084
2085 def _check_formats(formats):
2086 if not check_formats:
2087 yield from formats
2088 return
2089 yield from self._check_formats(formats)
2090
2091 def _build_selector_function(selector):
2092 if isinstance(selector, list): # ,
2093 fs = [_build_selector_function(s) for s in selector]
2094
2095 def selector_function(ctx):
2096 for f in fs:
2097 yield from f(ctx)
2098 return selector_function
2099
2100 elif selector.type == GROUP: # ()
2101 selector_function = _build_selector_function(selector.selector)
2102
2103 elif selector.type == PICKFIRST: # /
2104 fs = [_build_selector_function(s) for s in selector.selector]
2105
2106 def selector_function(ctx):
2107 for f in fs:
2108 picked_formats = list(f(ctx))
2109 if picked_formats:
2110 return picked_formats
2111 return []
2112
2113 elif selector.type == MERGE: # +
2114 selector_1, selector_2 = map(_build_selector_function, selector.selector)
2115
2116 def selector_function(ctx):
2117 for pair in itertools.product(selector_1(ctx), selector_2(ctx)):
2118 yield _merge(pair)
2119
2120 elif selector.type == SINGLE: # atom
2121 format_spec = selector.selector or 'best'
2122
2123 # TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector
2124 if format_spec == 'all':
2125 def selector_function(ctx):
2126 yield from _check_formats(ctx['formats'][::-1])
2127 elif format_spec == 'mergeall':
2128 def selector_function(ctx):
2129 formats = list(_check_formats(
2130 f for f in ctx['formats'] if f.get('vcodec') != 'none' or f.get('acodec') != 'none'))
2131 if not formats:
2132 return
2133 merged_format = formats[-1]
2134 for f in formats[-2::-1]:
2135 merged_format = _merge((merged_format, f))
2136 yield merged_format
2137
2138 else:
2139 format_fallback, separate_fallback, format_reverse, format_idx = False, None, True, 1
2140 mobj = re.match(
2141 r'(?P<bw>best|worst|b|w)(?P<type>video|audio|v|a)?(?P<mod>\*)?(?:\.(?P<n>[1-9]\d*))?$',
2142 format_spec)
2143 if mobj is not None:
2144 format_idx = int_or_none(mobj.group('n'), default=1)
2145 format_reverse = mobj.group('bw')[0] == 'b'
2146 format_type = (mobj.group('type') or [None])[0]
2147 not_format_type = {'v': 'a', 'a': 'v'}.get(format_type)
2148 format_modified = mobj.group('mod') is not None
2149
2150 format_fallback = not format_type and not format_modified # for b, w
2151 _filter_f = (
2152 (lambda f: f.get('%scodec' % format_type) != 'none')
2153 if format_type and format_modified # bv*, ba*, wv*, wa*
2154 else (lambda f: f.get('%scodec' % not_format_type) == 'none')
2155 if format_type # bv, ba, wv, wa
2156 else (lambda f: f.get('vcodec') != 'none' and f.get('acodec') != 'none')
2157 if not format_modified # b, w
2158 else lambda f: True) # b*, w*
2159 filter_f = lambda f: _filter_f(f) and (
2160 f.get('vcodec') != 'none' or f.get('acodec') != 'none')
2161 else:
2162 if format_spec in self._format_selection_exts['audio']:
2163 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none'
2164 elif format_spec in self._format_selection_exts['video']:
2165 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none' and f.get('vcodec') != 'none'
2166 separate_fallback = lambda f: f.get('ext') == format_spec and f.get('vcodec') != 'none'
2167 elif format_spec in self._format_selection_exts['storyboards']:
2168 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') == 'none' and f.get('vcodec') == 'none'
2169 else:
2170 filter_f = lambda f: f.get('format_id') == format_spec # id
2171
2172 def selector_function(ctx):
2173 formats = list(ctx['formats'])
2174 matches = list(filter(filter_f, formats)) if filter_f is not None else formats
2175 if not matches:
2176 if format_fallback and ctx['incomplete_formats']:
2177 # for extractors with incomplete formats (audio only (soundcloud)
2178 # or video only (imgur)) best/worst will fall back to
2179 # best/worst {video,audio}-only format
2180 matches = formats
2181 elif separate_fallback and not ctx['has_merged_format']:
2182 # for compatibility with youtube-dl when there is no pre-merged format
2183 matches = list(filter(separate_fallback, formats))
2184 matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1]))
2185 try:
2186 yield matches[format_idx - 1]
2187 except LazyList.IndexError:
2188 return
2189
2190 filters = [self._build_format_filter(f) for f in selector.filters]
2191
2192 def final_selector(ctx):
2193 ctx_copy = dict(ctx)
2194 for _filter in filters:
2195 ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
2196 return selector_function(ctx_copy)
2197 return final_selector
2198
2199 stream = io.BytesIO(format_spec.encode())
2200 try:
2201 tokens = list(_remove_unused_ops(tokenize.tokenize(stream.readline)))
2202 except tokenize.TokenError:
2203 raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
2204
2205 class TokenIterator:
2206 def __init__(self, tokens):
2207 self.tokens = tokens
2208 self.counter = 0
2209
2210 def __iter__(self):
2211 return self
2212
2213 def __next__(self):
2214 if self.counter >= len(self.tokens):
2215 raise StopIteration()
2216 value = self.tokens[self.counter]
2217 self.counter += 1
2218 return value
2219
2220 next = __next__
2221
2222 def restore_last_token(self):
2223 self.counter -= 1
2224
2225 parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
2226 return _build_selector_function(parsed_selector)
2227
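# Examples of expressions the tokenizer/parser above accepts (documented
# format-selector syntax):
#
#   'best[height<=1080]'          single selector plus filter
#   'bestvideo+bestaudio/best'    merge (+) with fallback (/)
#   'bv*[vcodec^=avc1]+ba,b'      filtered merge, then ',' to also
#                                 download a second format
#
# A hypothetical direct invocation of the built selector:
#
#   selector = ydl.build_format_selector('bestvideo+bestaudio/best')
#   chosen = list(selector({'formats': formats,
#                           'has_merged_format': True,
#                           'incomplete_formats': False}))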
2228 def _calc_headers(self, info_dict):
2229 res = merge_headers(self.params['http_headers'], info_dict.get('http_headers') or {})
2230
2231 cookies = self._calc_cookies(info_dict['url'])
2232 if cookies:
2233 res['Cookie'] = cookies
2234
2235 if 'X-Forwarded-For' not in res:
2236 x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
2237 if x_forwarded_for_ip:
2238 res['X-Forwarded-For'] = x_forwarded_for_ip
2239
2240 return res
2241
2242 def _calc_cookies(self, url):
2243 pr = sanitized_Request(url)
2244 self.cookiejar.add_cookie_header(pr)
2245 return pr.get_header('Cookie')
2246
2247 def _sort_thumbnails(self, thumbnails):
2248 thumbnails.sort(key=lambda t: (
2249 t.get('preference') if t.get('preference') is not None else -1,
2250 t.get('width') if t.get('width') is not None else -1,
2251 t.get('height') if t.get('height') is not None else -1,
2252 t.get('id') if t.get('id') is not None else '',
2253 t.get('url')))
2254
2255 def _sanitize_thumbnails(self, info_dict):
2256 thumbnails = info_dict.get('thumbnails')
2257 if thumbnails is None:
2258 thumbnail = info_dict.get('thumbnail')
2259 if thumbnail:
2260 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
2261 if not thumbnails:
2262 return
2263
2264 def check_thumbnails(thumbnails):
2265 for t in thumbnails:
2266 self.to_screen(f'[info] Testing thumbnail {t["id"]}')
2267 try:
2268 self.urlopen(HEADRequest(t['url']))
2269 except network_exceptions as err:
2270 self.to_screen(f'[info] Unable to connect to thumbnail {t["id"]} URL {t["url"]!r} - {err}. Skipping...')
2271 continue
2272 yield t
2273
2274 self._sort_thumbnails(thumbnails)
2275 for i, t in enumerate(thumbnails):
2276 if t.get('id') is None:
2277 t['id'] = '%d' % i
2278 if t.get('width') and t.get('height'):
2279 t['resolution'] = '%dx%d' % (t['width'], t['height'])
2280 t['url'] = sanitize_url(t['url'])
2281
2282 if self.params.get('check_formats') is True:
2283 info_dict['thumbnails'] = LazyList(check_thumbnails(thumbnails[::-1]), reverse=True)
2284 else:
2285 info_dict['thumbnails'] = thumbnails
2286
2287 def _fill_common_fields(self, info_dict, is_video=True):
2288 # TODO: move sanitization here
2289 if is_video:
2290 # playlists are allowed to lack "title"
2291 title = info_dict.get('title', NO_DEFAULT)
2292 if title is NO_DEFAULT:
2293 raise ExtractorError('Missing "title" field in extractor result',
2294 video_id=info_dict['id'], ie=info_dict['extractor'])
2295 info_dict['fulltitle'] = title
2296 if not title:
2297 if title == '':
2298 self.write_debug('Extractor gave empty title. Creating a generic title')
2299 else:
2300 self.report_warning('Extractor failed to obtain "title". Creating a generic title instead')
2301 info_dict['title'] = f'{info_dict["extractor"].replace(":", "-")} video #{info_dict["id"]}'
2302
2303 if info_dict.get('duration') is not None:
2304 info_dict['duration_string'] = formatSeconds(info_dict['duration'])
2305
2306 for ts_key, date_key in (
2307 ('timestamp', 'upload_date'),
2308 ('release_timestamp', 'release_date'),
2309 ('modified_timestamp', 'modified_date'),
2310 ):
2311 if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
2312 # Working around out-of-range timestamp values (e.g. negative ones on Windows,
2313 # see http://bugs.python.org/issue1646728)
2314 with contextlib.suppress(ValueError, OverflowError, OSError):
2315 upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
2316 info_dict[date_key] = upload_date.strftime('%Y%m%d')
2317
2318 live_keys = ('is_live', 'was_live')
2319 live_status = info_dict.get('live_status')
2320 if live_status is None:
2321 for key in live_keys:
2322 if info_dict.get(key) is False:
2323 continue
2324 if info_dict.get(key):
2325 live_status = key
2326 break
2327 if all(info_dict.get(key) is False for key in live_keys):
2328 live_status = 'not_live'
2329 if live_status:
2330 info_dict['live_status'] = live_status
2331 for key in live_keys:
2332 if info_dict.get(key) is None:
2333 info_dict[key] = (live_status == key)
2334
2335 # Auto generate title fields corresponding to the *_number fields when missing
2336 # in order to always have clean titles. This is very common for TV series.
2337 for field in ('chapter', 'season', 'episode'):
2338 if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
2339 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
2340
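# Worked examples for the field-filling above (hypothetical inputs):
#
#   {'timestamp': 1577836800}  ->  upload_date == '20200101' (UTC)
#   {'episode_number': 3}      ->  episode == 'Episode 3'
#   {'is_live': True}          ->  live_status == 'is_live'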
2341 def _raise_pending_errors(self, info):
2342 err = info.pop('__pending_error', None)
2343 if err:
2344 self.report_error(err, tb=False)
2345
2346 def process_video_result(self, info_dict, download=True):
2347 assert info_dict.get('_type', 'video') == 'video'
2348 self._num_videos += 1
2349
2350 if 'id' not in info_dict:
2351 raise ExtractorError('Missing "id" field in extractor result', ie=info_dict['extractor'])
2352 elif not info_dict.get('id'):
2353 raise ExtractorError('Extractor failed to obtain "id"', ie=info_dict['extractor'])
2354
2355 def report_force_conversion(field, field_not, conversion):
2356 self.report_warning(
2357 '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
2358 % (field, field_not, conversion))
2359
2360 def sanitize_string_field(info, string_field):
2361 field = info.get(string_field)
2362 if field is None or isinstance(field, str):
2363 return
2364 report_force_conversion(string_field, 'a string', 'string')
2365 info[string_field] = str(field)
2366
2367 def sanitize_numeric_fields(info):
2368 for numeric_field in self._NUMERIC_FIELDS:
2369 field = info.get(numeric_field)
2370 if field is None or isinstance(field, (int, float)):
2371 continue
2372 report_force_conversion(numeric_field, 'numeric', 'int')
2373 info[numeric_field] = int_or_none(field)
2374
2375 sanitize_string_field(info_dict, 'id')
2376 sanitize_numeric_fields(info_dict)
2377 if info_dict.get('section_end') and info_dict.get('section_start') is not None:
2378 info_dict['duration'] = round(info_dict['section_end'] - info_dict['section_start'], 3)
2379 if (info_dict.get('duration') or 0) <= 0 and info_dict.pop('duration', None):
2380 self.report_warning('"duration" field is negative, there is an error in extractor')
2381
2382 chapters = info_dict.get('chapters') or []
2383 if chapters and chapters[0].get('start_time'):
2384 chapters.insert(0, {'start_time': 0})
2385
2386 dummy_chapter = {'end_time': 0, 'start_time': info_dict.get('duration')}
2387 for idx, (prev, current, next_) in enumerate(zip(
2388 (dummy_chapter, *chapters), chapters, (*chapters[1:], dummy_chapter)), 1):
2389 if current.get('start_time') is None:
2390 current['start_time'] = prev.get('end_time')
2391 if not current.get('end_time'):
2392 current['end_time'] = next_.get('start_time')
2393 if not current.get('title'):
2394 current['title'] = f'<Untitled Chapter {idx}>'
2395
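# Worked example for the chapter normalization above (duration 300s):
#
#   [{'start_time': 30}]
# becomes
#   [{'start_time': 0, 'end_time': 30, 'title': '<Untitled Chapter 1>'},
#    {'start_time': 30, 'end_time': 300, 'title': '<Untitled Chapter 2>'}]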
2396 if 'playlist' not in info_dict:
2397 # It isn't part of a playlist
2398 info_dict['playlist'] = None
2399 info_dict['playlist_index'] = None
2400
2401 self._sanitize_thumbnails(info_dict)
2402
2403 thumbnail = info_dict.get('thumbnail')
2404 thumbnails = info_dict.get('thumbnails')
2405 if thumbnail:
2406 info_dict['thumbnail'] = sanitize_url(thumbnail)
2407 elif thumbnails:
2408 info_dict['thumbnail'] = thumbnails[-1]['url']
2409
2410 if info_dict.get('display_id') is None and 'id' in info_dict:
2411 info_dict['display_id'] = info_dict['id']
2412
2413 self._fill_common_fields(info_dict)
2414
2415 for cc_kind in ('subtitles', 'automatic_captions'):
2416 cc = info_dict.get(cc_kind)
2417 if cc:
2418 for _, subtitle in cc.items():
2419 for subtitle_format in subtitle:
2420 if subtitle_format.get('url'):
2421 subtitle_format['url'] = sanitize_url(subtitle_format['url'])
2422 if subtitle_format.get('ext') is None:
2423 subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
2424
2425 automatic_captions = info_dict.get('automatic_captions')
2426 subtitles = info_dict.get('subtitles')
2427
2428 info_dict['requested_subtitles'] = self.process_subtitles(
2429 info_dict['id'], subtitles, automatic_captions)
2430
2431 if info_dict.get('formats') is None:
2432 # There's only one format available
2433 formats = [info_dict]
2434 else:
2435 formats = info_dict['formats']
2436
2437 # or None ensures --clean-infojson removes it
2438 info_dict['_has_drm'] = any(f.get('has_drm') for f in formats) or None
2439 if not self.params.get('allow_unplayable_formats'):
2440 formats = [f for f in formats if not f.get('has_drm')]
2441 if info_dict['_has_drm'] and all(
2442 f.get('acodec') == f.get('vcodec') == 'none' for f in formats):
2443 self.report_warning(
2444 'This video is DRM protected and only images are available for download. '
2445 'Use --list-formats to see them')
2446
2447 get_from_start = not info_dict.get('is_live') or bool(self.params.get('live_from_start'))
2448 if not get_from_start:
2449 info_dict['title'] += ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
2450 if info_dict.get('is_live') and formats:
2451 formats = [f for f in formats if bool(f.get('is_from_start')) == get_from_start]
2452 if get_from_start and not formats:
2453 self.raise_no_formats(info_dict, msg=(
2454 '--live-from-start is passed, but there are no formats that can be downloaded from the start. '
2455 'If you want to download from the current time, use --no-live-from-start'))
2456
2457 if not formats:
2458 self.raise_no_formats(info_dict)
2459
2460 def is_wellformed(f):
2461 url = f.get('url')
2462 if not url:
2463 self.report_warning(
2464 '"url" field is missing or empty - skipping format, '
2465 'there is an error in extractor')
2466 return False
2467 if isinstance(url, bytes):
2468 sanitize_string_field(f, 'url')
2469 return True
2470
2471 # Filter out malformed formats for better extraction robustness
2472 formats = list(filter(is_wellformed, formats))
2473
2474 formats_dict = {}
2475
2476 # We check that all the formats have the format and format_id fields
2477 for i, format in enumerate(formats):
2478 sanitize_string_field(format, 'format_id')
2479 sanitize_numeric_fields(format)
2480 format['url'] = sanitize_url(format['url'])
2481 if not format.get('format_id'):
2482 format['format_id'] = str(i)
2483 else:
2484 # Sanitize format_id from characters used in format selector expression
2485 format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
2486 format_id = format['format_id']
2487 if format_id not in formats_dict:
2488 formats_dict[format_id] = []
2489 formats_dict[format_id].append(format)
2490
2491 # Make sure all formats have unique format_id
2492 common_exts = set(itertools.chain(*self._format_selection_exts.values()))
2493 for format_id, ambiguous_formats in formats_dict.items():
2494 ambiguous_id = len(ambiguous_formats) > 1
2495 for i, format in enumerate(ambiguous_formats):
2496 if ambiguous_id:
2497 format['format_id'] = '%s-%d' % (format_id, i)
2498 if format.get('ext') is None:
2499 format['ext'] = determine_ext(format['url']).lower()
2500 # Ensure there is no conflict between id and ext in format selection
2501 # See https://github.com/yt-dlp/yt-dlp/issues/1282
2502 if format['format_id'] != format['ext'] and format['format_id'] in common_exts:
2503 format['format_id'] = 'f%s' % format['format_id']
2504
2505 for i, format in enumerate(formats):
2506 if format.get('format') is None:
2507 format['format'] = '{id} - {res}{note}'.format(
2508 id=format['format_id'],
2509 res=self.format_resolution(format),
2510 note=format_field(format, 'format_note', ' (%s)'),
2511 )
2512 if format.get('protocol') is None:
2513 format['protocol'] = determine_protocol(format)
2514 if format.get('resolution') is None:
2515 format['resolution'] = self.format_resolution(format, default=None)
2516 if format.get('dynamic_range') is None and format.get('vcodec') != 'none':
2517 format['dynamic_range'] = 'SDR'
2518 if (info_dict.get('duration') and format.get('tbr')
2519 and not format.get('filesize') and not format.get('filesize_approx')):
2520 format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8))
2521
2522 # Add HTTP headers, so that external programs can use them from the
2523 # json output
2524 full_format_info = info_dict.copy()
2525 full_format_info.update(format)
2526 format['http_headers'] = self._calc_headers(full_format_info)
2527 # Remove private housekeeping stuff
2528 if '__x_forwarded_for_ip' in info_dict:
2529 del info_dict['__x_forwarded_for_ip']
2530
2531 if self.params.get('check_formats') is True:
2532 formats = LazyList(self._check_formats(formats[::-1]), reverse=True)
2533
2534 if not formats or formats[0] is not info_dict:
2535 # only set the 'formats' field if the original info_dict lists them;
2536 # otherwise we end up with a circular reference, the first (and unique)
2537 # element in the 'formats' field in info_dict is info_dict itself,
2538 # which can't be exported to json
2539 info_dict['formats'] = formats
2540
2541 info_dict, _ = self.pre_process(info_dict)
2542
2543 if self._match_entry(info_dict, incomplete=self._format_fields) is not None:
2544 return info_dict
2545
2546 self.post_extract(info_dict)
2547 info_dict, _ = self.pre_process(info_dict, 'after_filter')
2548
2549 # The pre-processors may have modified the formats
2550 formats = info_dict.get('formats', [info_dict])
2551
2552 list_only = self.params.get('simulate') is None and (
2553 self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles'))
2554 interactive_format_selection = not list_only and self.format_selector == '-'
2555 if self.params.get('list_thumbnails'):
2556 self.list_thumbnails(info_dict)
2557 if self.params.get('listsubtitles'):
2558 if 'automatic_captions' in info_dict:
2559 self.list_subtitles(
2560 info_dict['id'], automatic_captions, 'automatic captions')
2561 self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
2562 if self.params.get('listformats') or interactive_format_selection:
2563 self.list_formats(info_dict)
2564 if list_only:
2565 # Without this printing, -F --print-json will not work
2566 self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True)
2567 return info_dict
2568
2569 format_selector = self.format_selector
2570 if format_selector is None:
2571 req_format = self._default_format_spec(info_dict, download=download)
2572 self.write_debug('Default format spec: %s' % req_format)
2573 format_selector = self.build_format_selector(req_format)
2574
2575 while True:
2576 if interactive_format_selection:
2577 req_format = input(
2578 self._format_screen('\nEnter format selector: ', self.Styles.EMPHASIS))
2579 try:
2580 format_selector = self.build_format_selector(req_format)
2581 except SyntaxError as err:
2582 self.report_error(err, tb=False, is_error=False)
2583 continue
2584
2585 formats_to_download = list(format_selector({
2586 'formats': formats,
2587 'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
2588 'incomplete_formats': (
2589 # All formats are video-only or
2590 all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
2591 # all formats are audio-only
2592 or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)),
2593 }))
2594 if interactive_format_selection and not formats_to_download:
2595 self.report_error('Requested format is not available', tb=False, is_error=False)
2596 continue
2597 break
2598
2599 if not formats_to_download:
2600 if not self.params.get('ignore_no_formats_error'):
2601 raise ExtractorError(
2602 'Requested format is not available. Use --list-formats for a list of available formats',
2603 expected=True, video_id=info_dict['id'], ie=info_dict['extractor'])
2604 self.report_warning('Requested format is not available')
2605 # Process what we can, even without any available formats.
2606 formats_to_download = [{}]
2607
2608 requested_ranges = self.params.get('download_ranges')
2609 if requested_ranges:
2610 requested_ranges = tuple(requested_ranges(info_dict, self))
2611
2612 best_format, downloaded_formats = formats_to_download[-1], []
2613 if download:
2614 if best_format:
2615 def to_screen(*msg):
2616 self.to_screen(f'[info] {info_dict["id"]}: {" ".join(", ".join(variadic(m)) for m in msg)}')
2617
2618 to_screen(f'Downloading {len(formats_to_download)} format(s):',
2619 (f['format_id'] for f in formats_to_download))
2620 if requested_ranges:
2621 to_screen(f'Downloading {len(requested_ranges)} time ranges:',
2622 (f'{int(c["start_time"])}-{int(c["end_time"])}' for c in requested_ranges))
2623 max_downloads_reached = False
2624
2625 for fmt, chapter in itertools.product(formats_to_download, requested_ranges or [{}]):
2626 new_info = self._copy_infodict(info_dict)
2627 new_info.update(fmt)
2628 offset, duration = info_dict.get('section_start') or 0, info_dict.get('duration') or float('inf')
2629 if chapter or offset:
2630 new_info.update({
2631 'section_start': offset + chapter.get('start_time', 0),
2632 'section_end': offset + min(chapter.get('end_time', duration), duration),
2633 'section_title': chapter.get('title'),
2634 'section_number': chapter.get('index'),
2635 })
2636 downloaded_formats.append(new_info)
2637 try:
2638 self.process_info(new_info)
2639 except MaxDownloadsReached:
2640 max_downloads_reached = True
2641 self._raise_pending_errors(new_info)
2642 # Remove copied info
2643 for key, val in tuple(new_info.items()):
2644 if info_dict.get(key) == val:
2645 new_info.pop(key)
2646 if max_downloads_reached:
2647 break
2648
2649 write_archive = {f.get('__write_download_archive', False) for f in downloaded_formats}
2650 assert write_archive.issubset({True, False, 'ignore'})
2651 if True in write_archive and False not in write_archive:
2652 self.record_download_archive(info_dict)
2653
2654 info_dict['requested_downloads'] = downloaded_formats
2655 info_dict = self.run_all_pps('after_video', info_dict)
2656 if max_downloads_reached:
2657 raise MaxDownloadsReached()
2658
2659 # We update the info dict with the selected best quality format (backwards compatibility)
2660 info_dict.update(best_format)
2661 return info_dict
2662
2663 def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
2664 """Select the requested subtitles and their format"""
2665 available_subs, normal_sub_langs = {}, []
2666 if normal_subtitles and self.params.get('writesubtitles'):
2667 available_subs.update(normal_subtitles)
2668 normal_sub_langs = tuple(normal_subtitles.keys())
2669 if automatic_captions and self.params.get('writeautomaticsub'):
2670 for lang, cap_info in automatic_captions.items():
2671 if lang not in available_subs:
2672 available_subs[lang] = cap_info
2673
2674 if ((not self.params.get('writesubtitles')
2675 and not self.params.get('writeautomaticsub'))
2676 or not available_subs):
2677 return None
2678
2679 all_sub_langs = tuple(available_subs.keys())
2680 if self.params.get('allsubtitles', False):
2681 requested_langs = all_sub_langs
2682 elif self.params.get('subtitleslangs', False):
2683 # A list is used so that the order of languages will be the same as
2684 # given in subtitleslangs. See https://github.com/yt-dlp/yt-dlp/issues/1041
2685 requested_langs = []
2686 for lang_re in self.params.get('subtitleslangs'):
2687 discard = lang_re[0] == '-'
2688 if discard:
2689 lang_re = lang_re[1:]
2690 if lang_re == 'all':
2691 if discard:
2692 requested_langs = []
2693 else:
2694 requested_langs.extend(all_sub_langs)
2695 continue
2696 current_langs = filter(re.compile(lang_re + '$').match, all_sub_langs)
2697 if discard:
2698 for lang in current_langs:
2699 while lang in requested_langs:
2700 requested_langs.remove(lang)
2701 else:
2702 requested_langs.extend(current_langs)
2703 requested_langs = orderedSet(requested_langs)
2704 elif normal_sub_langs:
2705 requested_langs = ['en'] if 'en' in normal_sub_langs else normal_sub_langs[:1]
2706 else:
2707 requested_langs = ['en'] if 'en' in all_sub_langs else all_sub_langs[:1]
2708 if requested_langs:
2709 self.write_debug('Downloading subtitles: %s' % ', '.join(requested_langs))
2710
2711 formats_query = self.params.get('subtitlesformat', 'best')
2712 formats_preference = formats_query.split('/') if formats_query else []
2713 subs = {}
2714 for lang in requested_langs:
2715 formats = available_subs.get(lang)
2716 if formats is None:
2717 self.report_warning(f'{lang} subtitles not available for {video_id}')
2718 continue
2719 for ext in formats_preference:
2720 if ext == 'best':
2721 f = formats[-1]
2722 break
2723 matches = list(filter(lambda f: f['ext'] == ext, formats))
2724 if matches:
2725 f = matches[-1]
2726 break
2727 else:
2728 f = formats[-1]
2729 self.report_warning(
2730 'No subtitle format found matching "%s" for language %s, '
2731 'using %s' % (formats_query, lang, f['ext']))
2732 subs[lang] = f
2733 return subs
2734
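# Examples for the language selection above (entries are anchored regexes,
# a leading '-' discards matches):
#
#   ['en']                  exactly 'en'
#   ['en.*']                'en', 'en-US', 'en-GB', ...
#   ['all', '-live_chat']   everything except live chat
#
# With no subtitleslangs given, 'en' is preferred when available, else the
# first listed language.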
2735 def _forceprint(self, key, info_dict):
2736 if info_dict is None:
2737 return
2738 info_copy = info_dict.copy()
2739 info_copy['formats_table'] = self.render_formats_table(info_dict)
2740 info_copy['thumbnails_table'] = self.render_thumbnails_table(info_dict)
2741 info_copy['subtitles_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('subtitles'))
2742 info_copy['automatic_captions_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('automatic_captions'))
2743
2744 def format_tmpl(tmpl):
2745 mobj = re.match(r'\w+(=?)$', tmpl)
2746 if mobj and mobj.group(1):
2747 return f'{tmpl[:-1]} = %({tmpl[:-1]})r'
2748 elif mobj:
2749 return f'%({tmpl})s'
2750 return tmpl
2751
2752 for tmpl in self.params['forceprint'].get(key, []):
2753 self.to_stdout(self.evaluate_outtmpl(format_tmpl(tmpl), info_copy))
2754
2755 for tmpl, file_tmpl in self.params['print_to_file'].get(key, []):
2756 filename = self.prepare_filename(info_dict, outtmpl=file_tmpl)
2757 tmpl = format_tmpl(tmpl)
2758 self.to_screen(f'[info] Writing {tmpl!r} to: {filename}')
2759 if self._ensure_dir_exists(filename):
2760 with open(filename, 'a', encoding='utf-8') as f:
2761 f.write(self.evaluate_outtmpl(tmpl, info_copy) + '\n')
2762
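# format_tmpl examples, worked from the regex above:
#
#   'title'    ->  '%(title)s'      (bare field name)
#   'id='      ->  'id = %(id)r'    (trailing '=' prints name and repr)
#   '%(id)s'   ->  unchanged        (already an output template)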
2763 def __forced_printings(self, info_dict, filename, incomplete):
2764 def print_mandatory(field, actual_field=None):
2765 if actual_field is None:
2766 actual_field = field
2767 if (self.params.get('force%s' % field, False)
2768 and (not incomplete or info_dict.get(actual_field) is not None)):
2769 self.to_stdout(info_dict[actual_field])
2770
2771 def print_optional(field):
2772 if (self.params.get('force%s' % field, False)
2773 and info_dict.get(field) is not None):
2774 self.to_stdout(info_dict[field])
2775
2776 info_dict = info_dict.copy()
2777 if filename is not None:
2778 info_dict['filename'] = filename
2779 if info_dict.get('requested_formats') is not None:
2780 # For RTMP URLs, also include the playpath
2781 info_dict['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats'])
2782 elif info_dict.get('url'):
2783 info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '')
2784
2785 if (self.params.get('forcejson')
2786 or self.params['forceprint'].get('video')
2787 or self.params['print_to_file'].get('video')):
2788 self.post_extract(info_dict)
2789 self._forceprint('video', info_dict)
2790
2791 print_mandatory('title')
2792 print_mandatory('id')
2793 print_mandatory('url', 'urls')
2794 print_optional('thumbnail')
2795 print_optional('description')
2796 print_optional('filename')
2797 if self.params.get('forceduration') and info_dict.get('duration') is not None:
2798 self.to_stdout(formatSeconds(info_dict['duration']))
2799 print_mandatory('format')
2800
2801 if self.params.get('forcejson'):
2802 self.to_stdout(json.dumps(self.sanitize_info(info_dict)))
2803
2804 def dl(self, name, info, subtitle=False, test=False):
2805 if not info.get('url'):
2806 self.raise_no_formats(info, True)
2807
2808 if test:
2809 verbose = self.params.get('verbose')
2810 params = {
2811 'test': True,
2812 'quiet': self.params.get('quiet') or not verbose,
2813 'verbose': verbose,
2814 'noprogress': not verbose,
2815 'nopart': True,
2816 'skip_unavailable_fragments': False,
2817 'keep_fragments': False,
2818 'overwrites': True,
2819 '_no_ytdl_file': True,
2820 }
2821 else:
2822 params = self.params
2823 fd = get_suitable_downloader(info, params, to_stdout=(name == '-'))(self, params)
2824 if not test:
2825 for ph in self._progress_hooks:
2826 fd.add_progress_hook(ph)
2827 urls = '", "'.join(
2828 (f['url'].split(',')[0] + ',<data>' if f['url'].startswith('data:') else f['url'])
2829 for f in info.get('requested_formats', []) or [info])
2830 self.write_debug(f'Invoking {fd.FD_NAME} downloader on "{urls}"')
2831
2832 # Note: Ideally, info should be deep-copied so that hooks cannot modify it.
2833 # But it may contain objects that are not deep-copyable
2834 new_info = self._copy_infodict(info)
2835 if new_info.get('http_headers') is None:
2836 new_info['http_headers'] = self._calc_headers(new_info)
2837 return fd.download(name, new_info, subtitle)
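# Illustrative usage sketch (hypothetical info dict): as the callers below
# show, dl() returns a (success, real_download) pair, where real_download is
# False if an already-downloaded file was reused:
#
#   success, real_download = ydl.dl('out.mp4', {'url': 'https://example.com/v.mp4', 'ext': 'mp4'})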
2838
2839 def existing_file(self, filepaths, *, default_overwrite=True):
2840 existing_files = list(filter(os.path.exists, orderedSet(filepaths)))
2841 if existing_files and not self.params.get('overwrites', default_overwrite):
2842 return existing_files[0]
2843
2844 for file in existing_files:
2845 self.report_file_delete(file)
2846 os.remove(file)
2847 return None
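# Illustrative sketch (hypothetical paths), mirroring how existing_video_file()
# below uses this helper: when overwriting is disabled, the first existing
# candidate is reused; otherwise all existing candidates are deleted:
#
#   path = ydl.existing_file(['video.f137.mp4', 'video.mp4'], default_overwrite=False)
#   # -> first path that exists, or None after the existing files are removed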
2848
2849 def process_info(self, info_dict):
2850 """Process a single resolved IE result. (Modifies it in-place)"""
2851
2852 assert info_dict.get('_type', 'video') == 'video'
2853 original_infodict = info_dict
2854
2855 if 'format' not in info_dict and 'ext' in info_dict:
2856 info_dict['format'] = info_dict['ext']
2857
2858 # This is mostly just for backward compatibility of process_info
2859 # As a side-effect, this allows for format-specific filters
2860 if self._match_entry(info_dict) is not None:
2861 info_dict['__write_download_archive'] = 'ignore'
2862 return
2863
2864 # Does nothing under normal operation - for backward compatibility of process_info
2865 self.post_extract(info_dict)
2866 self._num_downloads += 1
2867
2868 # info_dict['_filename'] needs to be set for backward compatibility
2869 info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
2870 temp_filename = self.prepare_filename(info_dict, 'temp')
2871 files_to_move = {}
2872
2873 # Forced printings
2874 self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))
2875
2876 def check_max_downloads():
2877 if self._num_downloads >= float(self.params.get('max_downloads') or 'inf'):
2878 raise MaxDownloadsReached()
2879
2880 if self.params.get('simulate'):
2881 info_dict['__write_download_archive'] = self.params.get('force_write_download_archive')
2882 check_max_downloads()
2883 return
2884
2885 if full_filename is None:
2886 return
2887 if not self._ensure_dir_exists(encodeFilename(full_filename)):
2888 return
2889 if not self._ensure_dir_exists(encodeFilename(temp_filename)):
2890 return
2891
2892 if self._write_description('video', info_dict,
2893 self.prepare_filename(info_dict, 'description')) is None:
2894 return
2895
2896 sub_files = self._write_subtitles(info_dict, temp_filename)
2897 if sub_files is None:
2898 return
2899 files_to_move.update(dict(sub_files))
2900
2901 thumb_files = self._write_thumbnails(
2902 'video', info_dict, temp_filename, self.prepare_filename(info_dict, 'thumbnail'))
2903 if thumb_files is None:
2904 return
2905 files_to_move.update(dict(thumb_files))
2906
2907 infofn = self.prepare_filename(info_dict, 'infojson')
2908 _infojson_written = self._write_info_json('video', info_dict, infofn)
2909 if _infojson_written:
2910 info_dict['infojson_filename'] = infofn
2911 # For backward compatibility, even though it was a private field
2912 info_dict['__infojson_filename'] = infofn
2913 elif _infojson_written is None:
2914 return
2915
2916 # Note: Annotations are deprecated
2917 annofn = None
2918 if self.params.get('writeannotations', False):
2919 annofn = self.prepare_filename(info_dict, 'annotation')
2920 if annofn:
2921 if not self._ensure_dir_exists(encodeFilename(annofn)):
2922 return
2923 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(annofn)):
2924 self.to_screen('[info] Video annotations are already present')
2925 elif not info_dict.get('annotations'):
2926 self.report_warning('There are no annotations to write.')
2927 else:
2928 try:
2929 self.to_screen('[info] Writing video annotations to: ' + annofn)
2930 with open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
2931 annofile.write(info_dict['annotations'])
2932 except (KeyError, TypeError):
2933 self.report_warning('There are no annotations to write.')
2934 except OSError:
2935 self.report_error('Cannot write annotations file: ' + annofn)
2936 return
2937
2938 # Write internet shortcut files
2939 def _write_link_file(link_type):
2940 url = try_get(info_dict['webpage_url'], iri_to_uri)
2941 if not url:
2942 self.report_warning(
2943 f'Cannot write internet shortcut file because the actual URL of "{info_dict["webpage_url"]}" is unknown')
2944 return True
2945 linkfn = replace_extension(self.prepare_filename(info_dict, 'link'), link_type, info_dict.get('ext'))
2946 if not self._ensure_dir_exists(encodeFilename(linkfn)):
2947 return False
2948 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)):
2949 self.to_screen(f'[info] Internet shortcut (.{link_type}) is already present')
2950 return True
2951 try:
2952 self.to_screen(f'[info] Writing internet shortcut (.{link_type}) to: {linkfn}')
2953 with open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8',
2954 newline='\r\n' if link_type == 'url' else '\n') as linkfile:
2955 template_vars = {'url': url}
2956 if link_type == 'desktop':
2957 template_vars['filename'] = linkfn[:-(len(link_type) + 1)]
2958 linkfile.write(LINK_TEMPLATES[link_type] % template_vars)
2959 except OSError:
2960 self.report_error(f'Cannot write internet shortcut {linkfn}')
2961 return False
2962 return True
2963
2964 write_links = {
2965 'url': self.params.get('writeurllink'),
2966 'webloc': self.params.get('writewebloclink'),
2967 'desktop': self.params.get('writedesktoplink'),
2968 }
2969 if self.params.get('writelink'):
2970 link_type = ('webloc' if sys.platform == 'darwin'
2971 else 'desktop' if sys.platform.startswith('linux')
2972 else 'url')
2973 write_links[link_type] = True
2974
2975 if any(should_write and not _write_link_file(link_type)
2976 for link_type, should_write in write_links.items()):
2977 return
2978
2979 def replace_info_dict(new_info):
2980 nonlocal info_dict
2981 if new_info == info_dict:
2982 return
2983 info_dict.clear()
2984 info_dict.update(new_info)
2985
2986 new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
2987 replace_info_dict(new_info)
2988
2989 if self.params.get('skip_download'):
2990 info_dict['filepath'] = temp_filename
2991 info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
2992 info_dict['__files_to_move'] = files_to_move
2993 replace_info_dict(self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict))
2994 info_dict['__write_download_archive'] = self.params.get('force_write_download_archive')
2995 else:
2996 # Download
2997 info_dict.setdefault('__postprocessors', [])
2998 try:
2999
3000 def existing_video_file(*filepaths):
3001 ext = info_dict.get('ext')
3002 converted = lambda file: replace_extension(file, self.params.get('final_ext') or ext, ext)
3003 file = self.existing_file(itertools.chain(*zip(map(converted, filepaths), filepaths)),
3004 default_overwrite=False)
3005 if file:
3006 info_dict['ext'] = os.path.splitext(file)[1][1:]
3007 return file
3008
3009 fd, success = None, True
3010 if info_dict.get('protocol') or info_dict.get('url'):
3011 fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
3012 if fd is not FFmpegFD and (
3013 info_dict.get('section_start') or info_dict.get('section_end')):
3014 msg = ('This format cannot be partially downloaded' if FFmpegFD.available()
3015 else 'You have requested downloading the video partially, but ffmpeg is not installed')
3016 self.report_error(f'{msg}. Aborting')
3017 return
3018
3019 if info_dict.get('requested_formats') is not None:
3020
3021 def compatible_formats(formats):
3022 # TODO: some formats actually allow this (mkv, webm, ogg, mp4), but not all of them.
3023 video_formats = [format for format in formats if format.get('vcodec') != 'none']
3024 audio_formats = [format for format in formats if format.get('acodec') != 'none']
3025 if len(video_formats) > 2 or len(audio_formats) > 2:
3026 return False
3027
3028 # Check extension
3029 exts = {format.get('ext') for format in formats}
3030 COMPATIBLE_EXTS = (
3031 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'},
3032 {'webm'},
3033 )
3034 for ext_sets in COMPATIBLE_EXTS:
3035 if ext_sets.issuperset(exts):
3036 return True
3037 # TODO: Check acodec/vcodec
3038 return False
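# Illustrative examples (hypothetical formats) for compatible_formats() above:
# one video plus one audio track within the mp4 family can be merged as-is,
# while webm video with m4a audio is incompatible and will be merged into mkv:
#
#   compatible_formats([{'vcodec': 'h264', 'acodec': 'none', 'ext': 'mp4'},
#                       {'vcodec': 'none', 'acodec': 'aac', 'ext': 'm4a'}])   # -> True
#   compatible_formats([{'vcodec': 'vp9', 'acodec': 'none', 'ext': 'webm'},
#                       {'vcodec': 'none', 'acodec': 'aac', 'ext': 'm4a'}])   # -> False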
3039
3040 requested_formats = info_dict['requested_formats']
3041 old_ext = info_dict['ext']
3042 if self.params.get('merge_output_format') is None:
3043 if not compatible_formats(requested_formats):
3044 info_dict['ext'] = 'mkv'
3045 self.report_warning(
3046 'Requested formats are incompatible for merge and will be merged into mkv')
3047 if (info_dict['ext'] == 'webm'
3048 and info_dict.get('thumbnails')
3049 # check with type instead of pp_key, __name__, or isinstance
3050 # since we don't want any custom PPs to trigger this
3051 and any(type(pp) == EmbedThumbnailPP for pp in self._pps['post_process'])): # noqa: E721
3052 info_dict['ext'] = 'mkv'
3053 self.report_warning(
3054 'webm doesn\'t support embedding a thumbnail; mkv will be used')
3055 new_ext = info_dict['ext']
3056
3057 def correct_ext(filename, ext=new_ext):
3058 if filename == '-':
3059 return filename
3060 filename_real_ext = os.path.splitext(filename)[1][1:]
3061 filename_wo_ext = (
3062 os.path.splitext(filename)[0]
3063 if filename_real_ext in (old_ext, new_ext)
3064 else filename)
3065 return f'{filename_wo_ext}.{ext}'
3066
3067 # Ensure filename always has a correct extension for successful merge
3068 full_filename = correct_ext(full_filename)
3069 temp_filename = correct_ext(temp_filename)
3070 dl_filename = existing_video_file(full_filename, temp_filename)
3071 info_dict['__real_download'] = False
3072
3073 merger = FFmpegMergerPP(self)
3074 downloaded = []
3075 if dl_filename is not None:
3076 self.report_file_already_downloaded(dl_filename)
3077 elif fd:
3078 for f in requested_formats if fd != FFmpegFD else []:
3079 f['filepath'] = fname = prepend_extension(
3080 correct_ext(temp_filename, info_dict['ext']),
3081 'f%s' % f['format_id'], info_dict['ext'])
3082 downloaded.append(fname)
3083 info_dict['url'] = '\n'.join(f['url'] for f in requested_formats)
3084 success, real_download = self.dl(temp_filename, info_dict)
3085 info_dict['__real_download'] = real_download
3086 else:
3087 if self.params.get('allow_unplayable_formats'):
3088 self.report_warning(
3089 'You have requested merging of multiple formats '
3090 'while also allowing unplayable formats to be downloaded. '
3091 'The formats won\'t be merged to prevent data corruption.')
3092 elif not merger.available:
3093 msg = 'You have requested merging of multiple formats but ffmpeg is not installed'
3094 if not self.params.get('ignoreerrors'):
3095 self.report_error(f'{msg}. Aborting due to --abort-on-error')
3096 return
3097 self.report_warning(f'{msg}. The formats won\'t be merged')
3098
3099 if temp_filename == '-':
3100 reason = ('using a downloader other than ffmpeg' if FFmpegFD.can_merge_formats(info_dict, self.params)
3101 else 'but the formats are incompatible for simultaneous download' if merger.available
3102 else 'but ffmpeg is not installed')
3103 self.report_warning(
3104 f'You have requested downloading multiple formats to stdout {reason}. '
3105 'The formats will be streamed one after the other')
3106 fname = temp_filename
3107 for f in requested_formats:
3108 new_info = dict(info_dict)
3109 del new_info['requested_formats']
3110 new_info.update(f)
3111 if temp_filename != '-':
3112 fname = prepend_extension(
3113 correct_ext(temp_filename, new_info['ext']),
3114 'f%s' % f['format_id'], new_info['ext'])
3115 if not self._ensure_dir_exists(fname):
3116 return
3117 f['filepath'] = fname
3118 downloaded.append(fname)
3119 partial_success, real_download = self.dl(fname, new_info)
3120 info_dict['__real_download'] = info_dict['__real_download'] or real_download
3121 success = success and partial_success
3122
3123 if downloaded and merger.available and not self.params.get('allow_unplayable_formats'):
3124 info_dict['__postprocessors'].append(merger)
3125 info_dict['__files_to_merge'] = downloaded
3126 # Even if nothing was downloaded anew, the merge itself only happens now
3127 info_dict['__real_download'] = True
3128 else:
3129 for file in downloaded:
3130 files_to_move[file] = None
3131 else:
3132 # Just a single file
3133 dl_filename = existing_video_file(full_filename, temp_filename)
3134 if dl_filename is None or dl_filename == temp_filename:
3135 # dl_filename == temp_filename could mean that the file was partially downloaded with --no-part.
3136 # So we should try to resume the download
3137 success, real_download = self.dl(temp_filename, info_dict)
3138 info_dict['__real_download'] = real_download
3139 else:
3140 self.report_file_already_downloaded(dl_filename)
3141
3142 dl_filename = dl_filename or temp_filename
3143 info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
3144
3145 except network_exceptions as err:
3146 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
3147 return
3148 except OSError as err:
3149 raise UnavailableVideoError(err)
3150 except (ContentTooShortError, ) as err:
3151 self.report_error(f'content too short (expected {err.expected} bytes and served {err.downloaded})')
3152 return
3153
3154 self._raise_pending_errors(info_dict)
3155 if success and full_filename != '-':
3156
3157 def fixup():
3158 do_fixup = True
3159 fixup_policy = self.params.get('fixup')
3160 vid = info_dict['id']
3161
3162 if fixup_policy in ('ignore', 'never'):
3163 return
3164 elif fixup_policy == 'warn':
3165 do_fixup = 'warn'
3166 elif fixup_policy != 'force':
3167 assert fixup_policy in ('detect_or_warn', None)
3168 if not info_dict.get('__real_download'):
3169 do_fixup = False
3170
3171 def ffmpeg_fixup(cndn, msg, cls):
3172 if not (do_fixup and cndn):
3173 return
3174 elif do_fixup == 'warn':
3175 self.report_warning(f'{vid}: {msg}')
3176 return
3177 pp = cls(self)
3178 if pp.available:
3179 info_dict['__postprocessors'].append(pp)
3180 else:
3181 self.report_warning(f'{vid}: {msg}. Install ffmpeg to fix this automatically')
3182
3183 stretched_ratio = info_dict.get('stretched_ratio')
3184 ffmpeg_fixup(
3185 stretched_ratio not in (1, None),
3186 f'Non-uniform pixel ratio {stretched_ratio}',
3187 FFmpegFixupStretchedPP)
3188
3189 ffmpeg_fixup(
3190 (info_dict.get('requested_formats') is None
3191 and info_dict.get('container') == 'm4a_dash'
3192 and info_dict.get('ext') == 'm4a'),
3193 'writing DASH m4a. Only some players support this container',
3194 FFmpegFixupM4aPP)
3195
3196 downloader = get_suitable_downloader(info_dict, self.params) if 'protocol' in info_dict else None
3197 downloader = downloader.FD_NAME if downloader else None
3198
3199 if info_dict.get('requested_formats') is None: # Not necessary if doing merger
3200 ffmpeg_fixup(downloader == 'hlsnative' and not self.params.get('hls_use_mpegts')
3201 or info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None,
3202 'Possible MPEG-TS in MP4 container or malformed AAC timestamps',
3203 FFmpegFixupM3u8PP)
3204 ffmpeg_fixup(info_dict.get('is_live') and downloader == 'DashSegmentsFD',
3205 'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP)
3206
3207 ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed timestamps detected', FFmpegFixupTimestampPP)
3208 ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed duration detected', FFmpegFixupDurationPP)
3209
3210 fixup()
3211 try:
3212 replace_info_dict(self.post_process(dl_filename, info_dict, files_to_move))
3213 except PostProcessingError as err:
3214 self.report_error('Postprocessing: %s' % str(err))
3215 return
3216 try:
3217 for ph in self._post_hooks:
3218 ph(info_dict['filepath'])
3219 except Exception as err:
3220 self.report_error('post hooks: %s' % str(err))
3221 return
3222 info_dict['__write_download_archive'] = True
3223
3224 assert info_dict is original_infodict # Make sure the info_dict was modified in-place
3225 if self.params.get('force_write_download_archive'):
3226 info_dict['__write_download_archive'] = True
3227 check_max_downloads()
3228
3229 def __download_wrapper(self, func):
3230 @functools.wraps(func)
3231 def wrapper(*args, **kwargs):
3232 try:
3233 res = func(*args, **kwargs)
3234 except UnavailableVideoError as e:
3235 self.report_error(e)
3236 except DownloadCancelled as e:
3237 self.to_screen(f'[info] {e}')
3238 if not self.params.get('break_per_url'):
3239 raise
3240 else:
3241 if self.params.get('dump_single_json', False):
3242 self.post_extract(res)
3243 self.to_stdout(json.dumps(self.sanitize_info(res)))
3244 return wrapper
3245
3246 def download(self, url_list):
3247 """Download a given list of URLs."""
3248 url_list = variadic(url_list) # Passing a single URL is a common mistake
3249 outtmpl = self.params['outtmpl']['default']
3250 if (len(url_list) > 1
3251 and outtmpl != '-'
3252 and '%' not in outtmpl
3253 and self.params.get('max_downloads') != 1):
3254 raise SameFileError(outtmpl)
3255
3256 for url in url_list:
3257 self.__download_wrapper(self.extract_info)(
3258 url, force_generic_extractor=self.params.get('force_generic_extractor', False))
3259
3260 return self._download_retcode
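# Illustrative usage sketch (hypothetical URL): download() is the main entry
# point for a list of URLs and returns the process return code:
#
#   with YoutubeDL({'outtmpl': '%(title)s.%(ext)s'}) as ydl:
#       retcode = ydl.download(['https://example.com/watch?v=xyz'])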
3261
3262 def download_with_info_file(self, info_filename):
3263 with contextlib.closing(fileinput.FileInput(
3264 [info_filename], mode='r',
3265 openhook=fileinput.hook_encoded('utf-8'))) as f:
3266 # FileInput doesn't have a read method, so we can't call json.load
3267 info = self.sanitize_info(json.loads('\n'.join(f)), self.params.get('clean_infojson', True))
3268 try:
3269 self.__download_wrapper(self.process_ie_result)(info, download=True)
3270 except (DownloadError, EntryNotInPlaylist, ReExtractInfo) as e:
3271 if not isinstance(e, EntryNotInPlaylist):
3272 self.to_stderr('\r')
3273 webpage_url = info.get('webpage_url')
3274 if webpage_url is not None:
3275 self.report_warning(f'The info failed to download: {e}; trying with URL {webpage_url}')
3276 return self.download([webpage_url])
3277 else:
3278 raise
3279 return self._download_retcode
3280
3281 @staticmethod
3282 def sanitize_info(info_dict, remove_private_keys=False):
3283 ''' Sanitize the infodict for conversion to JSON '''
3284 if info_dict is None:
3285 return info_dict
3286 info_dict.setdefault('epoch', int(time.time()))
3287 info_dict.setdefault('_type', 'video')
3288
3289 if remove_private_keys:
3290 reject = lambda k, v: v is None or k.startswith('__') or k in {
3291 'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries',
3292 'entries', 'filepath', '_filename', 'infojson_filename', 'original_url', 'playlist_autonumber',
3293 }
3294 else:
3295 reject = lambda k, v: False
3296
3297 def filter_fn(obj):
3298 if isinstance(obj, dict):
3299 return {k: filter_fn(v) for k, v in obj.items() if not reject(k, v)}
3300 elif isinstance(obj, (list, tuple, set, LazyList)):
3301 return list(map(filter_fn, obj))
3302 elif obj is None or isinstance(obj, (str, int, float, bool)):
3303 return obj
3304 else:
3305 return repr(obj)
3306
3307 return filter_fn(info_dict)
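# Illustrative sketch (hypothetical infodict): private/derived fields are only
# dropped when remove_private_keys is set, and values that are not
# JSON-serializable are replaced by their repr():
#
#   info = {'id': 'xyz', '_filename': 'a.mp4', '__real_download': True}
#   YoutubeDL.sanitize_info(info, remove_private_keys=True)
#   # -> {'id': 'xyz', 'epoch': ..., '_type': 'video'}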
3308
3309 @staticmethod
3310 def filter_requested_info(info_dict, actually_filter=True):
3311 ''' Alias of sanitize_info for backward compatibility '''
3312 return YoutubeDL.sanitize_info(info_dict, actually_filter)
3313
3314 def _delete_downloaded_files(self, *files_to_delete, info={}, msg=None):
3315 for filename in set(filter(None, files_to_delete)):
3316 if msg:
3317 self.to_screen(msg % filename)
3318 try:
3319 os.remove(filename)
3320 except OSError:
3321 self.report_warning(f'Unable to delete file {filename}')
3322 if filename in info.get('__files_to_move', []): # NB: Delete even if None
3323 del info['__files_to_move'][filename]
3324
3325 @staticmethod
3326 def post_extract(info_dict):
3327 def actual_post_extract(info_dict):
3328 if info_dict.get('_type') in ('playlist', 'multi_video'):
3329 for video_dict in info_dict.get('entries', {}):
3330 actual_post_extract(video_dict or {})
3331 return
3332
3333 post_extractor = info_dict.pop('__post_extractor', None) or (lambda: {})
3334 info_dict.update(post_extractor())
3335
3336 actual_post_extract(info_dict or {})
3337
3338 def run_pp(self, pp, infodict):
3339 files_to_delete = []
3340 if '__files_to_move' not in infodict:
3341 infodict['__files_to_move'] = {}
3342 try:
3343 files_to_delete, infodict = pp.run(infodict)
3344 except PostProcessingError as e:
3345 # Must be True and not 'only_download'
3346 if self.params.get('ignoreerrors') is True:
3347 self.report_error(e)
3348 return infodict
3349 raise
3350
3351 if not files_to_delete:
3352 return infodict
3353 if self.params.get('keepvideo', False):
3354 for f in files_to_delete:
3355 infodict['__files_to_move'].setdefault(f, '')
3356 else:
3357 self._delete_downloaded_files(
3358 *files_to_delete, info=infodict, msg='Deleting original file %s (pass -k to keep)')
3359 return infodict
3360
3361 def run_all_pps(self, key, info, *, additional_pps=None):
3362 self._forceprint(key, info)
3363 for pp in (additional_pps or []) + self._pps[key]:
3364 info = self.run_pp(pp, info)
3365 return info
3366
3367 def pre_process(self, ie_info, key='pre_process', files_to_move=None):
3368 info = dict(ie_info)
3369 info['__files_to_move'] = files_to_move or {}
3370 try:
3371 info = self.run_all_pps(key, info)
3372 except PostProcessingError as err:
3373 msg = f'Preprocessing: {err}'
3374 info.setdefault('__pending_error', msg)
3375 self.report_error(msg, is_error=False)
3376 return info, info.pop('__files_to_move', None)
3377
3378 def post_process(self, filename, info, files_to_move=None):
3379 """Run all the postprocessors on the given file."""
3380 info['filepath'] = filename
3381 info['__files_to_move'] = files_to_move or {}
3382 info = self.run_all_pps('post_process', info, additional_pps=info.get('__postprocessors'))
3383 info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
3384 del info['__files_to_move']
3385 return self.run_all_pps('after_move', info)
3386
3387 def _make_archive_id(self, info_dict):
3388 video_id = info_dict.get('id')
3389 if not video_id:
3390 return
3391 # Future-proof against any change in case
3392 # and for backwards compatibility with prior versions
3393 extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist
3394 if extractor is None:
3395 url = str_or_none(info_dict.get('url'))
3396 if not url:
3397 return
3398 # Try to find matching extractor for the URL and take its ie_key
3399 for ie_key, ie in self._ies.items():
3400 if ie.suitable(url):
3401 extractor = ie_key
3402 break
3403 else:
3404 return
3405 return f'{extractor.lower()} {video_id}'
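# Illustrative sketch: an archive ID is the lowercased extractor key followed
# by the video ID (hypothetical entry):
#
#   ydl._make_archive_id({'extractor_key': 'Youtube', 'id': 'xyz123'})
#   # -> 'youtube xyz123'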
3406
3407 def in_download_archive(self, info_dict):
3408 fn = self.params.get('download_archive')
3409 if fn is None:
3410 return False
3411
3412 vid_id = self._make_archive_id(info_dict)
3413 if not vid_id:
3414 return False # Incomplete video information
3415
3416 return vid_id in self.archive
3417
3418 def record_download_archive(self, info_dict):
3419 fn = self.params.get('download_archive')
3420 if fn is None:
3421 return
3422 vid_id = self._make_archive_id(info_dict)
3423 assert vid_id
3424 self.write_debug(f'Adding to archive: {vid_id}')
3425 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
3426 archive_file.write(vid_id + '\n')
3427 self.archive.add(vid_id)
3428
3429 @staticmethod
3430 def format_resolution(format, default='unknown'):
3431 if format.get('vcodec') == 'none' and format.get('acodec') != 'none':
3432 return 'audio only'
3433 if format.get('resolution') is not None:
3434 return format['resolution']
3435 if format.get('width') and format.get('height'):
3436 return '%dx%d' % (format['width'], format['height'])
3437 elif format.get('height'):
3438 return '%sp' % format['height']
3439 elif format.get('width'):
3440 return '%dx?' % format['width']
3441 return default
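# Illustrative examples of the fallbacks above (hypothetical format dicts):
#
#   YoutubeDL.format_resolution({'width': 1920, 'height': 1080})       # -> '1920x1080'
#   YoutubeDL.format_resolution({'height': 720})                       # -> '720p'
#   YoutubeDL.format_resolution({'vcodec': 'none', 'acodec': 'mp4a'})  # -> 'audio only'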
3442
3443 def _list_format_headers(self, *headers):
3444 if self.params.get('listformats_table', True) is not False:
3445 return [self._format_out(header, self.Styles.HEADERS) for header in headers]
3446 return headers
3447
3448 def _format_note(self, fdict):
3449 res = ''
3450 if fdict.get('ext') in ['f4f', 'f4m']:
3451 res += '(unsupported)'
3452 if fdict.get('language'):
3453 if res:
3454 res += ' '
3455 res += '[%s]' % fdict['language']
3456 if fdict.get('format_note') is not None:
3457 if res:
3458 res += ' '
3459 res += fdict['format_note']
3460 if fdict.get('tbr') is not None:
3461 if res:
3462 res += ', '
3463 res += '%4dk' % fdict['tbr']
3464 if fdict.get('container') is not None:
3465 if res:
3466 res += ', '
3467 res += '%s container' % fdict['container']
3468 if (fdict.get('vcodec') is not None
3469 and fdict.get('vcodec') != 'none'):
3470 if res:
3471 res += ', '
3472 res += fdict['vcodec']
3473 if fdict.get('vbr') is not None:
3474 res += '@'
3475 elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
3476 res += 'video@'
3477 if fdict.get('vbr') is not None:
3478 res += '%4dk' % fdict['vbr']
3479 if fdict.get('fps') is not None:
3480 if res:
3481 res += ', '
3482 res += '%sfps' % fdict['fps']
3483 if fdict.get('acodec') is not None:
3484 if res:
3485 res += ', '
3486 if fdict['acodec'] == 'none':
3487 res += 'video only'
3488 else:
3489 res += '%-5s' % fdict['acodec']
3490 elif fdict.get('abr') is not None:
3491 if res:
3492 res += ', '
3493 res += 'audio'
3494 if fdict.get('abr') is not None:
3495 res += '@%3dk' % fdict['abr']
3496 if fdict.get('asr') is not None:
3497 res += ' (%5dHz)' % fdict['asr']
3498 if fdict.get('filesize') is not None:
3499 if res:
3500 res += ', '
3501 res += format_bytes(fdict['filesize'])
3502 elif fdict.get('filesize_approx') is not None:
3503 if res:
3504 res += ', '
3505 res += '~' + format_bytes(fdict['filesize_approx'])
3506 return res
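# Illustrative sketch: for a hypothetical audio-only fdict such as
# {'language': 'en', 'tbr': 128, 'acodec': 'mp4a.40.2', 'abr': 128},
# the fragments above combine into roughly '[en],  128k, mp4a.40.2@128k'.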
3507
3508 def render_formats_table(self, info_dict):
3509 if not info_dict.get('formats') and not info_dict.get('url'):
3510 return None
3511
3512 formats = info_dict.get('formats', [info_dict])
3513 if self.params.get('listformats_table', True) is False:
3514 table = [
3515 [
3516 format_field(f, 'format_id'),
3517 format_field(f, 'ext'),
3518 self.format_resolution(f),
3519 self._format_note(f)
3520 ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
3521 return render_table(['format code', 'extension', 'resolution', 'note'], table, extra_gap=1)
3522
3523 delim = self._format_out('\u2502', self.Styles.DELIM, '|', test_encoding=True)
3524 table = [
3525 [
3526 self._format_out(format_field(f, 'format_id'), self.Styles.ID),
3527 format_field(f, 'ext'),
3528 format_field(f, func=self.format_resolution, ignore=('audio only', 'images')),
3529 format_field(f, 'fps', '\t%d'),
3530 format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''),
3531 delim,
3532 format_field(f, 'filesize', ' \t%s', func=format_bytes) + format_field(f, 'filesize_approx', '~\t%s', func=format_bytes),
3533 format_field(f, 'tbr', '\t%dk'),
3534 shorten_protocol_name(f.get('protocol', '')),
3535 delim,
3536 format_field(f, 'vcodec', default='unknown').replace(
3537 'none', 'images' if f.get('acodec') == 'none'
3538 else self._format_out('audio only', self.Styles.SUPPRESS)),
3539 format_field(f, 'vbr', '\t%dk'),
3540 format_field(f, 'acodec', default='unknown').replace(
3541 'none', '' if f.get('vcodec') == 'none'
3542 else self._format_out('video only', self.Styles.SUPPRESS)),
3543 format_field(f, 'abr', '\t%dk'),
3544 format_field(f, 'asr', '\t%s', func=format_decimal_suffix),
3545 join_nonempty(
3546 self._format_out('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None,
3547 format_field(f, 'language', '[%s]'),
3548 join_nonempty(format_field(f, 'format_note'),
3549 format_field(f, 'container', ignore=(None, f.get('ext'))),
3550 delim=', '),
3551 delim=' '),
3552 ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
3553 header_line = self._list_format_headers(
3554 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', delim, '\tFILESIZE', '\tTBR', 'PROTO',
3555 delim, 'VCODEC', '\tVBR', 'ACODEC', '\tABR', '\tASR', 'MORE INFO')
3556
3557 return render_table(
3558 header_line, table, hide_empty=True,
3559 delim=self._format_out('\u2500', self.Styles.DELIM, '-', test_encoding=True))
3560
3561 def render_thumbnails_table(self, info_dict):
3562 thumbnails = list(info_dict.get('thumbnails') or [])
3563 if not thumbnails:
3564 return None
3565 return render_table(
3566 self._list_format_headers('ID', 'Width', 'Height', 'URL'),
3567 [[t.get('id'), t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])
3568
3569 def render_subtitles_table(self, video_id, subtitles):
3570 def _row(lang, formats):
3571 exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats)))
3572 if len(set(names)) == 1:
3573 names = [] if names[0] == 'unknown' else names[:1]
3574 return [lang, ', '.join(names), ', '.join(exts)]
3575
3576 if not subtitles:
3577 return None
3578 return render_table(
3579 self._list_format_headers('Language', 'Name', 'Formats'),
3580 [_row(lang, formats) for lang, formats in subtitles.items()],
3581 hide_empty=True)
3582
3583 def __list_table(self, video_id, name, func, *args):
3584 table = func(*args)
3585 if not table:
3586 self.to_screen(f'{video_id} has no {name}')
3587 return
3588 self.to_screen(f'[info] Available {name} for {video_id}:')
3589 self.to_stdout(table)
3590
3591 def list_formats(self, info_dict):
3592 self.__list_table(info_dict['id'], 'formats', self.render_formats_table, info_dict)
3593
3594 def list_thumbnails(self, info_dict):
3595 self.__list_table(info_dict['id'], 'thumbnails', self.render_thumbnails_table, info_dict)
3596
3597 def list_subtitles(self, video_id, subtitles, name='subtitles'):
3598 self.__list_table(video_id, name, self.render_subtitles_table, video_id, subtitles)
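# Illustrative usage sketch for the list_* helpers above (hypothetical,
# already-extracted info dict):
#
#   info = ydl.extract_info('https://example.com/watch?v=xyz', download=False)
#   ydl.list_formats(info)
#   ydl.list_thumbnails(info)
#   ydl.list_subtitles(info['id'], info.get('subtitles') or {})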
3599
3600 def urlopen(self, req):
3601 """ Start an HTTP download """
3602 if isinstance(req, str):
3603 req = sanitized_Request(req)
3604 return self._opener.open(req, timeout=self._socket_timeout)
3605
3606 def print_debug_header(self):
3607 if not self.params.get('verbose'):
3608 return
3609
3610 # These imports can be slow, so import them only as needed
3611 from .extractor.extractors import _LAZY_LOADER
3612 from .extractor.extractors import _PLUGIN_CLASSES as plugin_extractors
3613
3614 def get_encoding(stream):
3615 ret = str(getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__))
3616 if not supports_terminal_sequences(stream):
3617 from .utils import WINDOWS_VT_MODE # Must be imported locally
3618 ret += ' (No VT)' if WINDOWS_VT_MODE is False else ' (No ANSI)'
3619 return ret
3620
3621 encoding_str = 'Encodings: locale %s, fs %s, pref %s, %s' % (
3622 locale.getpreferredencoding(),
3623 sys.getfilesystemencoding(),
3624 self.get_encoding(),
3625 ', '.join(
3626 f'{key} {get_encoding(stream)}' for key, stream in self._out_files.items_
3627 if stream is not None and key != 'console')
3628 )
3629
3630 logger = self.params.get('logger')
3631 if logger:
3632 write_debug = lambda msg: logger.debug(f'[debug] {msg}')
3633 write_debug(encoding_str)
3634 else:
3635 write_string(f'[debug] {encoding_str}\n', encoding=None)
3636 write_debug = lambda msg: self._write_string(f'[debug] {msg}\n')
3637
3638 source = detect_variant()
3639 write_debug(join_nonempty(
3640 'yt-dlp version', __version__,
3641 f'[{RELEASE_GIT_HEAD}]' if RELEASE_GIT_HEAD else '',
3642 '' if source == 'unknown' else f'({source})',
3643 delim=' '))
3644 if not _LAZY_LOADER:
3645 if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'):
3646 write_debug('Lazy loading extractors is forcibly disabled')
3647 else:
3648 write_debug('Lazy loading extractors is disabled')
3649 if plugin_extractors or plugin_postprocessors:
3650 write_debug('Plugins: %s' % [
3651 '%s%s' % (klass.__name__, '' if klass.__name__ == name else f' as {name}')
3652 for name, klass in itertools.chain(plugin_extractors.items(), plugin_postprocessors.items())])
3653 if self.params['compat_opts']:
3654 write_debug('Compatibility options: %s' % ', '.join(self.params['compat_opts']))
3655
3656 if source == 'source':
3657 try:
3658 stdout, _, _ = Popen.run(
3659 ['git', 'rev-parse', '--short', 'HEAD'],
3660 text=True, cwd=os.path.dirname(os.path.abspath(__file__)),
3661 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3662 if re.fullmatch('[0-9a-f]+', stdout.strip()):
3663 write_debug(f'Git HEAD: {stdout.strip()}')
3664 except Exception:
3665 with contextlib.suppress(Exception):
3666 sys.exc_clear()  # Python 2 only; raises AttributeError (suppressed) on Python 3
3667
3668 write_debug(system_identifier())
3669
3670 exe_versions, ffmpeg_features = FFmpegPostProcessor.get_versions_and_features(self)
3671 ffmpeg_features = {key for key, val in ffmpeg_features.items() if val}
3672 if ffmpeg_features:
3673 exe_versions['ffmpeg'] += ' (%s)' % ','.join(sorted(ffmpeg_features))
3674
3675 exe_versions['rtmpdump'] = rtmpdump_version()
3676 exe_versions['phantomjs'] = PhantomJSwrapper._version()
3677 exe_str = ', '.join(
3678 f'{exe} {v}' for exe, v in sorted(exe_versions.items()) if v
3679 ) or 'none'
3680 write_debug('exe versions: %s' % exe_str)
3681
3682 from .compat.compat_utils import get_package_info
3683 from .dependencies import available_dependencies
3684
3685 write_debug('Optional libraries: %s' % (', '.join(sorted({
3686 join_nonempty(*get_package_info(m)) for m in available_dependencies.values()
3687 })) or 'none'))
3688
3689 self._setup_opener()
3690 proxy_map = {}
3691 for handler in self._opener.handlers:
3692 if hasattr(handler, 'proxies'):
3693 proxy_map.update(handler.proxies)
3694 write_debug(f'Proxy map: {proxy_map}')
3695
3696 # Not implemented
3697 if False and self.params.get('call_home'):
3698 ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode()
3699 write_debug('Public IP address: %s' % ipaddr)
3700 latest_version = self.urlopen(
3701 'https://yt-dl.org/latest/version').read().decode()
3702 if version_tuple(latest_version) > version_tuple(__version__):
3703 self.report_warning(
3704 'You are using an outdated version (newest version: %s)! '
3705 'See https://yt-dl.org/update if you need help updating.' %
3706 latest_version)
3707
3708 def _setup_opener(self):
3709 if hasattr(self, '_opener'):
3710 return
3711 timeout_val = self.params.get('socket_timeout')
3712 self._socket_timeout = 20 if timeout_val is None else float(timeout_val)
3713
3714 opts_cookiesfrombrowser = self.params.get('cookiesfrombrowser')
3715 opts_cookiefile = self.params.get('cookiefile')
3716 opts_proxy = self.params.get('proxy')
3717
3718 self.cookiejar = load_cookies(opts_cookiefile, opts_cookiesfrombrowser, self)
3719
3720 cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
3721 if opts_proxy is not None:
3722 if opts_proxy == '':
3723 proxies = {}
3724 else:
3725 proxies = {'http': opts_proxy, 'https': opts_proxy}
3726 else:
3727 proxies = urllib.request.getproxies()
3728 # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805)
3729 if 'http' in proxies and 'https' not in proxies:
3730 proxies['https'] = proxies['http']
3731 proxy_handler = PerRequestProxyHandler(proxies)
3732
3733 debuglevel = 1 if self.params.get('debug_printtraffic') else 0
3734 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
3735 ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
3736 redirect_handler = YoutubeDLRedirectHandler()
3737 data_handler = urllib.request.DataHandler()
3738
3739 # When passing our own FileHandler instance, build_opener won't add the
3740 # default FileHandler and allows us to disable the file protocol, which
3741 # can be used for malicious purposes (see
3742 # https://github.com/ytdl-org/youtube-dl/issues/8227)
3743 file_handler = urllib.request.FileHandler()
3744
3745 def file_open(*args, **kwargs):
3746 raise urllib.error.URLError('file:// scheme is explicitly disabled in yt-dlp for security reasons')
3747 file_handler.file_open = file_open
3748
3749 opener = urllib.request.build_opener(
3750 proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)
3751
3752 # Delete the default user-agent header, which would otherwise apply in
3753 # cases where our custom HTTP handler doesn't come into play
3754 # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
3755 opener.addheaders = []
3756 self._opener = opener
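# Illustrative sketch of the proxy resolution above: an explicit 'proxy'
# parameter applies to both http and https, an empty string disables proxies
# entirely, and otherwise the environment (HTTP_PROXY etc.) is consulted:
#
#   YoutubeDL({'proxy': 'socks5://127.0.0.1:1080'})   # proxy all requests
#   YoutubeDL({'proxy': ''})                          # ignore env proxies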
3757
3758 def encode(self, s):
3759 if isinstance(s, bytes):
3760 return s # Already encoded
3761
3762 try:
3763 return s.encode(self.get_encoding())
3764 except UnicodeEncodeError as err:
3765 err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
3766 raise
3767
3768 def get_encoding(self):
3769 encoding = self.params.get('encoding')
3770 if encoding is None:
3771 encoding = preferredencoding()
3772 return encoding
3773
3774 def _write_info_json(self, label, ie_result, infofn, overwrite=None):
3775 ''' Write infojson and return True = written, 'exists' = already exists, False = skip, None = error '''
3776 if overwrite is None:
3777 overwrite = self.params.get('overwrites', True)
3778 if not self.params.get('writeinfojson'):
3779 return False
3780 elif not infofn:
3781 self.write_debug(f'Skipping writing {label} infojson')
3782 return False
3783 elif not self._ensure_dir_exists(infofn):
3784 return None
3785 elif not overwrite and os.path.exists(infofn):
3786 self.to_screen(f'[info] {label.title()} metadata is already present')
3787 return 'exists'
3788
3789 self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}')
3790 try:
3791 write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn)
3792 return True
3793 except OSError:
3794 self.report_error(f'Cannot write {label} metadata to JSON file {infofn}')
3795 return None
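# Illustrative sketch of the tri-state return contract above (note that both
# True and 'exists' are truthy, so callers can branch on the value directly):
#
#   written = ydl._write_info_json('video', info, 'out.info.json')
#   if written is None:
#       ...   # hard error; abort processing this video
#   elif written:
#       ...   # True or 'exists': a usable infojson is on disk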
3796
3797 def _write_description(self, label, ie_result, descfn):
3798 ''' Write description and return True = written, False = skip, None = error '''
3799 if not self.params.get('writedescription'):
3800 return False
3801 elif not descfn:
3802 self.write_debug(f'Skipping writing {label} description')
3803 return False
3804 elif not self._ensure_dir_exists(descfn):
3805 return None
3806 elif not self.params.get('overwrites', True) and os.path.exists(descfn):
3807 self.to_screen(f'[info] {label.title()} description is already present')
3808 elif ie_result.get('description') is None:
3809 self.report_warning(f'There\'s no {label} description to write')
3810 return False
3811 else:
3812 try:
3813 self.to_screen(f'[info] Writing {label} description to: {descfn}')
3814 with open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
3815 descfile.write(ie_result['description'])
3816 except OSError:
3817 self.report_error(f'Cannot write {label} description file {descfn}')
3818 return None
3819 return True
3820
3821 def _write_subtitles(self, info_dict, filename):
3822 ''' Write subtitles to file and return list of (sub_filename, final_sub_filename); or None if error'''
3823 ret = []
3824 subtitles = info_dict.get('requested_subtitles')
3825 if not subtitles or not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')):
3826 # Subtitle download errors are already handled in the relevant IE,
3827 # so this silently continues for IEs that don't support subtitles
3828 return ret
3829
3830 sub_filename_base = self.prepare_filename(info_dict, 'subtitle')
3831 if not sub_filename_base:
3832 self.to_screen('[info] Skipping writing video subtitles')
3833 return ret
3834 for sub_lang, sub_info in subtitles.items():
3835 sub_format = sub_info['ext']
3836 sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
3837 sub_filename_final = subtitles_filename(sub_filename_base, sub_lang, sub_format, info_dict.get('ext'))
3838 existing_sub = self.existing_file((sub_filename_final, sub_filename))
3839 if existing_sub:
3840 self.to_screen(f'[info] Video subtitle {sub_lang}.{sub_format} is already present')
3841 sub_info['filepath'] = existing_sub
3842 ret.append((existing_sub, sub_filename_final))
3843 continue
3844
3845 self.to_screen(f'[info] Writing video subtitles to: {sub_filename}')
3846 if sub_info.get('data') is not None:
3847 try:
3848 # Use newline='' to prevent conversion of newline characters
3849 # See https://github.com/ytdl-org/youtube-dl/issues/10268
3850 with open(sub_filename, 'w', encoding='utf-8', newline='') as subfile:
3851 subfile.write(sub_info['data'])
3852 sub_info['filepath'] = sub_filename
3853 ret.append((sub_filename, sub_filename_final))
3854 continue
3855 except OSError:
3856 self.report_error(f'Cannot write video subtitles file {sub_filename}')
3857 return None
3858
3859 try:
3860 sub_copy = sub_info.copy()
3861 sub_copy.setdefault('http_headers', info_dict.get('http_headers'))
3862 self.dl(sub_filename, sub_copy, subtitle=True)
3863 sub_info['filepath'] = sub_filename
3864 ret.append((sub_filename, sub_filename_final))
3865 except (DownloadError, ExtractorError, IOError, OSError, ValueError) + network_exceptions as err:
3866 msg = f'Unable to download video subtitles for {sub_lang!r}: {err}'
3867 if self.params.get('ignoreerrors') is not True: # False or 'only_download'
3868 if not self.params.get('ignoreerrors'):
3869 self.report_error(msg)
3870 raise DownloadError(msg)
3871 self.report_warning(msg)
3872 return ret
3873
3874 def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None):
3875 ''' Write thumbnails to file and return list of (thumb_filename, final_thumb_filename) '''
3876 write_all = self.params.get('write_all_thumbnails', False)
3877 thumbnails, ret = [], []
3878 if write_all or self.params.get('writethumbnail', False):
3879 thumbnails = info_dict.get('thumbnails') or []
3880 multiple = write_all and len(thumbnails) > 1
3881
3882 if thumb_filename_base is None:
3883 thumb_filename_base = filename
3884 if thumbnails and not thumb_filename_base:
3885 self.write_debug(f'Skipping writing {label} thumbnail')
3886 return ret
3887
3888 for idx, t in list(enumerate(thumbnails))[::-1]:
3889 thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg')
3890 thumb_display_id = f'{label} thumbnail {t["id"]}'
3891 thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext'))
3892 thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext'))
3893
3894 existing_thumb = self.existing_file((thumb_filename_final, thumb_filename))
3895 if existing_thumb:
3896 self.to_screen('[info] %s is already present' % (
3897 thumb_display_id if multiple else f'{label} thumbnail').capitalize())
3898 t['filepath'] = existing_thumb
3899 ret.append((existing_thumb, thumb_filename_final))
3900 else:
3901 self.to_screen(f'[info] Downloading {thumb_display_id} ...')
3902 try:
3903 uf = self.urlopen(sanitized_Request(t['url'], headers=t.get('http_headers', {})))
3904 self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}')
3905 with open(encodeFilename(thumb_filename), 'wb') as thumbf:
3906 shutil.copyfileobj(uf, thumbf)
3907 ret.append((thumb_filename, thumb_filename_final))
3908 t['filepath'] = thumb_filename
3909 except network_exceptions as err:
3910 thumbnails.pop(idx)
3911 self.report_warning(f'Unable to download {thumb_display_id}: {err}')
3912 if ret and not write_all:
3913 break
3914 return ret