import collections
import contextlib
import datetime
import errno
import fileinput
import functools
import io
import itertools
import json
import locale
import operator
import os
import random
import re
import shutil
import subprocess
import sys
import tempfile
import time
import tokenize
import traceback
import unicodedata
import urllib.request
from string import ascii_letters

from .cache import Cache
from .compat import HAS_LEGACY as compat_has_legacy
from .compat import compat_os_name, compat_shlex_quote
from .cookies import load_cookies
from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
from .downloader.rtmp import rtmpdump_version
from .extractor import gen_extractor_classes, get_info_extractor
from .extractor.openload import PhantomJSwrapper
from .minicurses import format_text
from .postprocessor import _PLUGIN_CLASSES as plugin_postprocessors
from .postprocessor import (
    EmbedThumbnailPP,
    FFmpegFixupDuplicateMoovPP,
    FFmpegFixupDurationPP,
    FFmpegFixupM3u8PP,
    FFmpegFixupM4aPP,
    FFmpegFixupStretchedPP,
    FFmpegFixupTimestampPP,
    FFmpegMergerPP,
    FFmpegPostProcessor,
    FFmpegVideoConvertorPP,
    MoveFilesAfterDownloadPP,
    get_postprocessor,
)
from .postprocessor.ffmpeg import resolve_mapping as resolve_recode_mapping
from .update import detect_variant
from .utils import (
    DEFAULT_OUTTMPL,
    IDENTITY,
    LINK_TEMPLATES,
    NO_DEFAULT,
    NUMBER_RE,
    OUTTMPL_TYPES,
    POSTPROCESS_WHEN,
    STR_FORMAT_RE_TMPL,
    STR_FORMAT_TYPES,
    ContentTooShortError,
    DateRange,
    DownloadCancelled,
    DownloadError,
    EntryNotInPlaylist,
    ExistingVideoReached,
    ExtractorError,
    GeoRestrictedError,
    HEADRequest,
    ISO3166Utils,
    LazyList,
    MaxDownloadsReached,
    Namespace,
    PagedList,
    PerRequestProxyHandler,
    PlaylistEntries,
    Popen,
    PostProcessingError,
    ReExtractInfo,
    RejectedVideoReached,
    SameFileError,
    UnavailableVideoError,
    YoutubeDLCookieProcessor,
    YoutubeDLHandler,
    YoutubeDLRedirectHandler,
    age_restricted,
    args_to_str,
    bug_reports_message,
    date_from_str,
    determine_ext,
    determine_protocol,
    encode_compat_str,
    encodeFilename,
    error_to_compat_str,
    escapeHTML,
    expand_path,
    filter_dict,
    float_or_none,
    format_bytes,
    format_decimal_suffix,
    format_field,
    formatSeconds,
    get_domain,
    int_or_none,
    iri_to_uri,
    join_nonempty,
    locked_file,
    make_dir,
    make_HTTPS_handler,
    merge_headers,
    network_exceptions,
    number_of_digits,
    orderedSet,
    parse_filesize,
    preferredencoding,
    prepend_extension,
    register_socks_protocols,
    remove_terminal_sequences,
    render_table,
    replace_extension,
    sanitize_filename,
    sanitize_path,
    sanitize_url,
    sanitized_Request,
    std_headers,
    str_or_none,
    strftime_or_none,
    subtitles_filename,
    supports_terminal_sequences,
    system_identifier,
    timetuple_from_msec,
    to_high_limit_path,
    traverse_obj,
    try_get,
    url_basename,
    variadic,
    version_tuple,
    windows_enable_vt_mode,
    write_json_file,
    write_string,
)
from .version import RELEASE_GIT_HEAD, __version__

if compat_os_name == 'nt':
    import ctypes


class YoutubeDL:
    """YoutubeDL class.

    YoutubeDL objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. Since, given a video URL, the downloader doesn't know how to
    extract all the needed information (a task that InfoExtractors do),
    it has to pass the URL to one of them.

    For this, YoutubeDL objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    YoutubeDL processes the extracted information, possibly using a File
    Downloader to download the video.

    YoutubeDL objects accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The YoutubeDL also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration". (A commented
    usage sketch follows this docstring.)

    Available options:

    username:          Username for authentication purposes.
    password:          Password for authentication purposes.
    videopassword:     Password for accessing a video.
    ap_mso:            Adobe Pass multiple-system operator identifier.
    ap_username:       Multiple-system operator account username.
    ap_password:       Multiple-system operator account password.
    usenetrc:          Use netrc for authentication instead.
    verbose:           Print additional info to stdout.
    quiet:             Do not print messages to stdout.
    no_warnings:       Do not print out anything for warnings.
    forceprint:        A dict with keys WHEN mapped to a list of templates to
                       print to stdout. The allowed keys are video or any of the
                       items in utils.POSTPROCESS_WHEN.
                       For compatibility, a single list is also accepted
    print_to_file:     A dict with keys WHEN (same as forceprint) mapped to
                       a list of tuples with (template, filename)
    forcejson:         Force printing info_dict as JSON.
    dump_single_json:  Force printing the info_dict of the whole playlist
                       (or video) as a single JSON line.
    force_write_download_archive: Force writing download archive regardless
                       of 'skip_download' or 'simulate'.
    simulate:          Do not download the video files. If unset (or None),
                       simulate only if listsubtitles, listformats or list_thumbnails is used
    format:            Video format code. see "FORMAT SELECTION" for more details.
                       You can also pass a function. The function takes 'ctx' as
                       argument and returns the formats to download.
                       See "build_format_selector" for an implementation
    allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded.
    ignore_no_formats_error: Ignore "No video formats" error. Useful for
                       extracting metadata even if the video is not actually
                       available for download (experimental)
    format_sort:       A list of fields by which to sort the video formats.
                       See "Sorting Formats" for more details.
    format_sort_force: Force the given format_sort. see "Sorting Formats"
                       for more details.
    prefer_free_formats: Whether to prefer video formats with free containers
                       over non-free ones of same quality.
    allow_multiple_video_streams:   Allow multiple video streams to be merged
                       into a single file
    allow_multiple_audio_streams:   Allow multiple audio streams to be merged
                       into a single file
    check_formats      Whether to test if the formats are downloadable.
                       Can be True (check all), False (check none),
                       'selected' (check selected formats),
                       or None (check only if requested by extractor)
    paths:             Dictionary of output paths. The allowed keys are 'home'
                       'temp' and the keys of OUTTMPL_TYPES (in utils.py)
    outtmpl:           Dictionary of templates for output names. Allowed keys
                       are 'default' and the keys of OUTTMPL_TYPES (in utils.py).
                       For compatibility with youtube-dl, a single string can also be used
    outtmpl_na_placeholder: Placeholder for unavailable meta fields.
    restrictfilenames: Do not allow "&" and spaces in file names
    trim_file_name:    Limit length of filename (extension excluded)
    windowsfilenames:  Force the filenames to be windows compatible
    ignoreerrors:      Do not stop on download/postprocessing errors.
                       Can be 'only_download' to ignore only download errors.
                       Default is 'only_download' for CLI, but False for API
    skip_playlist_after_errors: Number of allowed failures until the rest of
                       the playlist is skipped
    force_generic_extractor: Force downloader to use the generic extractor
    overwrites:        Overwrite all video and metadata files if True,
                       overwrite only non-video files if None
                       and don't overwrite any file if False
                       For compatibility with youtube-dl,
                       "nooverwrites" may also be used instead
    playlist_items:    Specific indices of playlist to download.
    playlistrandom:    Download playlist items in random order.
    lazy_playlist:     Process playlist entries as they are received.
    matchtitle:        Download only matching titles.
    rejecttitle:       Reject downloads for matching titles.
    logger:            Log messages to a logging.Logger instance.
    logtostderr:       Log messages to stderr instead of stdout.
    consoletitle:      Display progress in console window's titlebar.
    writedescription:  Write the video description to a .description file
    writeinfojson:     Write the video description to a .info.json file
    clean_infojson:    Remove private fields from the infojson
    getcomments:       Extract video comments. This will not be written to disk
                       unless writeinfojson is also given
    writeannotations:  Write the video annotations to a .annotations.xml file
    writethumbnail:    Write the thumbnail image to a file
    allow_playlist_files: Whether to write playlists' description, infojson etc
                       also to disk when using the 'write*' options
    write_all_thumbnails:  Write all thumbnail formats to files
    writelink:         Write an internet shortcut file, depending on the
                       current platform (.url/.webloc/.desktop)
    writeurllink:      Write a Windows internet shortcut file (.url)
    writewebloclink:   Write a macOS internet shortcut file (.webloc)
    writedesktoplink:  Write a Linux internet shortcut file (.desktop)
    writesubtitles:    Write the video subtitles to a file
    writeautomaticsub: Write the automatically generated subtitles to a file
    listsubtitles:     Lists all available subtitles for the video
    subtitlesformat:   The format code for subtitles
    subtitleslangs:    List of languages of the subtitles to download (can be regex).
                       The list may contain "all" to refer to all the available
                       subtitles. The language can be prefixed with a "-" to
                       exclude it from the requested languages. Eg: ['all', '-live_chat']
    keepvideo:         Keep the video file after post-processing
    daterange:         A DateRange object, download only if the upload_date is in the range.
    skip_download:     Skip the actual download of the video file
    cachedir:          Location of the cache files in the filesystem.
                       False to disable filesystem cache.
    noplaylist:        Download single video instead of a playlist if in doubt.
    age_limit:         An integer representing the user's age in years.
                       Unsuitable videos for the given age are skipped.
    min_views:         An integer representing the minimum view count the video
                       must have in order to not be skipped.
                       Videos without view count information are always
                       downloaded. None for no limit.
    max_views:         An integer representing the maximum view count.
                       Videos that are more popular than that are not
                       downloaded.
                       Videos without view count information are always
                       downloaded. None for no limit.
    download_archive:  File name of a file where all downloads are recorded.
                       Videos already present in the file are not downloaded
                       again.
    break_on_existing: Stop the download process after attempting to download a
                       file that is in the archive.
    break_on_reject:   Stop the download process when encountering a video that
                       has been filtered out.
    break_per_url:     Whether break_on_reject and break_on_existing
                       should act on each input URL as opposed to for the entire queue
    cookiefile:        File name or text stream from where cookies should be read and dumped to
    cookiesfrombrowser:  A tuple containing the name of the browser, the profile
                       name/path from where cookies are loaded, and the name of the
                       keyring. Eg: ('chrome', ) or ('vivaldi', 'default', 'BASICTEXT')
    legacyserverconnect: Explicitly allow HTTPS connection to servers that do not
                       support RFC 5746 secure renegotiation
    nocheckcertificate: Do not verify SSL certificates
    client_certificate: Path to client certificate file in PEM format. May include the private key
    client_certificate_key: Path to private key file for client certificate
    client_certificate_password: Password for client certificate private key, if encrypted.
                       If not provided and the key is encrypted, yt-dlp will ask interactively
    prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
                       At the moment, this is only supported by YouTube.
    http_headers:      A dictionary of custom headers to be used for all requests
    proxy:             URL of the proxy server to use
    geo_verification_proxy:  URL of the proxy to use for IP address verification
                       on geo-restricted sites.
    socket_timeout:    Time to wait for unresponsive hosts, in seconds
    bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
    debug_printtraffic:Print out sent and received HTTP traffic
    default_search:    Prepend this string if an input url is not valid.
                       'auto' for elaborate guessing
    encoding:          Use this encoding instead of the system-specified.
    extract_flat:      Do not resolve URLs, return the immediate result.
                       Pass in 'in_playlist' to only show this behavior for
                       playlist items.
    wait_for_video:    If given, wait for scheduled streams to become available.
                       The value should be a tuple containing the range
                       (min_secs, max_secs) to wait between retries
    postprocessors:    A list of dictionaries, each with an entry
                       * key: The name of the postprocessor. See
                         yt_dlp/postprocessor/__init__.py for a list.
                       * when: When to run the postprocessor. Allowed values are
                         the entries of utils.POSTPROCESS_WHEN
                         Assumed to be 'post_process' if not given
    progress_hooks:    A list of functions that get called on download
                       progress, with a dictionary with the entries
                       * status: One of "downloading", "error", or "finished".
                         Check this first and ignore unknown values.
                       * info_dict: The extracted info_dict

                       If status is one of "downloading", or "finished", the
                       following properties may also be present:
                       * filename: The final filename (always present)
                       * tmpfilename: The filename we're currently writing to
                       * downloaded_bytes: Bytes on disk
                       * total_bytes: Size of the whole file, None if unknown
                       * total_bytes_estimate: Guess of the eventual file size,
                         None if unavailable.
                       * elapsed: The number of seconds since download started.
                       * eta: The estimated time in seconds, None if unknown
                       * speed: The download speed in bytes/second, None if
                         unknown
                       * fragment_index: The counter of the currently
                         downloaded video fragment.
                       * fragment_count: The number of fragments (= individual
                         files that will be merged)

                       Progress hooks are guaranteed to be called at least once
                       (with status "finished") if the download is successful.
    postprocessor_hooks:  A list of functions that get called on postprocessing
                       progress, with a dictionary with the entries
                       * status: One of "started", "processing", or "finished".
                         Check this first and ignore unknown values.
                       * postprocessor: Name of the postprocessor
                       * info_dict: The extracted info_dict

                       Progress hooks are guaranteed to be called at least twice
                       (with status "started" and "finished") if the processing is successful.
    merge_output_format: Extension to use when merging formats.
    final_ext:         Expected final extension; used to detect when the file was
                       already downloaded and converted
    fixup:             Automatically correct known faults of the file.
                       One of:
                       - "never": do nothing
                       - "warn": only emit a warning
                       - "detect_or_warn": check whether we can do anything
                         about it, warn otherwise (default)
    source_address:    Client-side IP address to bind to.
    sleep_interval_requests: Number of seconds to sleep between requests
                       during extraction
    sleep_interval:    Number of seconds to sleep before each download when
                       used alone or a lower bound of a range for randomized
                       sleep before each download (minimum possible number
                       of seconds to sleep) when used along with
                       max_sleep_interval.
    max_sleep_interval:Upper bound of a range for randomized sleep before each
                       download (maximum possible number of seconds to sleep).
                       Must only be used along with sleep_interval.
                       Actual sleep time will be a random float from range
                       [sleep_interval; max_sleep_interval].
    sleep_interval_subtitles: Number of seconds to sleep before each subtitle download
    listformats:       Print an overview of available video formats and exit.
    list_thumbnails:   Print a table of all thumbnails and exit.
    match_filter:      A function that gets called for every video with the signature
                       (info_dict, *, incomplete: bool) -> Optional[str]
                       For backward compatibility with youtube-dl, the signature
                       (info_dict) -> Optional[str] is also allowed.
                       - If it returns a message, the video is ignored.
                       - If it returns None, the video is downloaded.
                       - If it returns utils.NO_DEFAULT, the user is interactively
                         asked whether to download the video.
                       match_filter_func in utils.py is one example for this.
    no_color:          Do not emit color codes in output.
    geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
                       HTTP header
    geo_bypass_country:
                       Two-letter ISO 3166-2 country code that will be used for
                       explicit geographic restriction bypassing via faking
                       X-Forwarded-For HTTP header
    geo_bypass_ip_block:
                       IP range in CIDR notation that will be used similarly to
                       geo_bypass_country
    external_downloader: A dictionary of protocol keys and the executable of the
                       external downloader to use for it. The allowed protocols
                       are default|http|ftp|m3u8|dash|rtsp|rtmp|mms.
                       Set the value to 'native' to use the native downloader
    compat_opts:       Compatibility options. See "Differences in default behavior".
                       The following options do not work when used through the API:
                       filename, abort-on-error, multistreams, no-live-chat, format-sort,
                       no-clean-infojson, no-playlist-metafiles, no-keep-subs, no-attach-info-json.
                       Refer to __init__.py for their implementation
    progress_template: Dictionary of templates for progress outputs.
                       Allowed keys are 'download', 'postprocess',
                       'download-title' (console title) and 'postprocess-title'.
                       The template is mapped on a dictionary with keys 'progress' and 'info'
    retry_sleep_functions: Dictionary of functions that take the number of attempts
                       as argument and return the time to sleep in seconds.
                       Allowed keys are 'http', 'fragment', 'file_access'
    download_ranges:   A function that gets called for every video with the signature
                       (info_dict, *, ydl) -> Iterable[Section].
                       Only the returned sections will be downloaded. Each Section contains:
                       * start_time: Start time of the section in seconds
                       * end_time: End time of the section in seconds
                       * title: Section title (Optional)
                       * index: Section number (Optional)

    The following parameters are not used by YoutubeDL itself; they are used by
    the downloader (see yt_dlp/downloader/common.py):
    nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize,
    max_filesize, test, noresizebuffer, retries, file_access_retries, fragment_retries,
    continuedl, noprogress, xattr_set_filesize, hls_use_mpegts, http_chunk_size,
    external_downloader_args, concurrent_fragment_downloads.

    The following options are used by the post processors:
    ffmpeg_location:   Location of the ffmpeg/avconv binary; either the path
                       to the binary or its containing directory.
    postprocessor_args: A dictionary of postprocessor/executable keys (in lower case)
                       and a list of additional command-line arguments for the
                       postprocessor/executable. The dict can also have "PP+EXE" keys
                       which are used when the given exe is used by the given PP.
                       Use 'default' as the name for arguments to be passed to all PP
                       For compatibility with youtube-dl, a single list of args
                       can also be used

    The following options are used by the extractors:
    extractor_retries: Number of times to retry for known errors
    dynamic_mpd:       Whether to process dynamic DASH manifests (default: True)
    hls_split_discontinuity: Split HLS playlists to different formats at
                       discontinuities such as ad breaks (default: False)
    extractor_args:    A dictionary of arguments to be passed to the extractors.
                       See "EXTRACTOR ARGUMENTS" for details.
                       Eg: {'youtube': {'skip': ['dash', 'hls']}}
    mark_watched:      Mark videos watched (even with --simulate). Only for YouTube

    The following options are deprecated and may be removed in the future:

    playliststart:     - Use playlist_items
                       Playlist item to start at.
    playlistend:       - Use playlist_items
                       Playlist item to end at.
    playlistreverse:   - Use playlist_items
                       Download playlist items in reverse order.
    forceurl:          - Use forceprint
                       Force printing final URL.
    forcetitle:        - Use forceprint
                       Force printing title.
    forceid:           - Use forceprint
                       Force printing ID.
    forcethumbnail:    - Use forceprint
                       Force printing thumbnail URL.
    forcedescription:  - Use forceprint
                       Force printing description.
    forcefilename:     - Use forceprint
                       Force printing final filename.
    forceduration:     - Use forceprint
                       Force printing duration.
    allsubtitles:      - Use subtitleslangs = ['all']
                       Downloads all the subtitles of the video
                       (requires writesubtitles or writeautomaticsub)
    include_ads:       - Doesn't work
                       Download ads as well
    call_home:         - Not implemented
                       Boolean, true iff we are allowed to contact the
                       yt-dlp servers for debugging.
    post_hooks:        - Register a custom postprocessor
                       A list of functions that get called as the final step
                       for each video file, after all postprocessors have been
                       called. The filename will be passed as the only argument.
    hls_prefer_native: - Use external_downloader = {'m3u8': 'native'} or {'m3u8': 'ffmpeg'}.
                       Use the native HLS downloader instead of ffmpeg/avconv
                       if True, otherwise use ffmpeg/avconv if False, otherwise
                       use downloader suggested by extractor if None.
    prefer_ffmpeg:     - avconv support is deprecated
                       If False, use avconv instead of ffmpeg if both are available,
                       otherwise prefer ffmpeg.
    youtube_include_dash_manifest: - Use extractor_args
                       If True (default), DASH manifests and related
                       data will be downloaded and processed by extractor.
                       You can reduce network I/O by disabling it if you don't
                       care about DASH. (only for youtube)
    youtube_include_hls_manifest: - Use extractor_args
                       If True (default), HLS manifests and related
                       data will be downloaded and processed by extractor.
                       You can reduce network I/O by disabling it if you don't
                       care about HLS. (only for youtube)
    """

    _NUMERIC_FIELDS = {
        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
        'timestamp', 'release_timestamp',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
    }

    _format_fields = {
        # NB: Keep in sync with the docstring of extractor/common.py
        'url', 'manifest_url', 'manifest_stream_number', 'ext', 'format', 'format_id', 'format_note',
        'width', 'height', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr',
        'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx',
        'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start',
        'preference', 'language', 'language_preference', 'quality', 'source_preference',
        'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'downloader_options',
        'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time'
    }
    _format_selection_exts = {
        'audio': {'m4a', 'mp3', 'ogg', 'aac'},
        'video': {'mp4', 'flv', 'webm', '3gp'},
        'storyboards': {'mhtml'},
    }

    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options.
        @param auto_init    Whether to load the default extractors and print header (if verbose).
                            Set to 'no_verbose_header' to not print the header
        """
        if params is None:
            params = {}
        self.params = params
        self._ies = {}
        self._ies_instances = {}
        self._pps = {k: [] for k in POSTPROCESS_WHEN}
        self._printed_messages = set()
        self._first_webpage_request = True
        self._post_hooks = []
        self._progress_hooks = []
        self._postprocessor_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        self._num_videos = 0
        self._playlist_level = 0
        self._playlist_urls = set()
        self.cache = Cache(self)

        windows_enable_vt_mode()
        stdout = sys.stderr if self.params.get('logtostderr') else sys.stdout
        self._out_files = Namespace(
            out=stdout,
            error=sys.stderr,
            screen=sys.stderr if self.params.get('quiet') else stdout,
            console=None if compat_os_name == 'nt' else next(
                filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None)
        )
        self._allow_colors = Namespace(**{
            type_: not self.params.get('no_color') and supports_terminal_sequences(stream)
            for type_, stream in self._out_files.items_ if type_ != 'console'
        })

        MIN_SUPPORTED, MIN_RECOMMENDED = (3, 6), (3, 7)
        current_version = sys.version_info[:2]
        if current_version < MIN_RECOMMENDED:
            msg = ('Support for Python version %d.%d has been deprecated. '
                   'See https://github.com/yt-dlp/yt-dlp/issues/3764 for more details. '
                   'You will receive only one more update on this version')
            if current_version < MIN_SUPPORTED:
                msg = 'Python version %d.%d is no longer supported'
            self.deprecation_warning(
                f'{msg}! Please update to Python %d.%d or above' % (*current_version, *MIN_RECOMMENDED))

        if self.params.get('allow_unplayable_formats'):
            self.report_warning(
                f'You have asked for {self._format_err("UNPLAYABLE", self.Styles.EMPHASIS)} formats to be listed/downloaded. '
                'This is a developer option intended for debugging. \n'
                'If you experience any issues while using this option, '
                f'{self._format_err("DO NOT", self.Styles.ERROR)} open a bug report')

        def check_deprecated(param, option, suggestion):
            if self.params.get(param) is not None:
                self.report_warning(f'{option} is deprecated. Use {suggestion} instead')
                return True
            return False

        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
        check_deprecated('useid', '--id', '-o "%(id)s.%(ext)s"')

        for msg in self.params.get('_warnings', []):
            self.report_warning(msg)
        for msg in self.params.get('_deprecation_warnings', []):
            self.deprecation_warning(msg)

        self.params['compat_opts'] = set(self.params.get('compat_opts', ()))
        if not compat_has_legacy:
            self.params['compat_opts'].add('no-compat-legacy')
        if 'list-formats' in self.params['compat_opts']:
            self.params['listformats_table'] = False

        if 'overwrites' not in self.params and self.params.get('nooverwrites') is not None:
            # nooverwrites was unnecessarily changed to overwrites
            # in 0c3d0f51778b153f65c21906031c2e091fcfb641
            # This ensures compatibility with both keys
            self.params['overwrites'] = not self.params['nooverwrites']
        elif self.params.get('overwrites') is None:
            self.params.pop('overwrites', None)
        else:
            self.params['nooverwrites'] = not self.params['overwrites']

        self.params.setdefault('forceprint', {})
        self.params.setdefault('print_to_file', {})

        # Compatibility with older syntax
        if not isinstance(params['forceprint'], dict):
            self.params['forceprint'] = {'video': params['forceprint']}

        if self.params.get('bidi_workaround', False):
            try:
                import pty
                master, slave = pty.openpty()
                width = shutil.get_terminal_size().columns
                width_args = [] if width is None else ['-w', str(width)]
                sp_kwargs = {'stdin': subprocess.PIPE, 'stdout': slave, 'stderr': self._out_files.error}
                try:
                    self._output_process = Popen(['bidiv'] + width_args, **sp_kwargs)
                except OSError:
                    self._output_process = Popen(['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning(
                        'Could not find fribidi executable, ignoring --bidi-workaround. '
                        'Make sure that fribidi is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if auto_init:
            if auto_init != 'no_verbose_header':
                self.print_debug_header()
            self.add_default_info_extractors()

        if (sys.platform != 'win32'
                and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
                and not self.params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        self._parse_outtmpl()

        # Creating format selector here allows us to catch syntax errors before the extraction
        self.format_selector = (
            self.params.get('format') if self.params.get('format') in (None, '-')
            else self.params['format'] if callable(self.params['format'])
            else self.build_format_selector(self.params['format']))

        # Set http_headers defaults according to std_headers
        self.params['http_headers'] = merge_headers(std_headers, self.params.get('http_headers', {}))

        hooks = {
            'post_hooks': self.add_post_hook,
            'progress_hooks': self.add_progress_hook,
            'postprocessor_hooks': self.add_postprocessor_hook,
        }
        for opt, fn in hooks.items():
            for ph in self.params.get(opt, []):
                fn(ph)

        for pp_def_raw in self.params.get('postprocessors', []):
            pp_def = dict(pp_def_raw)
            when = pp_def.pop('when', 'post_process')
            self.add_post_processor(
                get_postprocessor(pp_def.pop('key'))(self, **pp_def),
                when=when)

        self._setup_opener()
        register_socks_protocols()

        def preload_download_archive(fn):
            """Preload the archive, if any is specified"""
            if fn is None:
                return False
            self.write_debug(f'Loading archive file {fn!r}')
            try:
                with locked_file(fn, 'r', encoding='utf-8') as archive_file:
                    for line in archive_file:
                        self.archive.add(line.strip())
            except OSError as ioe:
                if ioe.errno != errno.ENOENT:
                    raise
                return False
            return True

        self.archive = set()
        preload_download_archive(self.params.get('download_archive'))

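    # A hedged sketch of the archive file consumed above (file name and IDs are
    # illustrative): each line is "<extractor key, lowercased> <video id>", e.g.
    #
    #     youtube dQw4w9WgXcQ
    #     vimeo 123456
    #
    # Every line is loaded into `self.archive`, and later checked by
    # `in_download_archive()` to skip already-downloaded videos.
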
    def warn_if_short_id(self, argv):
        # short YouTube ID starting with dash?
        idxs = [
            i for i, a in enumerate(argv)
            if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
        if idxs:
            correct_argv = (
                ['yt-dlp']
                + [a for i, a in enumerate(argv) if i not in idxs]
                + ['--'] + [argv[i] for i in idxs]
            )
            self.report_warning(
                'Long argument string detected. '
                'Use -- to separate parameters and URLs, like this:\n%s' %
                args_to_str(correct_argv))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        ie_key = ie.ie_key()
        self._ies[ie_key] = ie
        if not isinstance(ie, type):
            self._ies_instances[ie_key] = ie
            ie.set_downloader(self)

    def _get_info_extractor_class(self, ie_key):
        ie = self._ies.get(ie_key)
        if ie is None:
            ie = get_info_extractor(ie_key)
            self.add_info_extractor(ie)
        return ie
    def get_info_extractor(self, ie_key):
        """
        Get an instance of an IE with name ie_key. If there is no cached
        instance, create a new one and add it to the extractor list.
        """
        ie = self._ies_instances.get(ie_key)
        if ie is None:
            ie = get_info_extractor(ie_key)()
            self.add_info_extractor(ie)
        return ie

    def add_default_info_extractors(self):
        """
        Add the InfoExtractors returned by gen_extractors to the end of the list
        """
        for ie in gen_extractor_classes():
            self.add_info_extractor(ie)

    def add_post_processor(self, pp, when='post_process'):
        """Add a PostProcessor object to the end of the chain."""
        assert when in POSTPROCESS_WHEN, f'Invalid when={when}'
        self._pps[when].append(pp)
        pp.set_downloader(self)

    def add_post_hook(self, ph):
        """Add the post hook"""
        self._post_hooks.append(ph)

    def add_progress_hook(self, ph):
        """Add the download progress hook"""
        self._progress_hooks.append(ph)

    def add_postprocessor_hook(self, ph):
        """Add the postprocessing progress hook"""
        self._postprocessor_hooks.append(ph)
        for pps in self._pps.values():
            for pp in pps:
                pp.add_progress_hook(ph)

    def _bidi_workaround(self, message):
        if not hasattr(self, '_output_channel'):
            return message

        assert hasattr(self, '_output_process')
        assert isinstance(message, str)
        line_count = message.count('\n') + 1
        self._output_process.stdin.write((message + '\n').encode())
        self._output_process.stdin.flush()
        res = ''.join(self._output_channel.readline().decode()
                      for _ in range(line_count))
        return res[:-len('\n')]

    def _write_string(self, message, out=None, only_once=False):
        if only_once:
            if message in self._printed_messages:
                return
            self._printed_messages.add(message)
        write_string(message, out=out, encoding=self.params.get('encoding'))

    def to_stdout(self, message, skip_eol=False, quiet=None):
        """Print message to stdout"""
        if quiet is not None:
            self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument quiet. Use "YoutubeDL.to_screen" instead')
        if skip_eol is not False:
            self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument skip_eol. Use "YoutubeDL.to_screen" instead')
        self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.out)

    def to_screen(self, message, skip_eol=False, quiet=None):
        """Print message to screen if not in quiet mode"""
        if self.params.get('logger'):
            self.params['logger'].debug(message)
            return
        if (self.params.get('quiet') if quiet is None else quiet) and not self.params.get('verbose'):
            return
        self._write_string(
            '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')),
            self._out_files.screen)

    def to_stderr(self, message, only_once=False):
        """Print message to stderr"""
        assert isinstance(message, str)
        if self.params.get('logger'):
            self.params['logger'].error(message)
        else:
            self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.error, only_once=only_once)

    def _send_console_code(self, code):
        if compat_os_name == 'nt' or not self._out_files.console:
            return
        self._write_string(code, self._out_files.console)

    def to_console_title(self, message):
        if not self.params.get('consoletitle', False):
            return
        message = remove_terminal_sequences(message)
        if compat_os_name == 'nt':
            if ctypes.windll.kernel32.GetConsoleWindow():
                # c_wchar_p() might not be necessary if `message` is
                # already of type unicode()
                ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        else:
            self._send_console_code(f'\033]0;{message}\007')

    def save_console_title(self):
        if not self.params.get('consoletitle') or self.params.get('simulate'):
            return
        self._send_console_code('\033[22;0t')  # Save the title on stack

    def restore_console_title(self):
        if not self.params.get('consoletitle') or self.params.get('simulate'):
            return
        self._send_console_code('\033[23;0t')  # Restore the title from stack

    def __enter__(self):
        self.save_console_title()
        return self

    def __exit__(self, *args):
        self.restore_console_title()

        if self.params.get('cookiefile') is not None:
            self.cookiejar.save(ignore_discard=True, ignore_expires=True)

    def trouble(self, message=None, tb=None, is_error=True):
        """Determine the action to take when a download problem appears.

        Depending on whether the downloader has been configured to ignore
        download errors or not, this method may raise an exception when
        errors are found, after printing the message.

        @param tb          If given, is additional traceback information
        @param is_error    Whether to raise error according to ignoreerrors
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            if tb:
                self.to_stderr(tb)
        if not is_error:
            return
        if not self.params.get('ignoreerrors'):
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        self._download_retcode = 1

    Styles = Namespace(
        HEADERS='yellow',
        EMPHASIS='light blue',
        FILENAME='green',
        ID='green',
        DELIM='blue',
        ERROR='red',
        WARNING='yellow',
        SUPPRESS='light black',
    )

    def _format_text(self, handle, allow_colors, text, f, fallback=None, *, test_encoding=False):
        text = str(text)
        if test_encoding:
            original_text = text
            # handle.encoding can be None. See https://github.com/yt-dlp/yt-dlp/issues/2711
            encoding = self.params.get('encoding') or getattr(handle, 'encoding', None) or 'ascii'
            text = text.encode(encoding, 'ignore').decode(encoding)
            if fallback is not None and text != original_text:
                text = fallback
        return format_text(text, f) if allow_colors else text if fallback is None else fallback

    def _format_out(self, *args, **kwargs):
        return self._format_text(self._out_files.out, self._allow_colors.out, *args, **kwargs)

    def _format_screen(self, *args, **kwargs):
        return self._format_text(self._out_files.screen, self._allow_colors.screen, *args, **kwargs)

    def _format_err(self, *args, **kwargs):
        return self._format_text(self._out_files.error, self._allow_colors.error, *args, **kwargs)

    def report_warning(self, message, only_once=False):
        '''
        Print the message to stderr; it will be prefixed with 'WARNING:'.
        If stderr is a tty file, the 'WARNING:' will be colored.
        '''
        if self.params.get('logger') is not None:
            self.params['logger'].warning(message)
        else:
            if self.params.get('no_warnings'):
                return
            self.to_stderr(f'{self._format_err("WARNING:", self.Styles.WARNING)} {message}', only_once)

    def deprecation_warning(self, message):
        if self.params.get('logger') is not None:
            self.params['logger'].warning(f'DeprecationWarning: {message}')
        else:
            self.to_stderr(f'{self._format_err("DeprecationWarning:", self.Styles.ERROR)} {message}', True)

    def report_error(self, message, *args, **kwargs):
        '''
        Do the same as trouble, but prefixes the message with 'ERROR:', colored
        in red if stderr is a tty file.
        '''
        self.trouble(f'{self._format_err("ERROR:", self.Styles.ERROR)} {message}', *args, **kwargs)

    def write_debug(self, message, only_once=False):
        '''Log debug message, or print it to stderr if no logger is given'''
        if not self.params.get('verbose', False):
            return
        message = f'[debug] {message}'
        if self.params.get('logger'):
            self.params['logger'].debug(message)
        else:
            self.to_stderr(message, only_once)

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen('[download] %s has already been downloaded' % file_name)
        except UnicodeEncodeError:
            self.to_screen('[download] The file has already been downloaded')

    def report_file_delete(self, file_name):
        """Report that existing file will be deleted."""
        try:
            self.to_screen('Deleting existing file %s' % file_name)
        except UnicodeEncodeError:
            self.to_screen('Deleting existing file')

    def raise_no_formats(self, info, forced=False, *, msg=None):
        has_drm = info.get('_has_drm')
        ignored, expected = self.params.get('ignore_no_formats_error'), bool(msg)
        msg = msg or has_drm and 'This video is DRM protected' or 'No video formats found!'
        if forced or not ignored:
            raise ExtractorError(msg, video_id=info['id'], ie=info['extractor'],
                                 expected=has_drm or ignored or expected)
        else:
            self.report_warning(msg)

    def parse_outtmpl(self):
        self.deprecation_warning('"YoutubeDL.parse_outtmpl" is deprecated and may be removed in a future version')
        self._parse_outtmpl()
        return self.params['outtmpl']

    def _parse_outtmpl(self):
        sanitize = IDENTITY
        if self.params.get('restrictfilenames'):  # Remove spaces in the default template
            sanitize = lambda x: x.replace(' - ', ' ').replace(' ', '-')

        outtmpl = self.params.setdefault('outtmpl', {})
        if not isinstance(outtmpl, dict):
            self.params['outtmpl'] = outtmpl = {'default': outtmpl}
        outtmpl.update({k: sanitize(v) for k, v in DEFAULT_OUTTMPL.items() if outtmpl.get(k) is None})

    def get_output_path(self, dir_type='', filename=None):
        paths = self.params.get('paths', {})
        assert isinstance(paths, dict)
        path = os.path.join(
            expand_path(paths.get('home', '').strip()),
            expand_path(paths.get(dir_type, '').strip()) if dir_type else '',
            filename or '')
        return sanitize_path(path, force=self.params.get('windowsfilenames'))

    @staticmethod
    def _outtmpl_expandpath(outtmpl):
        # expand_path translates '%%' into '%' and '$$' into '$'
        # correspondingly that is not what we want since we need to keep
        # '%%' intact for template dict substitution step. Working around
        # with boundary-alike separator hack.
        sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
        outtmpl = outtmpl.replace('%%', f'%{sep}%').replace('$$', f'${sep}$')

        # outtmpl should be expand_path'ed before template dict substitution
        # because meta fields may contain env variables we don't want to
        # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
        # title "Hello $PATH", we don't want `$PATH` to be expanded.
        return expand_path(outtmpl).replace(sep, '')

    @staticmethod
    def escape_outtmpl(outtmpl):
        ''' Escape any remaining strings like %s, %abc% etc. '''
        return re.sub(
            STR_FORMAT_RE_TMPL.format('', '(?![%(\0])'),
            lambda mobj: ('' if mobj.group('has_key') else '%') + mobj.group(0),
            outtmpl)

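    # What `escape_outtmpl` does, on illustrative input:
    #
    #     YoutubeDL.escape_outtmpl('%(title)s - 100% done')
    #     # -> '%(title)s - 100%% done'
    #
    # Valid template keys are left intact; stray '%' characters are doubled so
    # the later `% info_dict` substitution cannot misread them.
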
    @classmethod
    def validate_outtmpl(cls, outtmpl):
        ''' @return None or Exception object '''
        outtmpl = re.sub(
            STR_FORMAT_RE_TMPL.format('[^)]*', '[ljhqBUDS]'),
            lambda mobj: f'{mobj.group(0)[:-1]}s',
            cls._outtmpl_expandpath(outtmpl))
        try:
            cls.escape_outtmpl(outtmpl) % collections.defaultdict(int)
            return None
        except ValueError as err:
            return err

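    # Sketch of `validate_outtmpl` behavior (templates are illustrative):
    #
    #     YoutubeDL.validate_outtmpl('%(title)s.%(ext)s')  # -> None (valid)
    #     YoutubeDL.validate_outtmpl('%(title)')           # -> ValueError instance
    #
    # Note that it returns the exception instead of raising it.
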
    @staticmethod
    def _copy_infodict(info_dict):
        info_dict = dict(info_dict)
        info_dict.pop('__postprocessors', None)
        info_dict.pop('__pending_error', None)
        return info_dict

    def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False):
        """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict
        @param sanitize    Whether to sanitize the output as a filename.
                           For backward compatibility, a function can also be passed
        """

        info_dict.setdefault('epoch', int(time.time()))  # keep epoch consistent once set

        info_dict = self._copy_infodict(info_dict)
        info_dict['duration_string'] = (  # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
            formatSeconds(info_dict['duration'], '-' if sanitize else ':')
            if info_dict.get('duration', None) is not None
            else None)
        info_dict['autonumber'] = int(self.params.get('autonumber_start', 1) - 1 + self._num_downloads)
        info_dict['video_autonumber'] = self._num_videos
        if info_dict.get('resolution') is None:
            info_dict['resolution'] = self.format_resolution(info_dict, default=None)

        # For fields playlist_index, playlist_autonumber and autonumber convert all occurrences
        # of %(field)s to %(field)0Nd for backward compatibility
        field_size_compat_map = {
            'playlist_index': number_of_digits(info_dict.get('__last_playlist_index') or 0),
            'playlist_autonumber': number_of_digits(info_dict.get('n_entries') or 0),
            'autonumber': self.params.get('autonumber_size') or 5,
        }

        TMPL_DICT = {}
        EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljhqBUDS]'))
        MATH_FUNCTIONS = {
            '+': float.__add__,
            '-': float.__sub__,
        }
        # Field is of the form key1.key2...
        # where keys (except first) can be string, int or slice
        FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)')
        MATH_FIELD_RE = rf'(?:{FIELD_RE}|-?{NUMBER_RE})'
        MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys()))
        INTERNAL_FORMAT_RE = re.compile(rf'''(?x)
            (?P<negate>-)?
            (?P<fields>{FIELD_RE})
            (?P<maths>(?:{MATH_OPERATORS_RE}{MATH_FIELD_RE})*)
            (?:>(?P<strf_format>.+?))?
            (?P<remaining>
                (?P<alternate>(?<!\\),[^|&)]+)?
                (?:&(?P<replacement>.*?))?
                (?:\|(?P<default>.*?))?
            )$''')

        def _traverse_infodict(k):
            k = k.split('.')
            if k[0] == '':
                k.pop(0)
            return traverse_obj(info_dict, k, is_user_input=True, traverse_string=True)

        def get_value(mdict):
            # Object traversal
            value = _traverse_infodict(mdict['fields'])
            # Negative
            if mdict['negate']:
                value = float_or_none(value)
                if value is not None:
                    value *= -1
            # Do maths
            offset_key = mdict['maths']
            if offset_key:
                value = float_or_none(value)
                operator = None
                while offset_key:
                    item = re.match(
                        MATH_FIELD_RE if operator else MATH_OPERATORS_RE,
                        offset_key).group(0)
                    offset_key = offset_key[len(item):]
                    if operator is None:
                        operator = MATH_FUNCTIONS[item]
                        continue
                    item, multiplier = (item[1:], -1) if item[0] == '-' else (item, 1)
                    offset = float_or_none(item)
                    if offset is None:
                        offset = float_or_none(_traverse_infodict(item))
                    try:
                        value = operator(value, multiplier * offset)
                    except (TypeError, ZeroDivisionError):
                        return None
                    operator = None
            # Datetime formatting
            if mdict['strf_format']:
                value = strftime_or_none(value, mdict['strf_format'].replace('\\,', ','))

            return value

        na = self.params.get('outtmpl_na_placeholder', 'NA')

        def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')):
            return sanitize_filename(str(value), restricted=restricted, is_id=(
                bool(re.search(r'(^|[_.])id(\.|$)', key))
                if 'filename-sanitization' in self.params['compat_opts']
                else NO_DEFAULT))

        sanitizer = sanitize if callable(sanitize) else filename_sanitizer
        sanitize = bool(sanitize)

        def _dumpjson_default(obj):
            if isinstance(obj, (set, LazyList)):
                return list(obj)
            return repr(obj)

        def create_key(outer_mobj):
            if not outer_mobj.group('has_key'):
                return outer_mobj.group(0)
            key = outer_mobj.group('key')
            mobj = re.match(INTERNAL_FORMAT_RE, key)
            initial_field = mobj.group('fields') if mobj else ''
            value, replacement, default = None, None, na
            while mobj:
                mobj = mobj.groupdict()
                default = mobj['default'] if mobj['default'] is not None else default
                value = get_value(mobj)
                replacement = mobj['replacement']
                if value is None and mobj['alternate']:
                    mobj = re.match(INTERNAL_FORMAT_RE, mobj['remaining'][1:])
                else:
                    break

            fmt = outer_mobj.group('format')
            if fmt == 's' and value is not None and key in field_size_compat_map.keys():
                fmt = f'0{field_size_compat_map[key]:d}d'

            value = default if value is None else value if replacement is None else replacement

            flags = outer_mobj.group('conversion') or ''
            str_fmt = f'{fmt[:-1]}s'
            if fmt[-1] == 'l':  # list
                delim = '\n' if '#' in flags else ', '
                value, fmt = delim.join(map(str, variadic(value, allowed_types=(str, bytes)))), str_fmt
            elif fmt[-1] == 'j':  # json
                value, fmt = json.dumps(value, default=_dumpjson_default, indent=4 if '#' in flags else None), str_fmt
            elif fmt[-1] == 'h':  # html
                value, fmt = escapeHTML(value), str_fmt
            elif fmt[-1] == 'q':  # quoted
                value = map(str, variadic(value) if '#' in flags else [value])
                value, fmt = ' '.join(map(compat_shlex_quote, value)), str_fmt
            elif fmt[-1] == 'B':  # bytes
                value = f'%{str_fmt}'.encode() % str(value).encode()
                value, fmt = value.decode('utf-8', 'ignore'), 's'
            elif fmt[-1] == 'U':  # unicode normalized
                value, fmt = unicodedata.normalize(
                    # "+" = compatibility equivalence, "#" = NFD
                    'NF%s%s' % ('K' if '+' in flags else '', 'D' if '#' in flags else 'C'),
                    value), str_fmt
            elif fmt[-1] == 'D':  # decimal suffix
                num_fmt, fmt = fmt[:-1].replace('#', ''), 's'
                value = format_decimal_suffix(value, f'%{num_fmt}f%s' if num_fmt else '%d%s',
                                              factor=1024 if '#' in flags else 1000)
            elif fmt[-1] == 'S':  # filename sanitization
                value, fmt = filename_sanitizer(initial_field, value, restricted='#' in flags), str_fmt
            elif fmt[-1] == 'c':
                if value:
                    value = str(value)[0]
                else:
                    fmt = str_fmt
            elif fmt[-1] not in 'rs':  # numeric
                value = float_or_none(value)
                if value is None:
                    value, fmt = default, 's'

            if sanitize:
                if fmt[-1] == 'r':
                    # If value is an object, sanitize might convert it to a string
                    # So we convert it to repr first
                    value, fmt = repr(value), str_fmt
                if fmt[-1] in 'csr':
                    value = sanitizer(initial_field, value)

            key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format'))
            TMPL_DICT[key] = value
            return '{prefix}%({key}){fmt}'.format(key=key, fmt=fmt, prefix=outer_mobj.group('prefix'))

        return EXTERNAL_FORMAT_RE.sub(create_key, outtmpl), TMPL_DICT

    def evaluate_outtmpl(self, outtmpl, info_dict, *args, **kwargs):
        outtmpl, info_dict = self.prepare_outtmpl(outtmpl, info_dict, *args, **kwargs)
        return self.escape_outtmpl(outtmpl) % info_dict

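    # End-to-end template evaluation, with made-up field values:
    #
    #     ydl = YoutubeDL({})
    #     ydl.evaluate_outtmpl('%(title)s-%(id)s.%(ext)s',
    #                          {'title': 'Example', 'id': 'abc123', 'ext': 'mp4'})
    #     # -> 'Example-abc123.mp4'
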
    def _prepare_filename(self, info_dict, *, outtmpl=None, tmpl_type=None):
        assert None in (outtmpl, tmpl_type), 'outtmpl and tmpl_type are mutually exclusive'
        if outtmpl is None:
            outtmpl = self.params['outtmpl'].get(tmpl_type or 'default', self.params['outtmpl']['default'])
        try:
            outtmpl = self._outtmpl_expandpath(outtmpl)
            filename = self.evaluate_outtmpl(outtmpl, info_dict, True)
            if not filename:
                return None

            if tmpl_type in ('', 'temp'):
                final_ext, ext = self.params.get('final_ext'), info_dict.get('ext')
                if final_ext and ext and final_ext != ext and filename.endswith(f'.{final_ext}'):
                    filename = replace_extension(filename, ext, final_ext)
            elif tmpl_type:
                force_ext = OUTTMPL_TYPES[tmpl_type]
                if force_ext:
                    filename = replace_extension(filename, force_ext, info_dict.get('ext'))

            # https://github.com/blackjack4494/youtube-dlc/issues/85
            trim_file_name = self.params.get('trim_file_name', False)
            if trim_file_name:
                no_ext, *ext = filename.rsplit('.', 2)
                filename = join_nonempty(no_ext[:trim_file_name], *ext, delim='.')

            return filename
        except ValueError as err:
            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
            return None

    def prepare_filename(self, info_dict, dir_type='', *, outtmpl=None, warn=False):
        """Generate the output filename"""
        if outtmpl:
            assert not dir_type, 'outtmpl and dir_type are mutually exclusive'
            dir_type = None
        filename = self._prepare_filename(info_dict, tmpl_type=dir_type, outtmpl=outtmpl)
        if not filename and dir_type not in ('', 'temp'):
            return ''

        if warn:
            if not self.params.get('paths'):
                pass
            elif filename == '-':
                self.report_warning('--paths is ignored when outputting to stdout', only_once=True)
            elif os.path.isabs(filename):
                self.report_warning('--paths is ignored since an absolute path is given in output template', only_once=True)
        if filename == '-' or not filename:
            return filename

        return self.get_output_path(dir_type, filename)

    def _match_entry(self, info_dict, incomplete=False, silent=False):
        """ Returns None if the file should be downloaded """

        video_title = info_dict.get('title', info_dict.get('id', 'video'))

        def check_filter():
            if 'title' in info_dict:
                # This can happen when we're just evaluating the playlist
                title = info_dict['title']
                matchtitle = self.params.get('matchtitle', False)
                if matchtitle:
                    if not re.search(matchtitle, title, re.IGNORECASE):
                        return '"' + title + '" title did not match pattern "' + matchtitle + '"'
                rejecttitle = self.params.get('rejecttitle', False)
                if rejecttitle:
                    if re.search(rejecttitle, title, re.IGNORECASE):
                        return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
            date = info_dict.get('upload_date')
            if date is not None:
                dateRange = self.params.get('daterange', DateRange())
                if date not in dateRange:
                    return f'{date_from_str(date).isoformat()} upload date is not in range {dateRange}'
            view_count = info_dict.get('view_count')
            if view_count is not None:
                min_views = self.params.get('min_views')
                if min_views is not None and view_count < min_views:
                    return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
                max_views = self.params.get('max_views')
                if max_views is not None and view_count > max_views:
                    return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
            if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
                return 'Skipping "%s" because it is age restricted' % video_title

            match_filter = self.params.get('match_filter')
            if match_filter is not None:
                try:
                    ret = match_filter(info_dict, incomplete=incomplete)
                except TypeError:
                    # For backward compatibility
                    ret = None if incomplete else match_filter(info_dict)
                if ret is NO_DEFAULT:
                    while True:
                        filename = self._format_screen(self.prepare_filename(info_dict), self.Styles.FILENAME)
                        reply = input(self._format_screen(
                            f'Download "{filename}"? (Y/n): ', self.Styles.EMPHASIS)).lower().strip()
                        if reply in {'y', ''}:
                            return None
                        elif reply == 'n':
                            return f'Skipping {video_title}'
                elif ret is not None:
                    return ret
            return None

        if self.in_download_archive(info_dict):
            reason = '%s has already been recorded in the archive' % video_title
            break_opt, break_err = 'break_on_existing', ExistingVideoReached
        else:
            reason = check_filter()
            break_opt, break_err = 'break_on_reject', RejectedVideoReached
        if reason is not None:
            if not silent:
                self.to_screen('[download] ' + reason)
            if self.params.get(break_opt, False):
                raise break_err()
        return reason

1369 @staticmethod
1370 def add_extra_info(info_dict, extra_info):
1371 '''Set the keys from extra_info in info dict if they are missing'''
1372 for key, value in extra_info.items():
1373 info_dict.setdefault(key, value)
1374
1375 def extract_info(self, url, download=True, ie_key=None, extra_info=None,
1376 process=True, force_generic_extractor=False):
1377 """
1378 Extract and return the information dictionary for the given URL.
1379
1380 Arguments:
1381 url -- URL to extract
1382
1383 Keyword arguments:
1384 download -- whether to download videos during extraction
1385 ie_key -- extractor key hint
1386 extra_info -- dictionary containing the extra values to add to each result
1387 process -- whether to resolve all unresolved references (URLs, playlist items),
1388 must be True for download to work.
1389 force_generic_extractor -- force using the generic extractor
1390 """
1391
1392 if extra_info is None:
1393 extra_info = {}
1394
1395 if not ie_key and force_generic_extractor:
1396 ie_key = 'Generic'
1397
1398 if ie_key:
1399 ies = {ie_key: self._get_info_extractor_class(ie_key)}
1400 else:
1401 ies = self._ies
1402
1403 for ie_key, ie in ies.items():
1404 if not ie.suitable(url):
1405 continue
1406
1407 if not ie.working():
1408 self.report_warning('The program functionality for this site has been marked as broken, '
1409 'and will probably not work.')
1410
1411 temp_id = ie.get_temp_id(url)
1412 if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
1413 self.to_screen(f'[{ie_key}] {temp_id}: has already been recorded in the archive')
1414 if self.params.get('break_on_existing', False):
1415 raise ExistingVideoReached()
1416 break
1417 return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process)
1418 else:
1419 self.report_error('no suitable InfoExtractor for URL %s' % url)
1420
1421 def _handle_extraction_exceptions(func):
1422 @functools.wraps(func)
1423 def wrapper(self, *args, **kwargs):
1424 while True:
1425 try:
1426 return func(self, *args, **kwargs)
1427 except (DownloadCancelled, LazyList.IndexError, PagedList.IndexError):
1428 raise
1429 except ReExtractInfo as e:
1430 if e.expected:
1431 self.to_screen(f'{e}; Re-extracting data')
1432 else:
1433 self.to_stderr('\r')
1434 self.report_warning(f'{e}; Re-extracting data')
1435 continue
1436 except GeoRestrictedError as e:
1437 msg = e.msg
1438 if e.countries:
1439 msg += '\nThis video is available in %s.' % ', '.join(
1440 map(ISO3166Utils.short2full, e.countries))
1441 msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to work around this.'
1442 self.report_error(msg)
1443 except ExtractorError as e: # An error we somewhat expected
1444 self.report_error(str(e), e.format_traceback())
1445 except Exception as e:
1446 if self.params.get('ignoreerrors'):
1447 self.report_error(str(e), tb=encode_compat_str(traceback.format_exc()))
1448 else:
1449 raise
1450 break
1451 return wrapper
1452
1453 def _wait_for_video(self, ie_result):
1454 if (not self.params.get('wait_for_video')
1455 or ie_result.get('_type', 'video') != 'video'
1456 or ie_result.get('formats') or ie_result.get('url')):
1457 return
1458
1459 format_dur = lambda dur: '%02d:%02d:%02d' % timetuple_from_msec(dur * 1000)[:-1]
1460 last_msg = ''
1461
1462 def progress(msg):
1463 nonlocal last_msg
1464 self.to_screen(msg + ' ' * (len(last_msg) - len(msg)) + '\r', skip_eol=True)
1465 last_msg = msg
1466
1467 min_wait, max_wait = self.params.get('wait_for_video')
1468 diff = try_get(ie_result, lambda x: x['release_timestamp'] - time.time())
1469 if diff is None and ie_result.get('live_status') == 'is_upcoming':
1470 diff = round(random.uniform(min_wait, max_wait) if (max_wait and min_wait) else (max_wait or min_wait), 0)
1471 self.report_warning('Release time of video is not known')
1472 elif (diff or 0) <= 0:
1473 self.report_warning('Video should already be available according to extracted info')
1474 diff = min(max(diff or 0, min_wait or 0), max_wait or float('inf'))
1475 self.to_screen(f'[wait] Waiting for {format_dur(diff)} - Press Ctrl+C to try now')
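# Illustrative only: with params['wait_for_video'] = (60, 3600) and no usable
# release_timestamp, a random delay in [60, 3600] seconds is chosen; a known
# release time is clamped into the same [min_wait, max_wait] window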
1476
1477 wait_till = time.time() + diff
1478 try:
1479 while True:
1480 diff = wait_till - time.time()
1481 if diff <= 0:
1482 progress('')
1483 raise ReExtractInfo('[wait] Wait period ended', expected=True)
1484 progress(f'[wait] Remaining time until next attempt: {self._format_screen(format_dur(diff), self.Styles.EMPHASIS)}')
1485 time.sleep(1)
1486 except KeyboardInterrupt:
1487 progress('')
1488 raise ReExtractInfo('[wait] Interrupted by user', expected=True)
1489 except BaseException as e:
1490 if not isinstance(e, ReExtractInfo):
1491 self.to_screen('')
1492 raise
1493
1494 @_handle_extraction_exceptions
1495 def __extract_info(self, url, ie, download, extra_info, process):
1496 ie_result = ie.extract(url)
1497 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
1498 self.report_warning(f'Extractor {ie.IE_NAME} returned nothing{bug_reports_message()}')
1499 return
1500 if isinstance(ie_result, list):
1501 # Backwards compatibility: old IE result format
1502 ie_result = {
1503 '_type': 'compat_list',
1504 'entries': ie_result,
1505 }
1506 if extra_info.get('original_url'):
1507 ie_result.setdefault('original_url', extra_info['original_url'])
1508 self.add_default_extra_info(ie_result, ie, url)
1509 if process:
1510 self._wait_for_video(ie_result)
1511 return self.process_ie_result(ie_result, download, extra_info)
1512 else:
1513 return ie_result
1514
1515 def add_default_extra_info(self, ie_result, ie, url):
1516 if url is not None:
1517 self.add_extra_info(ie_result, {
1518 'webpage_url': url,
1519 'original_url': url,
1520 })
1521 webpage_url = ie_result.get('webpage_url')
1522 if webpage_url:
1523 self.add_extra_info(ie_result, {
1524 'webpage_url_basename': url_basename(webpage_url),
1525 'webpage_url_domain': get_domain(webpage_url),
1526 })
1527 if ie is not None:
1528 self.add_extra_info(ie_result, {
1529 'extractor': ie.IE_NAME,
1530 'extractor_key': ie.ie_key(),
1531 })
1532
1533 def process_ie_result(self, ie_result, download=True, extra_info=None):
1534 """
1535 Take the result of the ie (may be modified) and resolve all unresolved
1536 references (URLs, playlist items).
1537
1538 It will also download the videos if 'download'.
1539 Returns the resolved ie_result.
1540 """
1541 if extra_info is None:
1542 extra_info = {}
1543 result_type = ie_result.get('_type', 'video')
1544
1545 if result_type in ('url', 'url_transparent'):
1546 ie_result['url'] = sanitize_url(ie_result['url'])
1547 if ie_result.get('original_url'):
1548 extra_info.setdefault('original_url', ie_result['original_url'])
1549
1550 extract_flat = self.params.get('extract_flat', False)
1551 if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
1552 or extract_flat is True):
1553 info_copy = ie_result.copy()
1554 ie = try_get(ie_result.get('ie_key'), self.get_info_extractor)
1555 if ie and not ie_result.get('id'):
1556 info_copy['id'] = ie.get_temp_id(ie_result['url'])
1557 self.add_default_extra_info(info_copy, ie, ie_result['url'])
1558 self.add_extra_info(info_copy, extra_info)
1559 info_copy, _ = self.pre_process(info_copy)
1560 self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True)
1561 self._raise_pending_errors(info_copy)
1562 if self.params.get('force_write_download_archive', False):
1563 self.record_download_archive(info_copy)
1564 return ie_result
1565
1566 if result_type == 'video':
1567 self.add_extra_info(ie_result, extra_info)
1568 ie_result = self.process_video_result(ie_result, download=download)
1569 self._raise_pending_errors(ie_result)
1570 additional_urls = (ie_result or {}).get('additional_urls')
1571 if additional_urls:
1572 # TODO: Improve MetadataParserPP to allow setting a list
1573 if isinstance(additional_urls, str):
1574 additional_urls = [additional_urls]
1575 self.to_screen(
1576 '[info] %s: %d additional URL(s) requested' % (ie_result['id'], len(additional_urls)))
1577 self.write_debug('Additional URLs: "%s"' % '", "'.join(additional_urls))
1578 ie_result['additional_entries'] = [
1579 self.extract_info(
1580 url, download, extra_info=extra_info,
1581 force_generic_extractor=self.params.get('force_generic_extractor'))
1582 for url in additional_urls
1583 ]
1584 return ie_result
1585 elif result_type == 'url':
1586 # We have to add extra_info to the results because it may be
1587 # contained in a playlist
1588 return self.extract_info(
1589 ie_result['url'], download,
1590 ie_key=ie_result.get('ie_key'),
1591 extra_info=extra_info)
1592 elif result_type == 'url_transparent':
1593 # Use the information from the embedding page
1594 info = self.extract_info(
1595 ie_result['url'], ie_key=ie_result.get('ie_key'),
1596 extra_info=extra_info, download=False, process=False)
1597
1598 # extract_info may return None when ignoreerrors is enabled and
1599 # extraction failed with an error, don't crash and return early
1600 # in this case
1601 if not info:
1602 return info
1603
1604 exempted_fields = {'_type', 'url', 'ie_key'}
1605 if not ie_result.get('section_end') and ie_result.get('section_start') is None:
1606 # For video clips, the id etc of the clip extractor should be used
1607 exempted_fields |= {'id', 'extractor', 'extractor_key'}
1608
1609 new_result = info.copy()
1610 new_result.update(filter_dict(ie_result, lambda k, v: v is not None and k not in exempted_fields))
1611
1612 # Extracted info may not be a video result (i.e.
1613 # info.get('_type', 'video') != 'video') but rather a url or
1614 # url_transparent. In such cases, outer metadata (from ie_result)
1615 # should be propagated to the inner one (info). For this to happen,
1616 # _type of info should be overridden with url_transparent. This
1617 # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
1618 if new_result.get('_type') == 'url':
1619 new_result['_type'] = 'url_transparent'
1620
1621 return self.process_ie_result(
1622 new_result, download=download, extra_info=extra_info)
1623 elif result_type in ('playlist', 'multi_video'):
1624 # Protect from infinite recursion due to recursively nested playlists
1625 # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
1626 webpage_url = ie_result['webpage_url']
1627 if webpage_url in self._playlist_urls:
1628 self.to_screen(
1629 '[download] Skipping already downloaded playlist: %s'
1630 % (ie_result.get('title') or ie_result.get('id')))
1631 return
1632
1633 self._playlist_level += 1
1634 self._playlist_urls.add(webpage_url)
1635 self._fill_common_fields(ie_result, False)
1636 self._sanitize_thumbnails(ie_result)
1637 try:
1638 return self.__process_playlist(ie_result, download)
1639 finally:
1640 self._playlist_level -= 1
1641 if not self._playlist_level:
1642 self._playlist_urls.clear()
1643 elif result_type == 'compat_list':
1644 self.report_warning(
1645 'Extractor %s returned a compat_list result. '
1646 'It needs to be updated.' % ie_result.get('extractor'))
1647
1648 def _fixup(r):
1649 self.add_extra_info(r, {
1650 'extractor': ie_result['extractor'],
1651 'webpage_url': ie_result['webpage_url'],
1652 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1653 'webpage_url_domain': get_domain(ie_result['webpage_url']),
1654 'extractor_key': ie_result['extractor_key'],
1655 })
1656 return r
1657 ie_result['entries'] = [
1658 self.process_ie_result(_fixup(r), download, extra_info)
1659 for r in ie_result['entries']
1660 ]
1661 return ie_result
1662 else:
1663 raise Exception('Invalid result type: %s' % result_type)
1664
1665 def _ensure_dir_exists(self, path):
1666 return make_dir(path, self.report_error)
1667
1668 @staticmethod
1669 def _playlist_infodict(ie_result, **kwargs):
1670 return {
1671 **ie_result,
1672 'playlist': ie_result.get('title') or ie_result.get('id'),
1673 'playlist_id': ie_result.get('id'),
1674 'playlist_title': ie_result.get('title'),
1675 'playlist_uploader': ie_result.get('uploader'),
1676 'playlist_uploader_id': ie_result.get('uploader_id'),
1677 'playlist_index': 0,
1678 **kwargs,
1679 }
1680
1681 def __process_playlist(self, ie_result, download):
1682 """Process each entry in the playlist"""
1683 title = ie_result.get('title') or ie_result.get('id') or '<Untitled>'
1684 self.to_screen(f'[download] Downloading playlist: {title}')
1685
1686 all_entries = PlaylistEntries(self, ie_result)
1687 entries = orderedSet(all_entries.get_requested_items(), lazy=True)
1688
1689 lazy = self.params.get('lazy_playlist')
1690 if lazy:
1691 resolved_entries, n_entries = [], 'N/A'
1692 ie_result['requested_entries'], ie_result['entries'] = None, None
1693 else:
1694 entries = resolved_entries = list(entries)
1695 n_entries = len(resolved_entries)
1696 ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries)) or ([], [])
1697 if not ie_result.get('playlist_count'):
1698 # Better to do this after potentially exhausting entries
1699 ie_result['playlist_count'] = all_entries.get_full_count()
1700
1701 _infojson_written = False
1702 write_playlist_files = self.params.get('allow_playlist_files', True)
1703 if write_playlist_files and self.params.get('list_thumbnails'):
1704 self.list_thumbnails(ie_result)
1705 if write_playlist_files and not self.params.get('simulate'):
1706 ie_copy = self._playlist_infodict(ie_result, n_entries=int_or_none(n_entries))
1707 _infojson_written = self._write_info_json(
1708 'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson'))
1709 if _infojson_written is None:
1710 return
1711 if self._write_description('playlist', ie_result,
1712 self.prepare_filename(ie_copy, 'pl_description')) is None:
1713 return
1714 # TODO: This should be passed to ThumbnailsConvertor if necessary
1715 self._write_thumbnails('playlist', ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail'))
1716
1717 if lazy:
1718 if self.params.get('playlistreverse') or self.params.get('playlistrandom'):
1719 self.report_warning('playlistreverse and playlistrandom are not supported with lazy_playlist', only_once=True)
1720 elif self.params.get('playlistreverse'):
1721 entries.reverse()
1722 elif self.params.get('playlistrandom'):
1723 random.shuffle(entries)
1724
1725 self.to_screen(f'[{ie_result["extractor"]}] Playlist {title}: Downloading {n_entries} videos'
1726 f'{format_field(ie_result, "playlist_count", " of %s")}')
1727
1728 failures = 0
1729 max_failures = self.params.get('skip_playlist_after_errors') or float('inf')
1730 for i, (playlist_index, entry) in enumerate(entries):
1731 if lazy:
1732 resolved_entries.append((playlist_index, entry))
1733
1734 # TODO: Add auto-generated fields
1735 if not entry or self._match_entry(entry, incomplete=True) is not None:
1736 continue
1737
1738 self.to_screen('[download] Downloading video %s of %s' % (
1739 self._format_screen(i + 1, self.Styles.ID), self._format_screen(n_entries, self.Styles.EMPHASIS)))
1740
1741 entry['__x_forwarded_for_ip'] = ie_result.get('__x_forwarded_for_ip')
1742 if not lazy and 'playlist-index' in self.params.get('compat_opts', []):
1743 playlist_index = ie_result['requested_entries'][i]
1744
1745 entry_result = self.__process_iterable_entry(entry, download, {
1746 'n_entries': int_or_none(n_entries),
1747 '__last_playlist_index': max(ie_result['requested_entries'] or (0, 0)),
1748 'playlist_count': ie_result.get('playlist_count'),
1749 'playlist_index': playlist_index,
1750 'playlist_autonumber': i + 1,
1751 'playlist': title,
1752 'playlist_id': ie_result.get('id'),
1753 'playlist_title': ie_result.get('title'),
1754 'playlist_uploader': ie_result.get('uploader'),
1755 'playlist_uploader_id': ie_result.get('uploader_id'),
1756 'extractor': ie_result['extractor'],
1757 'webpage_url': ie_result['webpage_url'],
1758 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1759 'webpage_url_domain': get_domain(ie_result['webpage_url']),
1760 'extractor_key': ie_result['extractor_key'],
1761 })
1762 if not entry_result:
1763 failures += 1
1764 if failures >= max_failures:
1765 self.report_error(
1766 f'Skipping the remaining entries in playlist "{title}" since {failures} items failed extraction')
1767 break
1768 resolved_entries[i] = (playlist_index, entry_result)
1769
1770 # Update with processed data
1771 ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries)) or ([], [])
1772
1773 # Write the updated info to json
1774 if _infojson_written is True and self._write_info_json(
1775 'updated playlist', ie_result,
1776 self.prepare_filename(ie_copy, 'pl_infojson'), overwrite=True) is None:
1777 return
1778
1779 ie_result = self.run_all_pps('playlist', ie_result)
1780 self.to_screen(f'[download] Finished downloading playlist: {title}')
1781 return ie_result
1782
1783 @_handle_extraction_exceptions
1784 def __process_iterable_entry(self, entry, download, extra_info):
1785 return self.process_ie_result(
1786 entry, download=download, extra_info=extra_info)
1787
1788 def _build_format_filter(self, filter_spec):
1789 " Returns a function to filter the formats according to the filter_spec "
1790
1791 OPERATORS = {
1792 '<': operator.lt,
1793 '<=': operator.le,
1794 '>': operator.gt,
1795 '>=': operator.ge,
1796 '=': operator.eq,
1797 '!=': operator.ne,
1798 }
1799 operator_rex = re.compile(r'''(?x)\s*
1800 (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)\s*
1801 (?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1802 (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s*
1803 ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1804 m = operator_rex.fullmatch(filter_spec)
1805 if m:
1806 try:
1807 comparison_value = int(m.group('value'))
1808 except ValueError:
1809 comparison_value = parse_filesize(m.group('value'))
1810 if comparison_value is None:
1811 comparison_value = parse_filesize(m.group('value') + 'B')
1812 if comparison_value is None:
1813 raise ValueError(
1814 'Invalid value %r in format specification %r' % (
1815 m.group('value'), filter_spec))
1816 op = OPERATORS[m.group('op')]
1817
1818 if not m:
1819 STR_OPERATORS = {
1820 '=': operator.eq,
1821 '^=': lambda attr, value: attr.startswith(value),
1822 '$=': lambda attr, value: attr.endswith(value),
1823 '*=': lambda attr, value: value in attr,
1824 '~=': lambda attr, value: value.search(attr) is not None
1825 }
1826 str_operator_rex = re.compile(r'''(?x)\s*
1827 (?P<key>[a-zA-Z0-9._-]+)\s*
1828 (?P<negation>!\s*)?(?P<op>%s)\s*(?P<none_inclusive>\?\s*)?
1829 (?P<quote>["'])?
1830 (?P<value>(?(quote)(?:(?!(?P=quote))[^\\]|\\.)+|[\w.-]+))
1831 (?(quote)(?P=quote))\s*
1832 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1833 m = str_operator_rex.fullmatch(filter_spec)
1834 if m:
1835 if m.group('op') == '~=':
1836 comparison_value = re.compile(m.group('value'))
1837 else:
1838 comparison_value = re.sub(r'''\\([\\"'])''', r'\1', m.group('value'))
1839 str_op = STR_OPERATORS[m.group('op')]
1840 if m.group('negation'):
1841 op = lambda attr, value: not str_op(attr, value)
1842 else:
1843 op = str_op
1844
1845 if not m:
1846 raise SyntaxError('Invalid filter specification %r' % filter_spec)
1847
1848 def _filter(f):
1849 actual_value = f.get(m.group('key'))
1850 if actual_value is None:
1851 return m.group('none_inclusive')
1852 return op(actual_value, comparison_value)
1853 return _filter
1854
1855 def _check_formats(self, formats):
1856 for f in formats:
1857 self.to_screen('[info] Testing format %s' % f['format_id'])
1858 path = self.get_output_path('temp')
1859 if not self._ensure_dir_exists(f'{path}/'):
1860 continue
1861 temp_file = tempfile.NamedTemporaryFile(suffix='.tmp', delete=False, dir=path or None)
1862 temp_file.close()
1863 try:
1864 success, _ = self.dl(temp_file.name, f, test=True)
1865 except (DownloadError, OSError, ValueError) + network_exceptions:
1866 success = False
1867 finally:
1868 if os.path.exists(temp_file.name):
1869 try:
1870 os.remove(temp_file.name)
1871 except OSError:
1872 self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
1873 if success:
1874 yield f
1875 else:
1876 self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])
1877
1878 def _default_format_spec(self, info_dict, download=True):
1879
1880 def can_merge():
1881 merger = FFmpegMergerPP(self)
1882 return merger.available and merger.can_merge()
1883
1884 prefer_best = (
1885 not self.params.get('simulate')
1886 and download
1887 and (
1888 not can_merge()
1889 or info_dict.get('is_live') and not self.params.get('live_from_start')
1890 or self.params['outtmpl']['default'] == '-'))
1891 compat = (
1892 prefer_best
1893 or self.params.get('allow_multiple_audio_streams', False)
1894 or 'format-spec' in self.params['compat_opts'])
1895
1896 return (
1897 'best/bestvideo+bestaudio' if prefer_best
1898 else 'bestvideo*+bestaudio/best' if not compat
1899 else 'bestvideo+bestaudio/best')
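# Illustrative only: for a regular download (merger available, not live,
# not to stdout, no compat options) this returns 'bestvideo*+bestaudio/best';
# when merging is not possible it falls back to 'best/bestvideo+bestaudio'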
1900
1901 def build_format_selector(self, format_spec):
1902 def syntax_error(note, start):
1903 message = (
1904 'Invalid format specification: '
1905 '{}\n\t{}\n\t{}^'.format(note, format_spec, ' ' * start[1]))
1906 return SyntaxError(message)
1907
1908 PICKFIRST = 'PICKFIRST'
1909 MERGE = 'MERGE'
1910 SINGLE = 'SINGLE'
1911 GROUP = 'GROUP'
1912 FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
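# Illustrative only: a spec such as 'bv[height<=1080]+ba/b' is parsed into
# nested FormatSelector tuples, roughly
#   PICKFIRST((MERGE((SINGLE 'bv' {height<=1080}, SINGLE 'ba')), SINGLE 'b'))
# which _build_selector_function below turns into a generator over formats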
1913
1914 allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False),
1915 'video': self.params.get('allow_multiple_video_streams', False)}
1916
1917 check_formats = self.params.get('check_formats') == 'selected'
1918
1919 def _parse_filter(tokens):
1920 filter_parts = []
1921 for type, string, start, _, _ in tokens:
1922 if type == tokenize.OP and string == ']':
1923 return ''.join(filter_parts)
1924 else:
1925 filter_parts.append(string)
1926
1927 def _remove_unused_ops(tokens):
1928 # Remove operators that we don't use and join them with the surrounding strings
1929 # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1930 ALLOWED_OPS = ('/', '+', ',', '(', ')')
1931 last_string, last_start, last_end, last_line = None, None, None, None
1932 for type, string, start, end, line in tokens:
1933 if type == tokenize.OP and string == '[':
1934 if last_string:
1935 yield tokenize.NAME, last_string, last_start, last_end, last_line
1936 last_string = None
1937 yield type, string, start, end, line
1938 # everything inside brackets will be handled by _parse_filter
1939 for type, string, start, end, line in tokens:
1940 yield type, string, start, end, line
1941 if type == tokenize.OP and string == ']':
1942 break
1943 elif type == tokenize.OP and string in ALLOWED_OPS:
1944 if last_string:
1945 yield tokenize.NAME, last_string, last_start, last_end, last_line
1946 last_string = None
1947 yield type, string, start, end, line
1948 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1949 if not last_string:
1950 last_string = string
1951 last_start = start
1952 last_end = end
1953 else:
1954 last_string += string
1955 if last_string:
1956 yield tokenize.NAME, last_string, last_start, last_end, last_line
1957
1958 def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1959 selectors = []
1960 current_selector = None
1961 for type, string, start, _, _ in tokens:
1962 # tokenize emits ENCODING as the very first token; skip it
1963 if type == getattr(tokenize, 'ENCODING', None):
1964 continue
1965 elif type in [tokenize.NAME, tokenize.NUMBER]:
1966 current_selector = FormatSelector(SINGLE, string, [])
1967 elif type == tokenize.OP:
1968 if string == ')':
1969 if not inside_group:
1970 # ')' will be handled by the parentheses group
1971 tokens.restore_last_token()
1972 break
1973 elif inside_merge and string in ['/', ',']:
1974 tokens.restore_last_token()
1975 break
1976 elif inside_choice and string == ',':
1977 tokens.restore_last_token()
1978 break
1979 elif string == ',':
1980 if not current_selector:
1981 raise syntax_error('"," must follow a format selector', start)
1982 selectors.append(current_selector)
1983 current_selector = None
1984 elif string == '/':
1985 if not current_selector:
1986 raise syntax_error('"/" must follow a format selector', start)
1987 first_choice = current_selector
1988 second_choice = _parse_format_selection(tokens, inside_choice=True)
1989 current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1990 elif string == '[':
1991 if not current_selector:
1992 current_selector = FormatSelector(SINGLE, 'best', [])
1993 format_filter = _parse_filter(tokens)
1994 current_selector.filters.append(format_filter)
1995 elif string == '(':
1996 if current_selector:
1997 raise syntax_error('Unexpected "("', start)
1998 group = _parse_format_selection(tokens, inside_group=True)
1999 current_selector = FormatSelector(GROUP, group, [])
2000 elif string == '+':
2001 if not current_selector:
2002 raise syntax_error('Unexpected "+"', start)
2003 selector_1 = current_selector
2004 selector_2 = _parse_format_selection(tokens, inside_merge=True)
2005 if not selector_2:
2006 raise syntax_error('Expected a selector', start)
2007 current_selector = FormatSelector(MERGE, (selector_1, selector_2), [])
2008 else:
2009 raise syntax_error(f'Operator not recognized: "{string}"', start)
2010 elif type == tokenize.ENDMARKER:
2011 break
2012 if current_selector:
2013 selectors.append(current_selector)
2014 return selectors
2015
2016 def _merge(formats_pair):
2017 format_1, format_2 = formats_pair
2018
2019 formats_info = []
2020 formats_info.extend(format_1.get('requested_formats', (format_1,)))
2021 formats_info.extend(format_2.get('requested_formats', (format_2,)))
2022
2023 if not allow_multiple_streams['video'] or not allow_multiple_streams['audio']:
2024 get_no_more = {'video': False, 'audio': False}
2025 for (i, fmt_info) in enumerate(formats_info):
2026 if fmt_info.get('acodec') == fmt_info.get('vcodec') == 'none':
2027 formats_info.pop(i)
2028 continue
2029 for aud_vid in ['audio', 'video']:
2030 if not allow_multiple_streams[aud_vid] and fmt_info.get(aud_vid[0] + 'codec') != 'none':
2031 if get_no_more[aud_vid]:
2032 formats_info.pop(i)
2033 break
2034 get_no_more[aud_vid] = True
2035
2036 if len(formats_info) == 1:
2037 return formats_info[0]
2038
2039 video_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('vcodec') != 'none']
2040 audio_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('acodec') != 'none']
2041
2042 the_only_video = video_fmts[0] if len(video_fmts) == 1 else None
2043 the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None
2044
2045 output_ext = self.params.get('merge_output_format')
2046 if not output_ext:
2047 if the_only_video:
2048 output_ext = the_only_video['ext']
2049 elif the_only_audio and not video_fmts:
2050 output_ext = the_only_audio['ext']
2051 else:
2052 output_ext = 'mkv'
2053
2054 filtered = lambda *keys: filter(None, (traverse_obj(fmt, *keys) for fmt in formats_info))
2055
2056 new_dict = {
2057 'requested_formats': formats_info,
2058 'format': '+'.join(filtered('format')),
2059 'format_id': '+'.join(filtered('format_id')),
2060 'ext': output_ext,
2061 'protocol': '+'.join(map(determine_protocol, formats_info)),
2062 'language': '+'.join(orderedSet(filtered('language'))) or None,
2063 'format_note': '+'.join(orderedSet(filtered('format_note'))) or None,
2064 'filesize_approx': sum(filtered('filesize', 'filesize_approx')) or None,
2065 'tbr': sum(filtered('tbr', 'vbr', 'abr')),
2066 }
2067
2068 if the_only_video:
2069 new_dict.update({
2070 'width': the_only_video.get('width'),
2071 'height': the_only_video.get('height'),
2072 'resolution': the_only_video.get('resolution') or self.format_resolution(the_only_video),
2073 'fps': the_only_video.get('fps'),
2074 'dynamic_range': the_only_video.get('dynamic_range'),
2075 'vcodec': the_only_video.get('vcodec'),
2076 'vbr': the_only_video.get('vbr'),
2077 'stretched_ratio': the_only_video.get('stretched_ratio'),
2078 })
2079
2080 if the_only_audio:
2081 new_dict.update({
2082 'acodec': the_only_audio.get('acodec'),
2083 'abr': the_only_audio.get('abr'),
2084 'asr': the_only_audio.get('asr'),
2085 })
2086
2087 return new_dict
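# Illustrative only: merging a video-only with an audio-only format yields a
# synthetic entry along the lines of
#   {'format_id': '137+140', 'ext': 'mp4', 'protocol': 'https+https',
#    'requested_formats': [<video fmt>, <audio fmt>], ...}
# which the FFmpeg merger postprocessor later muxes into a single file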
2088
2089 def _check_formats(formats):
2090 if not check_formats:
2091 yield from formats
2092 return
2093 yield from self._check_formats(formats)
2094
2095 def _build_selector_function(selector):
2096 if isinstance(selector, list): # ,
2097 fs = [_build_selector_function(s) for s in selector]
2098
2099 def selector_function(ctx):
2100 for f in fs:
2101 yield from f(ctx)
2102 return selector_function
2103
2104 elif selector.type == GROUP: # ()
2105 selector_function = _build_selector_function(selector.selector)
2106
2107 elif selector.type == PICKFIRST: # /
2108 fs = [_build_selector_function(s) for s in selector.selector]
2109
2110 def selector_function(ctx):
2111 for f in fs:
2112 picked_formats = list(f(ctx))
2113 if picked_formats:
2114 return picked_formats
2115 return []
2116
2117 elif selector.type == MERGE: # +
2118 selector_1, selector_2 = map(_build_selector_function, selector.selector)
2119
2120 def selector_function(ctx):
2121 for pair in itertools.product(selector_1(ctx), selector_2(ctx)):
2122 yield _merge(pair)
2123
2124 elif selector.type == SINGLE: # atom
2125 format_spec = selector.selector or 'best'
2126
2127 # TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector
2128 if format_spec == 'all':
2129 def selector_function(ctx):
2130 yield from _check_formats(ctx['formats'][::-1])
2131 elif format_spec == 'mergeall':
2132 def selector_function(ctx):
2133 formats = list(_check_formats(
2134 f for f in ctx['formats'] if f.get('vcodec') != 'none' or f.get('acodec') != 'none'))
2135 if not formats:
2136 return
2137 merged_format = formats[-1]
2138 for f in formats[-2::-1]:
2139 merged_format = _merge((merged_format, f))
2140 yield merged_format
2141
2142 else:
2143 format_fallback, separate_fallback, format_reverse, format_idx = False, None, True, 1
2144 mobj = re.match(
2145 r'(?P<bw>best|worst|b|w)(?P<type>video|audio|v|a)?(?P<mod>\*)?(?:\.(?P<n>[1-9]\d*))?$',
2146 format_spec)
2147 if mobj is not None:
2148 format_idx = int_or_none(mobj.group('n'), default=1)
2149 format_reverse = mobj.group('bw')[0] == 'b'
2150 format_type = (mobj.group('type') or [None])[0]
2151 not_format_type = {'v': 'a', 'a': 'v'}.get(format_type)
2152 format_modified = mobj.group('mod') is not None
2153
2154 format_fallback = not format_type and not format_modified # for b, w
2155 _filter_f = (
2156 (lambda f: f.get('%scodec' % format_type) != 'none')
2157 if format_type and format_modified # bv*, ba*, wv*, wa*
2158 else (lambda f: f.get('%scodec' % not_format_type) == 'none')
2159 if format_type # bv, ba, wv, wa
2160 else (lambda f: f.get('vcodec') != 'none' and f.get('acodec') != 'none')
2161 if not format_modified # b, w
2162 else lambda f: True) # b*, w*
2163 filter_f = lambda f: _filter_f(f) and (
2164 f.get('vcodec') != 'none' or f.get('acodec') != 'none')
2165 else:
2166 if format_spec in self._format_selection_exts['audio']:
2167 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none'
2168 elif format_spec in self._format_selection_exts['video']:
2169 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none' and f.get('vcodec') != 'none'
2170 separate_fallback = lambda f: f.get('ext') == format_spec and f.get('vcodec') != 'none'
2171 elif format_spec in self._format_selection_exts['storyboards']:
2172 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') == 'none' and f.get('vcodec') == 'none'
2173 else:
2174 filter_f = lambda f: f.get('format_id') == format_spec # id
2175
2176 def selector_function(ctx):
2177 formats = list(ctx['formats'])
2178 matches = list(filter(filter_f, formats)) if filter_f is not None else formats
2179 if not matches:
2180 if format_fallback and ctx['incomplete_formats']:
2181 # for extractors with incomplete formats (audio-only (soundcloud)
2182 # or video-only (imgur)), best/worst will fall back to the
2183 # best/worst {video,audio}-only format
2184 matches = formats
2185 elif separate_fallback and not ctx['has_merged_format']:
2186 # for compatibility with youtube-dl when there is no pre-merged format
2187 matches = list(filter(separate_fallback, formats))
2188 matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1]))
2189 try:
2190 yield matches[format_idx - 1]
2191 except LazyList.IndexError:
2192 return
2193
2194 filters = [self._build_format_filter(f) for f in selector.filters]
2195
2196 def final_selector(ctx):
2197 ctx_copy = dict(ctx)
2198 for _filter in filters:
2199 ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
2200 return selector_function(ctx_copy)
2201 return final_selector
2202
2203 stream = io.BytesIO(format_spec.encode())
2204 try:
2205 tokens = list(_remove_unused_ops(tokenize.tokenize(stream.readline)))
2206 except tokenize.TokenError:
2207 raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
2208
2209 class TokenIterator:
2210 def __init__(self, tokens):
2211 self.tokens = tokens
2212 self.counter = 0
2213
2214 def __iter__(self):
2215 return self
2216
2217 def __next__(self):
2218 if self.counter >= len(self.tokens):
2219 raise StopIteration()
2220 value = self.tokens[self.counter]
2221 self.counter += 1
2222 return value
2223
2224 next = __next__
2225
2226 def restore_last_token(self):
2227 self.counter -= 1
2228
2229 parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
2230 return _build_selector_function(parsed_selector)
2231
2232 def _calc_headers(self, info_dict):
2233 res = merge_headers(self.params['http_headers'], info_dict.get('http_headers') or {})
2234
2235 cookies = self._calc_cookies(info_dict['url'])
2236 if cookies:
2237 res['Cookie'] = cookies
2238
2239 if 'X-Forwarded-For' not in res:
2240 x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
2241 if x_forwarded_for_ip:
2242 res['X-Forwarded-For'] = x_forwarded_for_ip
2243
2244 return res
2245
2246 def _calc_cookies(self, url):
2247 pr = sanitized_Request(url)
2248 self.cookiejar.add_cookie_header(pr)
2249 return pr.get_header('Cookie')
2250
2251 def _sort_thumbnails(self, thumbnails):
2252 thumbnails.sort(key=lambda t: (
2253 t.get('preference') if t.get('preference') is not None else -1,
2254 t.get('width') if t.get('width') is not None else -1,
2255 t.get('height') if t.get('height') is not None else -1,
2256 t.get('id') if t.get('id') is not None else '',
2257 t.get('url')))
2258
2259 def _sanitize_thumbnails(self, info_dict):
2260 thumbnails = info_dict.get('thumbnails')
2261 if thumbnails is None:
2262 thumbnail = info_dict.get('thumbnail')
2263 if thumbnail:
2264 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
2265 if not thumbnails:
2266 return
2267
2268 def check_thumbnails(thumbnails):
2269 for t in thumbnails:
2270 self.to_screen(f'[info] Testing thumbnail {t["id"]}')
2271 try:
2272 self.urlopen(HEADRequest(t['url']))
2273 except network_exceptions as err:
2274 self.to_screen(f'[info] Unable to connect to thumbnail {t["id"]} URL {t["url"]!r} - {err}. Skipping...')
2275 continue
2276 yield t
2277
2278 self._sort_thumbnails(thumbnails)
2279 for i, t in enumerate(thumbnails):
2280 if t.get('id') is None:
2281 t['id'] = '%d' % i
2282 if t.get('width') and t.get('height'):
2283 t['resolution'] = '%dx%d' % (t['width'], t['height'])
2284 t['url'] = sanitize_url(t['url'])
2285
2286 if self.params.get('check_formats') is True:
2287 info_dict['thumbnails'] = LazyList(check_thumbnails(thumbnails[::-1]), reverse=True)
2288 else:
2289 info_dict['thumbnails'] = thumbnails
2290
2291 def _fill_common_fields(self, info_dict, is_video=True):
2292 # TODO: move sanitization here
2293 if is_video:
2294 # playlists are allowed to lack "title"
2295 title = info_dict.get('title', NO_DEFAULT)
2296 if title is NO_DEFAULT:
2297 raise ExtractorError('Missing "title" field in extractor result',
2298 video_id=info_dict['id'], ie=info_dict['extractor'])
2299 info_dict['fulltitle'] = title
2300 if not title:
2301 if title == '':
2302 self.write_debug('Extractor gave empty title. Creating a generic title')
2303 else:
2304 self.report_warning('Extractor failed to obtain "title". Creating a generic title instead')
2305 info_dict['title'] = f'{info_dict["extractor"].replace(":", "-")} video #{info_dict["id"]}'
2306
2307 if info_dict.get('duration') is not None:
2308 info_dict['duration_string'] = formatSeconds(info_dict['duration'])
2309
2310 for ts_key, date_key in (
2311 ('timestamp', 'upload_date'),
2312 ('release_timestamp', 'release_date'),
2313 ('modified_timestamp', 'modified_date'),
2314 ):
2315 if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
2316 # Working around out-of-range timestamp values (e.g. negative ones on Windows,
2317 # see http://bugs.python.org/issue1646728)
2318 with contextlib.suppress(ValueError, OverflowError, OSError):
2319 upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
2320 info_dict[date_key] = upload_date.strftime('%Y%m%d')
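# Illustrative only: timestamp 1658188800 yields upload_date '20220719';
# dates are derived in UTC, so they may differ from the uploader's local date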
2321
2322 live_keys = ('is_live', 'was_live')
2323 live_status = info_dict.get('live_status')
2324 if live_status is None:
2325 for key in live_keys:
2326 if info_dict.get(key) is False:
2327 continue
2328 if info_dict.get(key):
2329 live_status = key
2330 break
2331 if all(info_dict.get(key) is False for key in live_keys):
2332 live_status = 'not_live'
2333 if live_status:
2334 info_dict['live_status'] = live_status
2335 for key in live_keys:
2336 if info_dict.get(key) is None:
2337 info_dict[key] = (live_status == key)
2338
2339 # Auto generate title fields corresponding to the *_number fields when missing
2340 # in order to always have clean titles. This is very common for TV series.
2341 for field in ('chapter', 'season', 'episode'):
2342 if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
2343 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
2344
2345 def _raise_pending_errors(self, info):
2346 err = info.pop('__pending_error', None)
2347 if err:
2348 self.report_error(err, tb=False)
2349
2350 def process_video_result(self, info_dict, download=True):
2351 assert info_dict.get('_type', 'video') == 'video'
2352 self._num_videos += 1
2353
2354 if 'id' not in info_dict:
2355 raise ExtractorError('Missing "id" field in extractor result', ie=info_dict['extractor'])
2356 elif not info_dict.get('id'):
2357 raise ExtractorError('Extractor failed to obtain "id"', ie=info_dict['extractor'])
2358
2359 def report_force_conversion(field, field_not, conversion):
2360 self.report_warning(
2361 '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
2362 % (field, field_not, conversion))
2363
2364 def sanitize_string_field(info, string_field):
2365 field = info.get(string_field)
2366 if field is None or isinstance(field, str):
2367 return
2368 report_force_conversion(string_field, 'a string', 'string')
2369 info[string_field] = str(field)
2370
2371 def sanitize_numeric_fields(info):
2372 for numeric_field in self._NUMERIC_FIELDS:
2373 field = info.get(numeric_field)
2374 if field is None or isinstance(field, (int, float)):
2375 continue
2376 report_force_conversion(numeric_field, 'numeric', 'int')
2377 info[numeric_field] = int_or_none(field)
2378
2379 sanitize_string_field(info_dict, 'id')
2380 sanitize_numeric_fields(info_dict)
2381 if info_dict.get('section_end') and info_dict.get('section_start') is not None:
2382 info_dict['duration'] = round(info_dict['section_end'] - info_dict['section_start'], 3)
2383 if (info_dict.get('duration') or 0) <= 0 and info_dict.pop('duration', None):
2384 self.report_warning('"duration" field is negative, there is an error in extractor')
2385
2386 chapters = info_dict.get('chapters') or []
2387 if chapters and chapters[0].get('start_time'):
2388 chapters.insert(0, {'start_time': 0})
2389
2390 dummy_chapter = {'end_time': 0, 'start_time': info_dict.get('duration')}
2391 for idx, (prev, current, next_) in enumerate(zip(
2392 (dummy_chapter, *chapters), chapters, (*chapters[1:], dummy_chapter)), 1):
2393 if current.get('start_time') is None:
2394 current['start_time'] = prev.get('end_time')
2395 if not current.get('end_time'):
2396 current['end_time'] = next_.get('start_time')
2397 if not current.get('title'):
2398 current['title'] = f'<Untitled Chapter {idx}>'
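# Illustrative only: with duration 100 and chapters
#   [{'start_time': 10}, {'start_time': 40, 'title': 'B'}]
# the pass above prepends {'start_time': 0} and fills the gaps, giving
#   0-10 '<Untitled Chapter 1>', 10-40 '<Untitled Chapter 2>', 40-100 'B'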
2399
2400 if 'playlist' not in info_dict:
2401 # It isn't part of a playlist
2402 info_dict['playlist'] = None
2403 info_dict['playlist_index'] = None
2404
2405 self._sanitize_thumbnails(info_dict)
2406
2407 thumbnail = info_dict.get('thumbnail')
2408 thumbnails = info_dict.get('thumbnails')
2409 if thumbnail:
2410 info_dict['thumbnail'] = sanitize_url(thumbnail)
2411 elif thumbnails:
2412 info_dict['thumbnail'] = thumbnails[-1]['url']
2413
2414 if info_dict.get('display_id') is None and 'id' in info_dict:
2415 info_dict['display_id'] = info_dict['id']
2416
2417 self._fill_common_fields(info_dict)
2418
2419 for cc_kind in ('subtitles', 'automatic_captions'):
2420 cc = info_dict.get(cc_kind)
2421 if cc:
2422 for _, subtitle in cc.items():
2423 for subtitle_format in subtitle:
2424 if subtitle_format.get('url'):
2425 subtitle_format['url'] = sanitize_url(subtitle_format['url'])
2426 if subtitle_format.get('ext') is None:
2427 subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
2428
2429 automatic_captions = info_dict.get('automatic_captions')
2430 subtitles = info_dict.get('subtitles')
2431
2432 info_dict['requested_subtitles'] = self.process_subtitles(
2433 info_dict['id'], subtitles, automatic_captions)
2434
2435 if info_dict.get('formats') is None:
2436 # There's only one format available
2437 formats = [info_dict]
2438 else:
2439 formats = info_dict['formats']
2440
2441 # or None ensures --clean-infojson removes it
2442 info_dict['_has_drm'] = any(f.get('has_drm') for f in formats) or None
2443 if not self.params.get('allow_unplayable_formats'):
2444 formats = [f for f in formats if not f.get('has_drm')]
2445 if info_dict['_has_drm'] and all(
2446 f.get('acodec') == f.get('vcodec') == 'none' for f in formats):
2447 self.report_warning(
2448 'This video is DRM protected and only images are available for download. '
2449 'Use --list-formats to see them')
2450
2451 get_from_start = not info_dict.get('is_live') or bool(self.params.get('live_from_start'))
2452 if not get_from_start:
2453 info_dict['title'] += ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
2454 if info_dict.get('is_live') and formats:
2455 formats = [f for f in formats if bool(f.get('is_from_start')) == get_from_start]
2456 if get_from_start and not formats:
2457 self.raise_no_formats(info_dict, msg=(
2458 '--live-from-start is passed, but there are no formats that can be downloaded from the start. '
2459 'If you want to download from the current time, use --no-live-from-start'))
2460
2461 if not formats:
2462 self.raise_no_formats(info_dict)
2463
2464 def is_wellformed(f):
2465 url = f.get('url')
2466 if not url:
2467 self.report_warning(
2468 '"url" field is missing or empty - skipping format, '
2469 'there is an error in the extractor')
2470 return False
2471 if isinstance(url, bytes):
2472 sanitize_string_field(f, 'url')
2473 return True
2474
2475 # Filter out malformed formats for better extraction robustness
2476 formats = list(filter(is_wellformed, formats))
2477
2478 formats_dict = {}
2479
2480 # We check that all the formats have the format and format_id fields
2481 for i, format in enumerate(formats):
2482 sanitize_string_field(format, 'format_id')
2483 sanitize_numeric_fields(format)
2484 format['url'] = sanitize_url(format['url'])
2485 if not format.get('format_id'):
2486 format['format_id'] = str(i)
2487 else:
2488 # Sanitize format_id from characters used in format selector expression
2489 format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
2490 format_id = format['format_id']
2491 if format_id not in formats_dict:
2492 formats_dict[format_id] = []
2493 formats_dict[format_id].append(format)
2494
2495 # Make sure all formats have unique format_id
2496 common_exts = set(itertools.chain(*self._format_selection_exts.values()))
2497 for format_id, ambiguous_formats in formats_dict.items():
2498 ambiguous_id = len(ambiguous_formats) > 1
2499 for i, format in enumerate(ambiguous_formats):
2500 if ambiguous_id:
2501 format['format_id'] = '%s-%d' % (format_id, i)
2502 if format.get('ext') is None:
2503 format['ext'] = determine_ext(format['url']).lower()
2504 # Ensure there is no conflict between id and ext in format selection
2505 # See https://github.com/yt-dlp/yt-dlp/issues/1282
2506 if format['format_id'] != format['ext'] and format['format_id'] in common_exts:
2507 format['format_id'] = 'f%s' % format['format_id']
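# Illustrative only: 'hls,720p' is sanitized to 'hls_720p'; duplicated ids
# become 'hls_720p-0', 'hls_720p-1', ...; and an id that collides with a
# common extension (while differing from the format's own ext) is prefixed
# with 'f', e.g. 'mp4' -> 'fmp4'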
2508
2509 for i, format in enumerate(formats):
2510 if format.get('format') is None:
2511 format['format'] = '{id} - {res}{note}'.format(
2512 id=format['format_id'],
2513 res=self.format_resolution(format),
2514 note=format_field(format, 'format_note', ' (%s)'),
2515 )
2516 if format.get('protocol') is None:
2517 format['protocol'] = determine_protocol(format)
2518 if format.get('resolution') is None:
2519 format['resolution'] = self.format_resolution(format, default=None)
2520 if format.get('dynamic_range') is None and format.get('vcodec') != 'none':
2521 format['dynamic_range'] = 'SDR'
2522 if (info_dict.get('duration') and format.get('tbr')
2523 and not format.get('filesize') and not format.get('filesize_approx')):
2524 format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8))
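# e.g. (illustrative) 60 s at tbr 1000: 60 * 1000 * (1024 / 8) = 7,680,000 bytes (~7.3 MiB)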
2525
2526 # Add HTTP headers, so that external programs can use them from the
2527 # json output
2528 full_format_info = info_dict.copy()
2529 full_format_info.update(format)
2530 format['http_headers'] = self._calc_headers(full_format_info)
2531 # Remove private housekeeping stuff
2532 if '__x_forwarded_for_ip' in info_dict:
2533 del info_dict['__x_forwarded_for_ip']
2534
2535 if self.params.get('check_formats') is True:
2536 formats = LazyList(self._check_formats(formats[::-1]), reverse=True)
2537
2538 if not formats or formats[0] is not info_dict:
2539 # only set the 'formats' field if the original info_dict lists it;
2540 # otherwise we would end up with a circular reference: the first (and only)
2541 # element in the 'formats' field of info_dict would be info_dict itself,
2542 # which can't be exported to json
2543 info_dict['formats'] = formats
2544
2545 info_dict, _ = self.pre_process(info_dict)
2546
2547 if self._match_entry(info_dict, incomplete=self._format_fields) is not None:
2548 return info_dict
2549
2550 self.post_extract(info_dict)
2551 info_dict, _ = self.pre_process(info_dict, 'after_filter')
2552
2553 # The pre-processors may have modified the formats
2554 formats = info_dict.get('formats', [info_dict])
2555
2556 list_only = self.params.get('simulate') is None and (
2557 self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles'))
2558 interactive_format_selection = not list_only and self.format_selector == '-'
2559 if self.params.get('list_thumbnails'):
2560 self.list_thumbnails(info_dict)
2561 if self.params.get('listsubtitles'):
2562 if 'automatic_captions' in info_dict:
2563 self.list_subtitles(
2564 info_dict['id'], automatic_captions, 'automatic captions')
2565 self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
2566 if self.params.get('listformats') or interactive_format_selection:
2567 self.list_formats(info_dict)
2568 if list_only:
2569 # Without this printing, -F --print-json will not work
2570 self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True)
2571 return info_dict
2572
2573 format_selector = self.format_selector
2574 if format_selector is None:
2575 req_format = self._default_format_spec(info_dict, download=download)
2576 self.write_debug('Default format spec: %s' % req_format)
2577 format_selector = self.build_format_selector(req_format)
2578
2579 while True:
2580 if interactive_format_selection:
2581 req_format = input(
2582 self._format_screen('\nEnter format selector: ', self.Styles.EMPHASIS))
2583 try:
2584 format_selector = self.build_format_selector(req_format)
2585 except SyntaxError as err:
2586 self.report_error(err, tb=False, is_error=False)
2587 continue
2588
2589 formats_to_download = list(format_selector({
2590 'formats': formats,
2591 'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
2592 'incomplete_formats': (
2593 # All formats are video-only or
2594 all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
2595 # all formats are audio-only
2596 or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)),
2597 }))
2598 if interactive_format_selection and not formats_to_download:
2599 self.report_error('Requested format is not available', tb=False, is_error=False)
2600 continue
2601 break
2602
2603 if not formats_to_download:
2604 if not self.params.get('ignore_no_formats_error'):
2605 raise ExtractorError(
2606 'Requested format is not available. Use --list-formats for a list of available formats',
2607 expected=True, video_id=info_dict['id'], ie=info_dict['extractor'])
2608 self.report_warning('Requested format is not available')
2609 # Process what we can, even without any available formats.
2610 formats_to_download = [{}]
2611
2612 requested_ranges = self.params.get('download_ranges')
2613 if requested_ranges:
2614 requested_ranges = tuple(requested_ranges(info_dict, self))
2615
2616 best_format, downloaded_formats = formats_to_download[-1], []
2617 if download:
2618 if best_format:
2619 def to_screen(*msg):
2620 self.to_screen(f'[info] {info_dict["id"]}: {" ".join(", ".join(variadic(m)) for m in msg)}')
2621
2622 to_screen(f'Downloading {len(formats_to_download)} format(s):',
2623 (f['format_id'] for f in formats_to_download))
2624 if requested_ranges:
2625 to_screen(f'Downloading {len(requested_ranges)} time ranges:',
2626 (f'{int(c["start_time"])}-{int(c["end_time"])}' for c in requested_ranges))
2627 max_downloads_reached = False
2628
2629 for fmt, chapter in itertools.product(formats_to_download, requested_ranges or [{}]):
2630 new_info = self._copy_infodict(info_dict)
2631 new_info.update(fmt)
2632 offset, duration = info_dict.get('section_start') or 0, info_dict.get('duration') or float('inf')
2633 if chapter or offset:
2634 new_info.update({
2635 'section_start': offset + chapter.get('start_time', 0),
2636 'section_end': offset + min(chapter.get('end_time', duration), duration),
2637 'section_title': chapter.get('title'),
2638 'section_number': chapter.get('index'),
2639 })
2640 downloaded_formats.append(new_info)
2641 try:
2642 self.process_info(new_info)
2643 except MaxDownloadsReached:
2644 max_downloads_reached = True
2645 self._raise_pending_errors(new_info)
2646 # Remove copied info
2647 for key, val in tuple(new_info.items()):
2648 if info_dict.get(key) == val:
2649 new_info.pop(key)
2650 if max_downloads_reached:
2651 break
2652
2653 write_archive = {f.get('__write_download_archive', False) for f in downloaded_formats}
2654 assert write_archive.issubset({True, False, 'ignore'})
2655 if True in write_archive and False not in write_archive:
2656 self.record_download_archive(info_dict)
2657
2658 info_dict['requested_downloads'] = downloaded_formats
2659 info_dict = self.run_all_pps('after_video', info_dict)
2660 if max_downloads_reached:
2661 raise MaxDownloadsReached()
2662
2663 # We update the info dict with the selected best quality format (backwards compatibility)
2664 info_dict.update(best_format)
2665 return info_dict
2666
2667 def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
2668 """Select the requested subtitles and their format"""
2669 available_subs, normal_sub_langs = {}, []
2670 if normal_subtitles and self.params.get('writesubtitles'):
2671 available_subs.update(normal_subtitles)
2672 normal_sub_langs = tuple(normal_subtitles.keys())
2673 if automatic_captions and self.params.get('writeautomaticsub'):
2674 for lang, cap_info in automatic_captions.items():
2675 if lang not in available_subs:
2676 available_subs[lang] = cap_info
2677
2678 if ((not self.params.get('writesubtitles')
2679 and not self.params.get('writeautomaticsub'))
2680 or not available_subs):
2681 return None
2682
2683 all_sub_langs = tuple(available_subs.keys())
2684 if self.params.get('allsubtitles', False):
2685 requested_langs = all_sub_langs
2686 elif self.params.get('subtitleslangs', False):
2687 # A list is used so that the order of languages will be the same as
2688 # given in subtitleslangs. See https://github.com/yt-dlp/yt-dlp/issues/1041
2689 requested_langs = []
2690 for lang_re in self.params.get('subtitleslangs'):
2691 discard = lang_re[0] == '-'
2692 if discard:
2693 lang_re = lang_re[1:]
2694 if lang_re == 'all':
2695 if discard:
2696 requested_langs = []
2697 else:
2698 requested_langs.extend(all_sub_langs)
2699 continue
2700 current_langs = filter(re.compile(lang_re + '$').match, all_sub_langs)
2701 if discard:
2702 for lang in current_langs:
2703 while lang in requested_langs:
2704 requested_langs.remove(lang)
2705 else:
2706 requested_langs.extend(current_langs)
2707 requested_langs = orderedSet(requested_langs)
2708 elif normal_sub_langs:
2709 requested_langs = ['en'] if 'en' in normal_sub_langs else normal_sub_langs[:1]
2710 else:
2711 requested_langs = ['en'] if 'en' in all_sub_langs else all_sub_langs[:1]
2712 if requested_langs:
2713 self.write_debug('Downloading subtitles: %s' % ', '.join(requested_langs))
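# Illustrative only: subtitleslangs ['en.*', '-en-GB'] first collects every
# available language matching en.* and then discards en-GB from the selection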
2714
2715 formats_query = self.params.get('subtitlesformat', 'best')
2716 formats_preference = formats_query.split('/') if formats_query else []
2717 subs = {}
2718 for lang in requested_langs:
2719 formats = available_subs.get(lang)
2720 if formats is None:
2721 self.report_warning(f'{lang} subtitles not available for {video_id}')
2722 continue
2723 for ext in formats_preference:
2724 if ext == 'best':
2725 f = formats[-1]
2726 break
2727 matches = list(filter(lambda f: f['ext'] == ext, formats))
2728 if matches:
2729 f = matches[-1]
2730 break
2731 else:
2732 f = formats[-1]
2733 self.report_warning(
2734 'No subtitle format found matching "%s" for language %s, '
2735 'using %s' % (formats_query, lang, f['ext']))
2736 subs[lang] = f
2737 return subs
2738
2739 def _forceprint(self, key, info_dict):
2740 if info_dict is None:
2741 return
2742 info_copy = info_dict.copy()
2743 info_copy['formats_table'] = self.render_formats_table(info_dict)
2744 info_copy['thumbnails_table'] = self.render_thumbnails_table(info_dict)
2745 info_copy['subtitles_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('subtitles'))
2746 info_copy['automatic_captions_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('automatic_captions'))
2747
2748 def format_tmpl(tmpl):
2749 mobj = re.match(r'\w+(=?)$', tmpl)
2750 if mobj and mobj.group(1):
2751 return f'{tmpl[:-1]} = %({tmpl[:-1]})r'
2752 elif mobj:
2753 return f'%({tmpl})s'
2754 return tmpl
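# Illustrative only: 'title' -> '%(title)s', 'title=' -> 'title = %(title)r';
# anything else is passed through unchanged as an output template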
2755
2756 for tmpl in self.params['forceprint'].get(key, []):
2757 self.to_stdout(self.evaluate_outtmpl(format_tmpl(tmpl), info_copy))
2758
2759 for tmpl, file_tmpl in self.params['print_to_file'].get(key, []):
2760 filename = self.prepare_filename(info_dict, outtmpl=file_tmpl)
2761 tmpl = format_tmpl(tmpl)
2762 self.to_screen(f'[info] Writing {tmpl!r} to: {filename}')
2763 if self._ensure_dir_exists(filename):
2764 with open(filename, 'a', encoding='utf-8') as f:
2765 f.write(self.evaluate_outtmpl(tmpl, info_copy) + '\n')
2766
2767 def __forced_printings(self, info_dict, filename, incomplete):
2768 def print_mandatory(field, actual_field=None):
2769 if actual_field is None:
2770 actual_field = field
2771 if (self.params.get('force%s' % field, False)
2772 and (not incomplete or info_dict.get(actual_field) is not None)):
2773 self.to_stdout(info_dict[actual_field])
2774
2775 def print_optional(field):
2776 if (self.params.get('force%s' % field, False)
2777 and info_dict.get(field) is not None):
2778 self.to_stdout(info_dict[field])
2779
2780 info_dict = info_dict.copy()
2781 if filename is not None:
2782 info_dict['filename'] = filename
2783 if info_dict.get('requested_formats') is not None:
2784 # For RTMP URLs, also include the playpath
2785 info_dict['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats'])
2786 elif info_dict.get('url'):
2787 info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '')
2788
2789 if (self.params.get('forcejson')
2790 or self.params['forceprint'].get('video')
2791 or self.params['print_to_file'].get('video')):
2792 self.post_extract(info_dict)
2793 self._forceprint('video', info_dict)
2794
2795 print_mandatory('title')
2796 print_mandatory('id')
2797 print_mandatory('url', 'urls')
2798 print_optional('thumbnail')
2799 print_optional('description')
2800 print_optional('filename')
2801 if self.params.get('forceduration') and info_dict.get('duration') is not None:
2802 self.to_stdout(formatSeconds(info_dict['duration']))
2803 print_mandatory('format')
2804
2805 if self.params.get('forcejson'):
2806 self.to_stdout(json.dumps(self.sanitize_info(info_dict)))
2807
2808 def dl(self, name, info, subtitle=False, test=False):
2809 if not info.get('url'):
2810 self.raise_no_formats(info, True)
2811
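# In test mode, only a small initial chunk is downloaded to probe the format;
# progress output, .part files and .ytdl resume files are all suppressed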
2812 if test:
2813 verbose = self.params.get('verbose')
2814 params = {
2815 'test': True,
2816 'quiet': self.params.get('quiet') or not verbose,
2817 'verbose': verbose,
2818 'noprogress': not verbose,
2819 'nopart': True,
2820 'skip_unavailable_fragments': False,
2821 'keep_fragments': False,
2822 'overwrites': True,
2823 '_no_ytdl_file': True,
2824 }
2825 else:
2826 params = self.params
2827 fd = get_suitable_downloader(info, params, to_stdout=(name == '-'))(self, params)
2828 if not test:
2829 for ph in self._progress_hooks:
2830 fd.add_progress_hook(ph)
2831 urls = '", "'.join(
2832 (f['url'].split(',')[0] + ',<data>' if f['url'].startswith('data:') else f['url'])
2833 for f in info.get('requested_formats', []) or [info])
2834 self.write_debug(f'Invoking {fd.FD_NAME} downloader on "{urls}"')
2835
2836 # Note: Ideally, info should be deep-copied so that hooks cannot modify it,
2837 # but it may contain objects that are not deep-copyable
2838 new_info = self._copy_infodict(info)
2839 if new_info.get('http_headers') is None:
2840 new_info['http_headers'] = self._calc_headers(new_info)
2841 return fd.download(name, new_info, subtitle)
2842
2843 def existing_file(self, filepaths, *, default_overwrite=True):
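''' Return the first existing file among filepaths if overwriting is disabled;
otherwise delete all existing candidates and return None so a fresh download proceeds '''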
2844 existing_files = list(filter(os.path.exists, orderedSet(filepaths)))
2845 if existing_files and not self.params.get('overwrites', default_overwrite):
2846 return existing_files[0]
2847
2848 for file in existing_files:
2849 self.report_file_delete(file)
2850 os.remove(file)
2851 return None
2852
2853 def process_info(self, info_dict):
2854 """Process a single resolved IE result. (Modifies it in-place)"""
2855
2856 assert info_dict.get('_type', 'video') == 'video'
2857 original_infodict = info_dict
2858
2859 if 'format' not in info_dict and 'ext' in info_dict:
2860 info_dict['format'] = info_dict['ext']
2861
2862 # This is mostly just for backward compatibility of process_info
2863 # As a side-effect, this allows for format-specific filters
2864 if self._match_entry(info_dict) is not None:
2865 info_dict['__write_download_archive'] = 'ignore'
2866 return
2867
2868 # Does nothing under normal operation - for backward compatibility of process_info
2869 self.post_extract(info_dict)
2870 self._num_downloads += 1
2871
2872 # info_dict['_filename'] needs to be set for backward compatibility
2873 info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
2874 temp_filename = self.prepare_filename(info_dict, 'temp')
2875 files_to_move = {}
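# Maps temporary filepaths to their final destinations, consumed later
# by MoveFilesAfterDownloadPP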
2876
2877 # Forced printings
2878 self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))
2879
2880 def check_max_downloads():
2881 if self._num_downloads >= float(self.params.get('max_downloads') or 'inf'):
2882 raise MaxDownloadsReached()
2883
2884 if self.params.get('simulate'):
2885 info_dict['__write_download_archive'] = self.params.get('force_write_download_archive')
2886 check_max_downloads()
2887 return
2888
2889 if full_filename is None:
2890 return
2891 if not self._ensure_dir_exists(encodeFilename(full_filename)):
2892 return
2893 if not self._ensure_dir_exists(encodeFilename(temp_filename)):
2894 return
2895
2896 if self._write_description('video', info_dict,
2897 self.prepare_filename(info_dict, 'description')) is None:
2898 return
2899
2900 sub_files = self._write_subtitles(info_dict, temp_filename)
2901 if sub_files is None:
2902 return
2903 files_to_move.update(dict(sub_files))
2904
2905 thumb_files = self._write_thumbnails(
2906 'video', info_dict, temp_filename, self.prepare_filename(info_dict, 'thumbnail'))
2907 if thumb_files is None:
2908 return
2909 files_to_move.update(dict(thumb_files))
2910
2911 infofn = self.prepare_filename(info_dict, 'infojson')
2912 _infojson_written = self._write_info_json('video', info_dict, infofn)
2913 if _infojson_written:
2914 info_dict['infojson_filename'] = infofn
2915 # For backward compatibility, even though it was a private field
2916 info_dict['__infojson_filename'] = infofn
2917 elif _infojson_written is None:
2918 return
2919
2920 # Note: Annotations are deprecated
2921 annofn = None
2922 if self.params.get('writeannotations', False):
2923 annofn = self.prepare_filename(info_dict, 'annotation')
2924 if annofn:
2925 if not self._ensure_dir_exists(encodeFilename(annofn)):
2926 return
2927 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(annofn)):
2928 self.to_screen('[info] Video annotations are already present')
2929 elif not info_dict.get('annotations'):
2930 self.report_warning('There are no annotations to write.')
2931 else:
2932 try:
2933 self.to_screen('[info] Writing video annotations to: ' + annofn)
2934 with open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
2935 annofile.write(info_dict['annotations'])
2936 except (KeyError, TypeError):
2937 self.report_warning('There are no annotations to write.')
2938 except OSError:
2939 self.report_error('Cannot write annotations file: ' + annofn)
2940 return
2941
2942 # Write internet shortcut files
2943 def _write_link_file(link_type):
2944 url = try_get(info_dict['webpage_url'], iri_to_uri)
2945 if not url:
2946 self.report_warning(
2947 f'Cannot write internet shortcut file because the actual URL of "{info_dict["webpage_url"]}" is unknown')
2948 return True
2949 linkfn = replace_extension(self.prepare_filename(info_dict, 'link'), link_type, info_dict.get('ext'))
2950 if not self._ensure_dir_exists(encodeFilename(linkfn)):
2951 return False
2952 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)):
2953 self.to_screen(f'[info] Internet shortcut (.{link_type}) is already present')
2954 return True
2955 try:
2956 self.to_screen(f'[info] Writing internet shortcut (.{link_type}) to: {linkfn}')
2957 with open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8',
2958 newline='\r\n' if link_type == 'url' else '\n') as linkfile:
2959 template_vars = {'url': url}
2960 if link_type == 'desktop':
2961 template_vars['filename'] = linkfn[:-(len(link_type) + 1)]
2962 linkfile.write(LINK_TEMPLATES[link_type] % template_vars)
2963 except OSError:
2964 self.report_error(f'Cannot write internet shortcut {linkfn}')
2965 return False
2966 return True
2967
2968 write_links = {
2969 'url': self.params.get('writeurllink'),
2970 'webloc': self.params.get('writewebloclink'),
2971 'desktop': self.params.get('writedesktoplink'),
2972 }
2973 if self.params.get('writelink'):
2974 link_type = ('webloc' if sys.platform == 'darwin'
2975 else 'desktop' if sys.platform.startswith('linux')
2976 else 'url')
2977 write_links[link_type] = True
2978
2979 if any(should_write and not _write_link_file(link_type)
2980 for link_type, should_write in write_links.items()):
2981 return
2982
2983 def replace_info_dict(new_info):
2984 nonlocal info_dict
2985 if new_info == info_dict:
2986 return
2987 info_dict.clear()
2988 info_dict.update(new_info)
2989
2990 new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
2991 replace_info_dict(new_info)
2992
2993 if self.params.get('skip_download'):
2994 info_dict['filepath'] = temp_filename
2995 info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
2996 info_dict['__files_to_move'] = files_to_move
2997 replace_info_dict(self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict))
2998 info_dict['__write_download_archive'] = self.params.get('force_write_download_archive')
2999 else:
3000 # Download
3001 info_dict.setdefault('__postprocessors', [])
3002 try:
3003
3004 def existing_video_file(*filepaths):
3005 ext = info_dict.get('ext')
3006 converted = lambda file: replace_extension(file, self.params.get('final_ext') or ext, ext)
3007 file = self.existing_file(itertools.chain(*zip(map(converted, filepaths), filepaths)),
3008 default_overwrite=False)
3009 if file:
3010 info_dict['ext'] = os.path.splitext(file)[1][1:]
3011 return file
3012
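# Partial downloads (e.g. --download-sections) are only possible through
# ffmpeg; abort early if another downloader would be picked for this format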
3013 fd, success = None, True
3014 if info_dict.get('protocol') or info_dict.get('url'):
3015 fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
3016 if fd is not FFmpegFD and (
3017 info_dict.get('section_start') or info_dict.get('section_end')):
3018 msg = ('This format cannot be partially downloaded' if FFmpegFD.available()
3019 else 'You have requested downloading the video partially, but ffmpeg is not installed')
3020 self.report_error(f'{msg}. Aborting')
3021 return
3022
3023 if info_dict.get('requested_formats') is not None:
3024
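# Multiple formats were requested (e.g. bestvideo+bestaudio); pick a
# container that can hold all of them before handing off to the merger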
3025 def compatible_formats(formats):
3026 # TODO: some formats actually allow this (mkv, webm, ogg, mp4), but not all of them.
3027 video_formats = [format for format in formats if format.get('vcodec') != 'none']
3028 audio_formats = [format for format in formats if format.get('acodec') != 'none']
3029 if len(video_formats) > 2 or len(audio_formats) > 2:
3030 return False
3031
3032 # Check extension
3033 exts = {format.get('ext') for format in formats}
3034 COMPATIBLE_EXTS = (
3035 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'},
3036 {'webm'},
3037 )
3038 for ext_sets in COMPATIBLE_EXTS:
3039 if ext_sets.issuperset(exts):
3040 return True
3041 # TODO: Check acodec/vcodec
3042 return False
3043
3044 requested_formats = info_dict['requested_formats']
3045 old_ext = info_dict['ext']
3046 if self.params.get('merge_output_format') is None:
3047 if not compatible_formats(requested_formats):
3048 info_dict['ext'] = 'mkv'
3049 self.report_warning(
3050 'Requested formats are incompatible for merge and will be merged into mkv')
3051 if (info_dict['ext'] == 'webm'
3052 and info_dict.get('thumbnails')
3053 # check with type instead of pp_key, __name__, or isinstance
3054 # since we don't want any custom PPs to trigger this
3055 and any(type(pp) == EmbedThumbnailPP for pp in self._pps['post_process'])): # noqa: E721
3056 info_dict['ext'] = 'mkv'
3057 self.report_warning(
3058 'webm doesn\'t support embedding a thumbnail; mkv will be used')
3059 new_ext = info_dict['ext']
3060
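# Rewrite the filename extension to the merge container's ext, but only
# strip an existing extension when it is one we produced ourselves
# (old or new ext), so user-supplied names containing dots stay intact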
3061 def correct_ext(filename, ext=new_ext):
3062 if filename == '-':
3063 return filename
3064 filename_real_ext = os.path.splitext(filename)[1][1:]
3065 filename_wo_ext = (
3066 os.path.splitext(filename)[0]
3067 if filename_real_ext in (old_ext, new_ext)
3068 else filename)
3069 return f'{filename_wo_ext}.{ext}'
3070
3071 # Ensure filename always has a correct extension for successful merge
3072 full_filename = correct_ext(full_filename)
3073 temp_filename = correct_ext(temp_filename)
3074 dl_filename = existing_video_file(full_filename, temp_filename)
3075 info_dict['__real_download'] = False
3076
3077 merger = FFmpegMergerPP(self)
3078 downloaded = []
3079 if dl_filename is not None:
3080 self.report_file_already_downloaded(dl_filename)
3081 elif fd:
3082 for f in requested_formats if fd != FFmpegFD else []:
3083 f['filepath'] = fname = prepend_extension(
3084 correct_ext(temp_filename, info_dict['ext']),
3085 'f%s' % f['format_id'], info_dict['ext'])
3086 downloaded.append(fname)
3087 info_dict['url'] = '\n'.join(f['url'] for f in requested_formats)
3088 success, real_download = self.dl(temp_filename, info_dict)
3089 info_dict['__real_download'] = real_download
3090 else:
3091 if self.params.get('allow_unplayable_formats'):
3092 self.report_warning(
3093 'You have requested merging of multiple formats '
3094 'while also allowing unplayable formats to be downloaded. '
3095 'The formats won\'t be merged to prevent data corruption.')
3096 elif not merger.available:
3097 msg = 'You have requested merging of multiple formats but ffmpeg is not installed'
3098 if not self.params.get('ignoreerrors'):
3099 self.report_error(f'{msg}. Aborting due to --abort-on-error')
3100 return
3101 self.report_warning(f'{msg}. The formats won\'t be merged')
3102
3103 if temp_filename == '-':
3104 reason = ('using a downloader other than ffmpeg' if FFmpegFD.can_merge_formats(info_dict, self.params)
3105 else 'but the formats are incompatible for simultaneous download' if merger.available
3106 else 'but ffmpeg is not installed')
3107 self.report_warning(
3108 f'You have requested downloading multiple formats to stdout {reason}. '
3109 'The formats will be streamed one after the other')
3110 fname = temp_filename
3111 for f in requested_formats:
3112 new_info = dict(info_dict)
3113 del new_info['requested_formats']
3114 new_info.update(f)
3115 if temp_filename != '-':
3116 fname = prepend_extension(
3117 correct_ext(temp_filename, new_info['ext']),
3118 'f%s' % f['format_id'], new_info['ext'])
3119 if not self._ensure_dir_exists(fname):
3120 return
3121 f['filepath'] = fname
3122 downloaded.append(fname)
3123 partial_success, real_download = self.dl(fname, new_info)
3124 info_dict['__real_download'] = info_dict['__real_download'] or real_download
3125 success = success and partial_success
3126
3127 if downloaded and merger.available and not self.params.get('allow_unplayable_formats'):
3128 info_dict['__postprocessors'].append(merger)
3129 info_dict['__files_to_merge'] = downloaded
3130 # Even if nothing new was downloaded, the merge itself is only happening now
3131 info_dict['__real_download'] = True
3132 else:
3133 for file in downloaded:
3134 files_to_move[file] = None
3135 else:
3136 # Just a single file
3137 dl_filename = existing_video_file(full_filename, temp_filename)
3138 if dl_filename is None or dl_filename == temp_filename:
3139 # dl_filename == temp_filename could mean that the file was partially downloaded with --no-part.
3140 # So we should try to resume the download
3141 success, real_download = self.dl(temp_filename, info_dict)
3142 info_dict['__real_download'] = real_download
3143 else:
3144 self.report_file_already_downloaded(dl_filename)
3145
3146 dl_filename = dl_filename or temp_filename
3147 info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
3148
3149 except network_exceptions as err:
3150 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
3151 return
3152 except OSError as err:
3153 raise UnavailableVideoError(err)
3154 except (ContentTooShortError, ) as err:
3155 self.report_error(f'content too short (expected {err.expected} bytes and served {err.downloaded})')
3156 return
3157
3158 self._raise_pending_errors(info_dict)
3159 if success and full_filename != '-':
3160
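# Post-download fixups, controlled by --fixup:
# 'never'/'ignore' skip entirely, 'warn' only reports the problem,
# 'detect_or_warn' (the default) attaches the appropriate ffmpeg fixup PP
# when something was actually downloaded, and 'force' runs fixups regardless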
3161 def fixup():
3162 do_fixup = True
3163 fixup_policy = self.params.get('fixup')
3164 vid = info_dict['id']
3165
3166 if fixup_policy in ('ignore', 'never'):
3167 return
3168 elif fixup_policy == 'warn':
3169 do_fixup = 'warn'
3170 elif fixup_policy != 'force':
3171 assert fixup_policy in ('detect_or_warn', None)
3172 if not info_dict.get('__real_download'):
3173 do_fixup = False
3174
3175 def ffmpeg_fixup(cndn, msg, cls):
3176 if not (do_fixup and cndn):
3177 return
3178 elif do_fixup == 'warn':
3179 self.report_warning(f'{vid}: {msg}')
3180 return
3181 pp = cls(self)
3182 if pp.available:
3183 info_dict['__postprocessors'].append(pp)
3184 else:
3185 self.report_warning(f'{vid}: {msg}. Install ffmpeg to fix this automatically')
3186
3187 stretched_ratio = info_dict.get('stretched_ratio')
3188 ffmpeg_fixup(stretched_ratio not in (1, None),
3189 f'Non-uniform pixel ratio {stretched_ratio}',
3190 FFmpegFixupStretchedPP)
3191
3192 downloader = get_suitable_downloader(info_dict, self.params) if 'protocol' in info_dict else None
3193 downloader = downloader.FD_NAME if downloader else None
3194
3195 ext = info_dict.get('ext')
3196 postprocessed_by_ffmpeg = info_dict.get('requested_formats') or any((
3197 isinstance(pp, FFmpegVideoConvertorPP)
3198 and resolve_recode_mapping(ext, pp.mapping)[0] not in (ext, None)
3199 ) for pp in self._pps['post_process'])
3200
3201 if not postprocessed_by_ffmpeg:
3202 ffmpeg_fixup(ext == 'm4a' and info_dict.get('container') == 'm4a_dash',
3203 'writing DASH m4a. Only some players support this container',
3204 FFmpegFixupM4aPP)
3205 ffmpeg_fixup(downloader == 'hlsnative' and not self.params.get('hls_use_mpegts')
3206 or info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None,
3207 'Possible MPEG-TS in MP4 container or malformed AAC timestamps',
3208 FFmpegFixupM3u8PP)
3209 ffmpeg_fixup(info_dict.get('is_live') and downloader == 'DashSegmentsFD',
3210 'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP)
3211
3212 ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed timestamps detected', FFmpegFixupTimestampPP)
3213 ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed duration detected', FFmpegFixupDurationPP)
3214
3215 fixup()
3216 try:
3217 replace_info_dict(self.post_process(dl_filename, info_dict, files_to_move))
3218 except PostProcessingError as err:
3219 self.report_error('Postprocessing: %s' % str(err))
3220 return
3221 try:
3222 for ph in self._post_hooks:
3223 ph(info_dict['filepath'])
3224 except Exception as err:
3225 self.report_error('post hooks: %s' % str(err))
3226 return
3227 info_dict['__write_download_archive'] = True
3228
3229 assert info_dict is original_infodict # Make sure the info_dict was modified in-place
3230 if self.params.get('force_write_download_archive'):
3231 info_dict['__write_download_archive'] = True
3232 check_max_downloads()
3233
3234 def __download_wrapper(self, func):
3235 @functools.wraps(func)
3236 def wrapper(*args, **kwargs):
3237 try:
3238 res = func(*args, **kwargs)
3239 except UnavailableVideoError as e:
3240 self.report_error(e)
3241 except DownloadCancelled as e:
3242 self.to_screen(f'[info] {e}')
3243 if not self.params.get('break_per_url'):
3244 raise
3245 else:
3246 if self.params.get('dump_single_json', False):
3247 self.post_extract(res)
3248 self.to_stdout(json.dumps(self.sanitize_info(res)))
3249 return wrapper
3250
3251 def download(self, url_list):
3252 """Download a given list of URLs."""
3253 url_list = variadic(url_list) # Passing a single URL is a common mistake
3254 outtmpl = self.params['outtmpl']['default']
3255 if (len(url_list) > 1
3256 and outtmpl != '-'
3257 and '%' not in outtmpl
3258 and self.params.get('max_downloads') != 1):
3259 raise SameFileError(outtmpl)
3260
3261 for url in url_list:
3262 self.__download_wrapper(self.extract_info)(
3263 url, force_generic_extractor=self.params.get('force_generic_extractor', False))
3264
3265 return self._download_retcode
3266
3267 def download_with_info_file(self, info_filename):
3268 with contextlib.closing(fileinput.FileInput(
3269 [info_filename], mode='r',
3270 openhook=fileinput.hook_encoded('utf-8'))) as f:
3271 # FileInput doesn't have a read method, so we can't call json.load
3272 info = self.sanitize_info(json.loads('\n'.join(f)), self.params.get('clean_infojson', True))
3273 try:
3274 self.__download_wrapper(self.process_ie_result)(info, download=True)
3275 except (DownloadError, EntryNotInPlaylist, ReExtractInfo) as e:
3276 if not isinstance(e, EntryNotInPlaylist):
3277 self.to_stderr('\r')
3278 webpage_url = info.get('webpage_url')
3279 if webpage_url is not None:
3280 self.report_warning(f'The info failed to download: {e}; trying with URL {webpage_url}')
3281 return self.download([webpage_url])
3282 else:
3283 raise
3284 return self._download_retcode
3285
3286 @staticmethod
3287 def sanitize_info(info_dict, remove_private_keys=False):
3288 ''' Sanitize the infodict for converting to json '''
3289 if info_dict is None:
3290 return info_dict
3291 info_dict.setdefault('epoch', int(time.time()))
3292 info_dict.setdefault('_type', 'video')
3293
3294 if remove_private_keys:
3295 reject = lambda k, v: v is None or k.startswith('__') or k in {
3296 'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries',
3297 'entries', 'filepath', '_filename', 'infojson_filename', 'original_url', 'playlist_autonumber',
3298 }
3299 else:
3300 reject = lambda k, v: False
3301
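# Recursively coerce the infodict into JSON-serializable primitives;
# anything unknown (e.g. extractor objects) is stringified via repr()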
3302 def filter_fn(obj):
3303 if isinstance(obj, dict):
3304 return {k: filter_fn(v) for k, v in obj.items() if not reject(k, v)}
3305 elif isinstance(obj, (list, tuple, set, LazyList)):
3306 return list(map(filter_fn, obj))
3307 elif obj is None or isinstance(obj, (str, int, float, bool)):
3308 return obj
3309 else:
3310 return repr(obj)
3311
3312 return filter_fn(info_dict)
3313
3314 @staticmethod
3315 def filter_requested_info(info_dict, actually_filter=True):
3316 ''' Alias of sanitize_info for backward compatibility '''
3317 return YoutubeDL.sanitize_info(info_dict, actually_filter)
3318
3319 def _delete_downloaded_files(self, *files_to_delete, info={}, msg=None):
3320 for filename in set(filter(None, files_to_delete)):
3321 if msg:
3322 self.to_screen(msg % filename)
3323 try:
3324 os.remove(filename)
3325 except OSError:
3326 self.report_warning(f'Unable to delete file {filename}')
3327 if filename in info.get('__files_to_move', []): # NB: Delete even if None
3328 del info['__files_to_move'][filename]
3329
3330 @staticmethod
3331 def post_extract(info_dict):
3332 def actual_post_extract(info_dict):
3333 if info_dict.get('_type') in ('playlist', 'multi_video'):
3334 for video_dict in info_dict.get('entries', {}):
3335 actual_post_extract(video_dict or {})
3336 return
3337
3338 post_extractor = info_dict.pop('__post_extractor', None) or (lambda: {})
3339 info_dict.update(post_extractor())
3340
3341 actual_post_extract(info_dict or {})
3342
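# Run a single postprocessor; PPs return (files_to_delete, infodict).
# With --keep-video, files marked for deletion are kept and merely
# recorded in __files_to_move instead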
3343 def run_pp(self, pp, infodict):
3344 files_to_delete = []
3345 if '__files_to_move' not in infodict:
3346 infodict['__files_to_move'] = {}
3347 try:
3348 files_to_delete, infodict = pp.run(infodict)
3349 except PostProcessingError as e:
3350 # Must be True and not 'only_download'
3351 if self.params.get('ignoreerrors') is True:
3352 self.report_error(e)
3353 return infodict
3354 raise
3355
3356 if not files_to_delete:
3357 return infodict
3358 if self.params.get('keepvideo', False):
3359 for f in files_to_delete:
3360 infodict['__files_to_move'].setdefault(f, '')
3361 else:
3362 self._delete_downloaded_files(
3363 *files_to_delete, info=infodict, msg='Deleting original file %s (pass -k to keep)')
3364 return infodict
3365
3366 def run_all_pps(self, key, info, *, additional_pps=None):
3367 self._forceprint(key, info)
3368 for pp in (additional_pps or []) + self._pps[key]:
3369 info = self.run_pp(pp, info)
3370 return info
3371
3372 def pre_process(self, ie_info, key='pre_process', files_to_move=None):
3373 info = dict(ie_info)
3374 info['__files_to_move'] = files_to_move or {}
3375 try:
3376 info = self.run_all_pps(key, info)
3377 except PostProcessingError as err:
3378 msg = f'Preprocessing: {err}'
3379 info.setdefault('__pending_error', msg)
3380 self.report_error(msg, is_error=False)
3381 return info, info.pop('__files_to_move', None)
3382
3383 def post_process(self, filename, info, files_to_move=None):
3384 """Run all the postprocessors on the given file."""
3385 info['filepath'] = filename
3386 info['__files_to_move'] = files_to_move or {}
3387 info = self.run_all_pps('post_process', info, additional_pps=info.get('__postprocessors'))
3388 info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
3389 del info['__files_to_move']
3390 return self.run_all_pps('after_move', info)
3391
3392 def _make_archive_id(self, info_dict):
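''' Return the download-archive key: the lower-cased extractor key followed
by the video id, e.g. 'youtube dQw4w9WgXcQ' '''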
3393 video_id = info_dict.get('id')
3394 if not video_id:
3395 return
3396 # The extractor key is lower-cased to future-proof against changes in case
3397 # and for backwards compatibility with prior versions
3398 extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist
3399 if extractor is None:
3400 url = str_or_none(info_dict.get('url'))
3401 if not url:
3402 return
3403 # Try to find matching extractor for the URL and take its ie_key
3404 for ie_key, ie in self._ies.items():
3405 if ie.suitable(url):
3406 extractor = ie_key
3407 break
3408 else:
3409 return
3410 return f'{extractor.lower()} {video_id}'
3411
3412 def in_download_archive(self, info_dict):
3413 fn = self.params.get('download_archive')
3414 if fn is None:
3415 return False
3416
3417 vid_id = self._make_archive_id(info_dict)
3418 if not vid_id:
3419 return False # Incomplete video information
3420
3421 return vid_id in self.archive
3422
3423 def record_download_archive(self, info_dict):
3424 fn = self.params.get('download_archive')
3425 if fn is None:
3426 return
3427 vid_id = self._make_archive_id(info_dict)
3428 assert vid_id
3429 self.write_debug(f'Adding to archive: {vid_id}')
3430 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
3431 archive_file.write(vid_id + '\n')
3432 self.archive.add(vid_id)
3433
3434 @staticmethod
3435 def format_resolution(format, default='unknown'):
3436 if format.get('vcodec') == 'none' and format.get('acodec') != 'none':
3437 return 'audio only'
3438 if format.get('resolution') is not None:
3439 return format['resolution']
3440 if format.get('width') and format.get('height'):
3441 return '%dx%d' % (format['width'], format['height'])
3442 elif format.get('height'):
3443 return '%sp' % format['height']
3444 elif format.get('width'):
3445 return '%dx?' % format['width']
3446 return default
3447
3448 def _list_format_headers(self, *headers):
3449 if self.params.get('listformats_table', True) is not False:
3450 return [self._format_out(header, self.Styles.HEADERS) for header in headers]
3451 return headers
3452
3453 def _format_note(self, fdict):
3454 res = ''
3455 if fdict.get('ext') in ['f4f', 'f4m']:
3456 res += '(unsupported)'
3457 if fdict.get('language'):
3458 if res:
3459 res += ' '
3460 res += '[%s]' % fdict['language']
3461 if fdict.get('format_note') is not None:
3462 if res:
3463 res += ' '
3464 res += fdict['format_note']
3465 if fdict.get('tbr') is not None:
3466 if res:
3467 res += ', '
3468 res += '%4dk' % fdict['tbr']
3469 if fdict.get('container') is not None:
3470 if res:
3471 res += ', '
3472 res += '%s container' % fdict['container']
3473 if (fdict.get('vcodec') is not None
3474 and fdict.get('vcodec') != 'none'):
3475 if res:
3476 res += ', '
3477 res += fdict['vcodec']
3478 if fdict.get('vbr') is not None:
3479 res += '@'
3480 elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
3481 res += 'video@'
3482 if fdict.get('vbr') is not None:
3483 res += '%4dk' % fdict['vbr']
3484 if fdict.get('fps') is not None:
3485 if res:
3486 res += ', '
3487 res += '%sfps' % fdict['fps']
3488 if fdict.get('acodec') is not None:
3489 if res:
3490 res += ', '
3491 if fdict['acodec'] == 'none':
3492 res += 'video only'
3493 else:
3494 res += '%-5s' % fdict['acodec']
3495 elif fdict.get('abr') is not None:
3496 if res:
3497 res += ', '
3498 res += 'audio'
3499 if fdict.get('abr') is not None:
3500 res += '@%3dk' % fdict['abr']
3501 if fdict.get('asr') is not None:
3502 res += ' (%5dHz)' % fdict['asr']
3503 if fdict.get('filesize') is not None:
3504 if res:
3505 res += ', '
3506 res += format_bytes(fdict['filesize'])
3507 elif fdict.get('filesize_approx') is not None:
3508 if res:
3509 res += ', '
3510 res += '~' + format_bytes(fdict['filesize_approx'])
3511 return res
3512
3513 def render_formats_table(self, info_dict):
3514 if not info_dict.get('formats') and not info_dict.get('url'):
3515 return None
3516
3517 formats = info_dict.get('formats', [info_dict])
3518 if self.params.get('listformats_table', True) is False:
3519 table = [
3520 [
3521 format_field(f, 'format_id'),
3522 format_field(f, 'ext'),
3523 self.format_resolution(f),
3524 self._format_note(f)
3525 ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
3526 return render_table(['format code', 'extension', 'resolution', 'note'], table, extra_gap=1)
3527
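# Trim codec strings to at most 4 dot-separated parts (e.g. avc1.640028);
# storyboard-only formats are shown as 'images', and the missing half of
# an audio- or video-only format is rendered dimmed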
3528 def simplified_codec(f, field):
3529 assert field in ('acodec', 'vcodec')
3530 codec = f.get(field, 'unknown')
3531 if codec != 'none':
3532 return '.'.join(codec.split('.')[:4])
3533
3534 if field == 'vcodec' and f.get('acodec') == 'none':
3535 return 'images'
3536 elif field == 'acodec' and f.get('vcodec') == 'none':
3537 return ''
3538 return self._format_out('audio only' if field == 'vcodec' else 'video only',
3539 self.Styles.SUPPRESS)
3540
3541 delim = self._format_out('\u2502', self.Styles.DELIM, '|', test_encoding=True)
3542 table = [
3543 [
3544 self._format_out(format_field(f, 'format_id'), self.Styles.ID),
3545 format_field(f, 'ext'),
3546 format_field(f, func=self.format_resolution, ignore=('audio only', 'images')),
3547 format_field(f, 'fps', '\t%d', func=round),
3548 format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''),
3549 delim,
3550 format_field(f, 'filesize', ' \t%s', func=format_bytes) + format_field(f, 'filesize_approx', '~\t%s', func=format_bytes),
3551 format_field(f, 'tbr', '\t%dk', func=round),
3552 shorten_protocol_name(f.get('protocol', '')),
3553 delim,
3554 simplified_codec(f, 'vcodec'),
3555 format_field(f, 'vbr', '\t%dk', func=round),
3556 simplified_codec(f, 'acodec'),
3557 format_field(f, 'abr', '\t%dk', func=round),
3558 format_field(f, 'asr', '\t%s', func=format_decimal_suffix),
3559 join_nonempty(
3560 self._format_out('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None,
3561 format_field(f, 'language', '[%s]'),
3562 join_nonempty(format_field(f, 'format_note'),
3563 format_field(f, 'container', ignore=(None, f.get('ext'))),
3564 delim=', '),
3565 delim=' '),
3566 ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
3567 header_line = self._list_format_headers(
3568 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', delim, '\tFILESIZE', '\tTBR', 'PROTO',
3569 delim, 'VCODEC', '\tVBR', 'ACODEC', '\tABR', '\tASR', 'MORE INFO')
3570
3571 return render_table(
3572 header_line, table, hide_empty=True,
3573 delim=self._format_out('\u2500', self.Styles.DELIM, '-', test_encoding=True))
3574
3575 def render_thumbnails_table(self, info_dict):
3576 thumbnails = list(info_dict.get('thumbnails') or [])
3577 if not thumbnails:
3578 return None
3579 return render_table(
3580 self._list_format_headers('ID', 'Width', 'Height', 'URL'),
3581 [[t.get('id'), t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])
3582
3583 def render_subtitles_table(self, video_id, subtitles):
3584 def _row(lang, formats):
3585 exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats)))
3586 if len(set(names)) == 1:
3587 names = [] if names[0] == 'unknown' else names[:1]
3588 return [lang, ', '.join(names), ', '.join(exts)]
3589
3590 if not subtitles:
3591 return None
3592 return render_table(
3593 self._list_format_headers('Language', 'Name', 'Formats'),
3594 [_row(lang, formats) for lang, formats in subtitles.items()],
3595 hide_empty=True)
3596
3597 def __list_table(self, video_id, name, func, *args):
3598 table = func(*args)
3599 if not table:
3600 self.to_screen(f'{video_id} has no {name}')
3601 return
3602 self.to_screen(f'[info] Available {name} for {video_id}:')
3603 self.to_stdout(table)
3604
3605 def list_formats(self, info_dict):
3606 self.__list_table(info_dict['id'], 'formats', self.render_formats_table, info_dict)
3607
3608 def list_thumbnails(self, info_dict):
3609 self.__list_table(info_dict['id'], 'thumbnails', self.render_thumbnails_table, info_dict)
3610
3611 def list_subtitles(self, video_id, subtitles, name='subtitles'):
3612 self.__list_table(video_id, name, self.render_subtitles_table, video_id, subtitles)
3613
3614 def urlopen(self, req):
3615 """ Start an HTTP download """
3616 if isinstance(req, str):
3617 req = sanitized_Request(req)
3618 return self._opener.open(req, timeout=self._socket_timeout)
3619
3620 def print_debug_header(self):
3621 if not self.params.get('verbose'):
3622 return
3623
3624 # These imports can be slow, so import them only as needed
3625 from .extractor.extractors import _LAZY_LOADER
3626 from .extractor.extractors import _PLUGIN_CLASSES as plugin_extractors
3627
3628 def get_encoding(stream):
3629 ret = str(getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__))
3630 if not supports_terminal_sequences(stream):
3631 from .utils import WINDOWS_VT_MODE # Must be imported locally
3632 ret += ' (No VT)' if WINDOWS_VT_MODE is False else ' (No ANSI)'
3633 return ret
3634
3635 encoding_str = 'Encodings: locale %s, fs %s, pref %s, %s' % (
3636 locale.getpreferredencoding(),
3637 sys.getfilesystemencoding(),
3638 self.get_encoding(),
3639 ', '.join(
3640 f'{key} {get_encoding(stream)}' for key, stream in self._out_files.items_
3641 if stream is not None and key != 'console')
3642 )
3643
3644 logger = self.params.get('logger')
3645 if logger:
3646 write_debug = lambda msg: logger.debug(f'[debug] {msg}')
3647 write_debug(encoding_str)
3648 else:
3649 write_string(f'[debug] {encoding_str}\n', encoding=None)
3650 write_debug = lambda msg: self._write_string(f'[debug] {msg}\n')
3651
3652 source = detect_variant()
3653 write_debug(join_nonempty(
3654 'yt-dlp version', __version__,
3655 f'[{RELEASE_GIT_HEAD}]' if RELEASE_GIT_HEAD else '',
3656 '' if source == 'unknown' else f'({source})',
3657 delim=' '))
3658 if not _LAZY_LOADER:
3659 if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'):
3660 write_debug('Lazy loading extractors is forcibly disabled')
3661 else:
3662 write_debug('Lazy loading extractors is disabled')
3663 if plugin_extractors or plugin_postprocessors:
3664 write_debug('Plugins: %s' % [
3665 '%s%s' % (klass.__name__, '' if klass.__name__ == name else f' as {name}')
3666 for name, klass in itertools.chain(plugin_extractors.items(), plugin_postprocessors.items())])
3667 if self.params['compat_opts']:
3668 write_debug('Compatibility options: %s' % ', '.join(self.params['compat_opts']))
3669
3670 if source == 'source':
3671 try:
3672 stdout, _, _ = Popen.run(
3673 ['git', 'rev-parse', '--short', 'HEAD'],
3674 text=True, cwd=os.path.dirname(os.path.abspath(__file__)),
3675 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3676 if re.fullmatch('[0-9a-f]+', stdout.strip()):
3677 write_debug(f'Git HEAD: {stdout.strip()}')
3678 except Exception:
3679 with contextlib.suppress(Exception):
3680 sys.exc_clear()
3681
3682 write_debug(system_identifier())
3683
3684 exe_versions, ffmpeg_features = FFmpegPostProcessor.get_versions_and_features(self)
3685 ffmpeg_features = {key for key, val in ffmpeg_features.items() if val}
3686 if ffmpeg_features:
3687 exe_versions['ffmpeg'] += ' (%s)' % ','.join(sorted(ffmpeg_features))
3688
3689 exe_versions['rtmpdump'] = rtmpdump_version()
3690 exe_versions['phantomjs'] = PhantomJSwrapper._version()
3691 exe_str = ', '.join(
3692 f'{exe} {v}' for exe, v in sorted(exe_versions.items()) if v
3693 ) or 'none'
3694 write_debug('exe versions: %s' % exe_str)
3695
3696 from .compat.compat_utils import get_package_info
3697 from .dependencies import available_dependencies
3698
3699 write_debug('Optional libraries: %s' % (', '.join(sorted({
3700 join_nonempty(*get_package_info(m)) for m in available_dependencies.values()
3701 })) or 'none'))
3702
3703 self._setup_opener()
3704 proxy_map = {}
3705 for handler in self._opener.handlers:
3706 if hasattr(handler, 'proxies'):
3707 proxy_map.update(handler.proxies)
3708 write_debug(f'Proxy map: {proxy_map}')
3709
3710 # Not implemented
3711 if False and self.params.get('call_home'):
3712 ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode()
3713 write_debug('Public IP address: %s' % ipaddr)
3714 latest_version = self.urlopen(
3715 'https://yt-dl.org/latest/version').read().decode()
3716 if version_tuple(latest_version) > version_tuple(__version__):
3717 self.report_warning(
3718 'You are using an outdated version (newest version: %s)! '
3719 'See https://yt-dl.org/update if you need help updating.' %
3720 latest_version)
3721
3722 def _setup_opener(self):
3723 if hasattr(self, '_opener'):
3724 return
3725 timeout_val = self.params.get('socket_timeout')
3726 self._socket_timeout = 20 if timeout_val is None else float(timeout_val)
3727
3728 opts_cookiesfrombrowser = self.params.get('cookiesfrombrowser')
3729 opts_cookiefile = self.params.get('cookiefile')
3730 opts_proxy = self.params.get('proxy')
3731
3732 self.cookiejar = load_cookies(opts_cookiefile, opts_cookiesfrombrowser, self)
3733
3734 cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
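# An explicit --proxy (even an empty string, meaning a direct connection)
# takes precedence over proxies picked up from the environment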
3735 if opts_proxy is not None:
3736 if opts_proxy == '':
3737 proxies = {}
3738 else:
3739 proxies = {'http': opts_proxy, 'https': opts_proxy}
3740 else:
3741 proxies = urllib.request.getproxies()
3742 # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805)
3743 if 'http' in proxies and 'https' not in proxies:
3744 proxies['https'] = proxies['http']
3745 proxy_handler = PerRequestProxyHandler(proxies)
3746
3747 debuglevel = 1 if self.params.get('debug_printtraffic') else 0
3748 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
3749 ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
3750 redirect_handler = YoutubeDLRedirectHandler()
3751 data_handler = urllib.request.DataHandler()
3752
3753 # When passing our own FileHandler instance, build_opener won't add the
3754 # default FileHandler and allows us to disable the file protocol, which
3755 # can be used for malicious purposes (see
3756 # https://github.com/ytdl-org/youtube-dl/issues/8227)
3757 file_handler = urllib.request.FileHandler()
3758
3759 def file_open(*args, **kwargs):
3760 raise urllib.error.URLError('file:// scheme is explicitly disabled in yt-dlp for security reasons')
3761 file_handler.file_open = file_open
3762
3763 opener = urllib.request.build_opener(
3764 proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)
3765
3766 # Delete the default user-agent header, which would otherwise apply in
3767 # cases where our custom HTTP handler doesn't come into play
3768 # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
3769 opener.addheaders = []
3770 self._opener = opener
3771
3772 def encode(self, s):
3773 if isinstance(s, bytes):
3774 return s # Already encoded
3775
3776 try:
3777 return s.encode(self.get_encoding())
3778 except UnicodeEncodeError as err:
3779 err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
3780 raise
3781
3782 def get_encoding(self):
3783 encoding = self.params.get('encoding')
3784 if encoding is None:
3785 encoding = preferredencoding()
3786 return encoding
3787
3788 def _write_info_json(self, label, ie_result, infofn, overwrite=None):
3789 ''' Writes infojson and returns True = written, 'exists' = already exists, False = skipped, None = error '''
3790 if overwrite is None:
3791 overwrite = self.params.get('overwrites', True)
3792 if not self.params.get('writeinfojson'):
3793 return False
3794 elif not infofn:
3795 self.write_debug(f'Skipping writing {label} infojson')
3796 return False
3797 elif not self._ensure_dir_exists(infofn):
3798 return None
3799 elif not overwrite and os.path.exists(infofn):
3800 self.to_screen(f'[info] {label.title()} metadata is already present')
3801 return 'exists'
3802
3803 self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}')
3804 try:
3805 write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn)
3806 return True
3807 except OSError:
3808 self.report_error(f'Cannot write {label} metadata to JSON file {infofn}')
3809 return None
3810
3811 def _write_description(self, label, ie_result, descfn):
3812 ''' Writes description and returns True = written, False = skipped, None = error '''
3813 if not self.params.get('writedescription'):
3814 return False
3815 elif not descfn:
3816 self.write_debug(f'Skipping writing {label} description')
3817 return False
3818 elif not self._ensure_dir_exists(descfn):
3819 return None
3820 elif not self.params.get('overwrites', True) and os.path.exists(descfn):
3821 self.to_screen(f'[info] {label.title()} description is already present')
3822 elif ie_result.get('description') is None:
3823 self.report_warning(f'There\'s no {label} description to write')
3824 return False
3825 else:
3826 try:
3827 self.to_screen(f'[info] Writing {label} description to: {descfn}')
3828 with open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
3829 descfile.write(ie_result['description'])
3830 except OSError:
3831 self.report_error(f'Cannot write {label} description file {descfn}')
3832 return None
3833 return True
3834
3835 def _write_subtitles(self, info_dict, filename):
3836 ''' Writes subtitles to file and returns a list of (sub_filename, final_sub_filename); or None on error '''
3837 ret = []
3838 subtitles = info_dict.get('requested_subtitles')
3839 if not subtitles or not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')):
3840 # Subtitle download errors are already handled in the relevant IE,
3841 # so this silently continues when used with an IE that doesn't support subtitles
3842 return ret
3843
3844 sub_filename_base = self.prepare_filename(info_dict, 'subtitle')
3845 if not sub_filename_base:
3846 self.to_screen('[info] Skipping writing video subtitles')
3847 return ret
3848 for sub_lang, sub_info in subtitles.items():
3849 sub_format = sub_info['ext']
3850 sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
3851 sub_filename_final = subtitles_filename(sub_filename_base, sub_lang, sub_format, info_dict.get('ext'))
3852 existing_sub = self.existing_file((sub_filename_final, sub_filename))
3853 if existing_sub:
3854 self.to_screen(f'[info] Video subtitle {sub_lang}.{sub_format} is already present')
3855 sub_info['filepath'] = existing_sub
3856 ret.append((existing_sub, sub_filename_final))
3857 continue
3858
3859 self.to_screen(f'[info] Writing video subtitles to: {sub_filename}')
3860 if sub_info.get('data') is not None:
3861 try:
3862 # Use newline='' to prevent conversion of newline characters
3863 # See https://github.com/ytdl-org/youtube-dl/issues/10268
3864 with open(sub_filename, 'w', encoding='utf-8', newline='') as subfile:
3865 subfile.write(sub_info['data'])
3866 sub_info['filepath'] = sub_filename
3867 ret.append((sub_filename, sub_filename_final))
3868 continue
3869 except OSError:
3870 self.report_error(f'Cannot write video subtitles file {sub_filename}')
3871 return None
3872
3873 try:
3874 sub_copy = sub_info.copy()
3875 sub_copy.setdefault('http_headers', info_dict.get('http_headers'))
3876 self.dl(sub_filename, sub_copy, subtitle=True)
3877 sub_info['filepath'] = sub_filename
3878 ret.append((sub_filename, sub_filename_final))
3879 except (DownloadError, ExtractorError, OSError, ValueError) + network_exceptions as err:
3880 msg = f'Unable to download video subtitles for {sub_lang!r}: {err}'
3881 if self.params.get('ignoreerrors') is not True: # False or 'only_download'
3882 if not self.params.get('ignoreerrors'):
3883 self.report_error(msg)
3884 raise DownloadError(msg)
3885 self.report_warning(msg)
3886 return ret
3887
3888 def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None):
3889 ''' Write thumbnails to file and return list of (thumb_filename, final_thumb_filename) '''
3890 write_all = self.params.get('write_all_thumbnails', False)
3891 thumbnails, ret = [], []
3892 if write_all or self.params.get('writethumbnail', False):
3893 thumbnails = info_dict.get('thumbnails') or []
3894 multiple = write_all and len(thumbnails) > 1
3895
3896 if thumb_filename_base is None:
3897 thumb_filename_base = filename
3898 if thumbnails and not thumb_filename_base:
3899 self.write_debug(f'Skipping writing {label} thumbnail')
3900 return ret
3901
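# The thumbnails list is ordered worst-to-best, so iterate in reverse to try
# the most preferred one first; unless --write-all-thumbnails was given,
# stop after the first successful write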
3902 for idx, t in list(enumerate(thumbnails))[::-1]:
3903 thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg')
3904 thumb_display_id = f'{label} thumbnail {t["id"]}'
3905 thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext'))
3906 thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext'))
3907
3908 existing_thumb = self.existing_file((thumb_filename_final, thumb_filename))
3909 if existing_thumb:
3910 self.to_screen('[info] %s is already present' % (
3911 thumb_display_id if multiple else f'{label} thumbnail').capitalize())
3912 t['filepath'] = existing_thumb
3913 ret.append((existing_thumb, thumb_filename_final))
3914 else:
3915 self.to_screen(f'[info] Downloading {thumb_display_id} ...')
3916 try:
3917 uf = self.urlopen(sanitized_Request(t['url'], headers=t.get('http_headers', {})))
3918 self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}')
3919 with open(encodeFilename(thumb_filename), 'wb') as thumbf:
3920 shutil.copyfileobj(uf, thumbf)
3921 ret.append((thumb_filename, thumb_filename_final))
3922 t['filepath'] = thumb_filename
3923 except network_exceptions as err:
3924 thumbnails.pop(idx)
3925 self.report_warning(f'Unable to download {thumb_display_id}: {err}')
3926 if ret and not write_all:
3927 break
3928 return ret