#!/usr/bin/env python3
import collections
import contextlib
import datetime
import errno
import fileinput
import functools
import io
import itertools
import json
import locale
import operator
import os
import platform
import random
import re
import shutil
import subprocess
import sys
import tempfile
import time
import tokenize
import traceback
import unicodedata
import urllib.request
from string import ascii_letters

from .cache import Cache
from .compat import (
    HAS_LEGACY as compat_has_legacy,
    compat_get_terminal_size,
    compat_os_name,
    compat_shlex_quote,
    compat_str,
    compat_urllib_error,
    compat_urllib_request,
)
from .cookies import load_cookies
from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
from .downloader.rtmp import rtmpdump_version
from .extractor import gen_extractor_classes, get_info_extractor
from .extractor.openload import PhantomJSwrapper
from .minicurses import format_text
from .postprocessor import _PLUGIN_CLASSES as plugin_postprocessors
from .postprocessor import (
    EmbedThumbnailPP,
    FFmpegFixupDuplicateMoovPP,
    FFmpegFixupDurationPP,
    FFmpegFixupM3u8PP,
    FFmpegFixupM4aPP,
    FFmpegFixupStretchedPP,
    FFmpegFixupTimestampPP,
    FFmpegMergerPP,
    FFmpegPostProcessor,
    MoveFilesAfterDownloadPP,
    get_postprocessor,
)
from .update import detect_variant
from .utils import (
    DEFAULT_OUTTMPL,
    LINK_TEMPLATES,
    NO_DEFAULT,
    NUMBER_RE,
    OUTTMPL_TYPES,
    POSTPROCESS_WHEN,
    STR_FORMAT_RE_TMPL,
    STR_FORMAT_TYPES,
    ContentTooShortError,
    DateRange,
    DownloadCancelled,
    DownloadError,
    EntryNotInPlaylist,
    ExistingVideoReached,
    ExtractorError,
    GeoRestrictedError,
    HEADRequest,
    ISO3166Utils,
    LazyList,
    MaxDownloadsReached,
    Namespace,
    PagedList,
    PerRequestProxyHandler,
    PlaylistEntries,
    Popen,
    PostProcessingError,
    ReExtractInfo,
    RejectedVideoReached,
    SameFileError,
    UnavailableVideoError,
    YoutubeDLCookieProcessor,
    YoutubeDLHandler,
    YoutubeDLRedirectHandler,
    age_restricted,
    args_to_str,
    date_from_str,
    determine_ext,
    determine_protocol,
    encode_compat_str,
    encodeFilename,
    error_to_compat_str,
    expand_path,
    filter_dict,
    float_or_none,
    format_bytes,
    format_decimal_suffix,
    format_field,
    formatSeconds,
    get_domain,
    int_or_none,
    iri_to_uri,
    join_nonempty,
    locked_file,
    make_dir,
    make_HTTPS_handler,
    merge_headers,
    network_exceptions,
    number_of_digits,
    orderedSet,
    parse_filesize,
    platform_name,
    preferredencoding,
    prepend_extension,
    register_socks_protocols,
    remove_terminal_sequences,
    render_table,
    replace_extension,
    sanitize_filename,
    sanitize_path,
    sanitize_url,
    sanitized_Request,
    std_headers,
    str_or_none,
    strftime_or_none,
    subtitles_filename,
    supports_terminal_sequences,
    timetuple_from_msec,
    to_high_limit_path,
    traverse_obj,
    try_get,
    url_basename,
    variadic,
    version_tuple,
    windows_enable_vt_mode,
    write_json_file,
    write_string,
)
from .version import RELEASE_GIT_HEAD, __version__

if compat_os_name == 'nt':
    import ctypes


class YoutubeDL:
    """YoutubeDL class.

    YoutubeDL objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. Since, given a video URL, the downloader doesn't know how to
    extract all the needed information (a task the InfoExtractors handle),
    it has to pass the URL to one of them.

    For this, YoutubeDL objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    YoutubeDL processes the extracted information, possibly using a File
    Downloader to download the video.

    YoutubeDL objects accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The YoutubeDL also
    registers itself as the downloader in charge of the InfoExtractors
    that are added to it, so this is a "mutual registration".

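    A minimal usage sketch (illustrative only; the URL and the format
    selector are placeholders, not recommendations):

        from yt_dlp import YoutubeDL

        with YoutubeDL({'format': 'bestvideo+bestaudio/best'}) as ydl:
            ydl.download(['https://www.youtube.com/watch?v=example'])
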
    Available options:

    username:          Username for authentication purposes.
    password:          Password for authentication purposes.
    videopassword:     Password for accessing a video.
    ap_mso:            Adobe Pass multiple-system operator identifier.
    ap_username:       Multiple-system operator account username.
    ap_password:       Multiple-system operator account password.
    usenetrc:          Use netrc for authentication instead.
    verbose:           Print additional info to stdout.
    quiet:             Do not print messages to stdout.
    no_warnings:       Do not print out anything for warnings.
    forceprint:        A dict with keys WHEN mapped to a list of templates to
                       print to stdout. The allowed keys are 'video' or any of
                       the items in utils.POSTPROCESS_WHEN.
                       For compatibility, a single list is also accepted
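                       Eg (an illustrative value; the keys used are entries
                       of utils.POSTPROCESS_WHEN and the templates are
                       placeholders):
                       {'before_dl': ['%(title)s'], 'after_move': ['%(filepath)s']}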
    print_to_file:     A dict with keys WHEN (same as forceprint) mapped to
                       a list of tuples with (template, filename)
    forcejson:         Force printing info_dict as JSON.
    dump_single_json:  Force printing the info_dict of the whole playlist
                       (or video) as a single JSON line.
    force_write_download_archive: Force writing download archive regardless
                       of 'skip_download' or 'simulate'.
    simulate:          Do not download the video files. If unset (or None),
                       simulate only if listsubtitles, listformats or list_thumbnails is used
    format:            Video format code. See "FORMAT SELECTION" for more details.
                       You can also pass a function. The function takes 'ctx' as
                       argument and returns the formats to download.
                       See "build_format_selector" for an implementation
    allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded.
    ignore_no_formats_error: Ignore "No video formats" error. Useful for
                       extracting metadata even if the video is not actually
                       available for download (experimental)
    format_sort:       A list of fields by which to sort the video formats.
                       See "Sorting Formats" for more details.
    format_sort_force: Force the given format_sort. See "Sorting Formats"
                       for more details.
    prefer_free_formats: Whether to prefer video formats with free containers
                       over non-free ones of same quality.
    allow_multiple_video_streams: Allow multiple video streams to be merged
                       into a single file
    allow_multiple_audio_streams: Allow multiple audio streams to be merged
                       into a single file
    check_formats      Whether to test if the formats are downloadable.
                       Can be True (check all), False (check none),
                       'selected' (check selected formats),
                       or None (check only if requested by extractor)
    paths:             Dictionary of output paths. The allowed keys are 'home',
                       'temp' and the keys of OUTTMPL_TYPES (in utils.py)
    outtmpl:           Dictionary of templates for output names. Allowed keys
                       are 'default' and the keys of OUTTMPL_TYPES (in utils.py).
                       For compatibility with youtube-dl, a single string can also be used
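                       An illustrative paths/outtmpl pair (all values are
                       placeholders):
                       'paths': {'home': '~/Videos', 'temp': '/tmp/yt-dlp'},
                       'outtmpl': {'default': '%(title)s [%(id)s].%(ext)s'}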
    outtmpl_na_placeholder: Placeholder for unavailable meta fields.
    restrictfilenames: Do not allow "&" and spaces in file names
    trim_file_name:    Limit length of filename (extension excluded)
    windowsfilenames:  Force the filenames to be windows compatible
    ignoreerrors:      Do not stop on download/postprocessing errors.
                       Can be 'only_download' to ignore only download errors.
                       Default is 'only_download' for CLI, but False for API
    skip_playlist_after_errors: Number of allowed failures until the rest of
                       the playlist is skipped
    force_generic_extractor: Force downloader to use the generic extractor
    overwrites:        Overwrite all video and metadata files if True,
                       overwrite only non-video files if None
                       and don't overwrite any file if False
                       For compatibility with youtube-dl,
                       "nooverwrites" may also be used instead
    playlist_items:    Specific indices of playlist to download.
    playlistrandom:    Download playlist items in random order.
    lazy_playlist:     Process playlist entries as they are received.
    matchtitle:        Download only matching titles.
    rejecttitle:       Reject downloads for matching titles.
    logger:            Log messages to a logging.Logger instance.
    logtostderr:       Log messages to stderr instead of stdout.
    consoletitle:      Display progress in console window's titlebar.
    writedescription:  Write the video description to a .description file
    writeinfojson:     Write the video metadata to a .info.json file
    clean_infojson:    Remove private fields from the infojson
    getcomments:       Extract video comments. This will not be written to disk
                       unless writeinfojson is also given
    writeannotations:  Write the video annotations to a .annotations.xml file
    writethumbnail:    Write the thumbnail image to a file
    allow_playlist_files: Whether to write playlists' description, infojson etc.
                       also to disk when using the 'write*' options
    write_all_thumbnails: Write all thumbnail formats to files
    writelink:         Write an internet shortcut file, depending on the
                       current platform (.url/.webloc/.desktop)
    writeurllink:      Write a Windows internet shortcut file (.url)
    writewebloclink:   Write a macOS internet shortcut file (.webloc)
    writedesktoplink:  Write a Linux internet shortcut file (.desktop)
    writesubtitles:    Write the video subtitles to a file
    writeautomaticsub: Write the automatically generated subtitles to a file
    listsubtitles:     Lists all available subtitles for the video
    subtitlesformat:   The format code for subtitles
    subtitleslangs:    List of languages of the subtitles to download (can be regex).
                       The list may contain "all" to refer to all the available
                       subtitles. The language can be prefixed with a "-" to
                       exclude it from the requested languages. Eg: ['all', '-live_chat']
    keepvideo:         Keep the video file after post-processing
    daterange:         A DateRange object, download only if the upload_date is in the range.
    skip_download:     Skip the actual download of the video file
    cachedir:          Location of the cache files in the filesystem.
                       False to disable filesystem cache.
    noplaylist:        Download single video instead of a playlist if in doubt.
    age_limit:         An integer representing the user's age in years.
                       Unsuitable videos for the given age are skipped.
    min_views:         An integer representing the minimum view count the video
                       must have in order to not be skipped.
                       Videos without view count information are always
                       downloaded. None for no limit.
    max_views:         An integer representing the maximum view count.
                       Videos that are more popular than that are not
                       downloaded.
                       Videos without view count information are always
                       downloaded. None for no limit.
    download_archive:  File name of a file where all downloads are recorded.
                       Videos already present in the file are not downloaded
                       again.
    break_on_existing: Stop the download process after attempting to download a
                       file that is in the archive.
    break_on_reject:   Stop the download process when encountering a video that
                       has been filtered out.
    break_per_url:     Whether break_on_reject and break_on_existing
                       should act on each input URL as opposed to the entire queue
    cookiefile:        File name or text stream from where cookies should be read and dumped to
    cookiesfrombrowser: A tuple containing the name of the browser, the profile
                       name/path from where cookies are loaded, and the name of the
                       keyring. Eg: ('chrome', ) or ('vivaldi', 'default', 'BASICTEXT')
    legacyserverconnect: Explicitly allow HTTPS connection to servers that do not
                       support RFC 5746 secure renegotiation
    nocheckcertificate: Do not verify SSL certificates
    client_certificate: Path to client certificate file in PEM format. May include the private key
    client_certificate_key: Path to private key file for client certificate
    client_certificate_password: Password for client certificate private key, if encrypted.
                       If not provided and the key is encrypted, yt-dlp will ask interactively
    prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
                       At the moment, this is only supported by YouTube.
    http_headers:      A dictionary of custom headers to be used for all requests
    proxy:             URL of the proxy server to use
    geo_verification_proxy: URL of the proxy to use for IP address verification
                       on geo-restricted sites.
    socket_timeout:    Time to wait for unresponsive hosts, in seconds
    bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
    debug_printtraffic: Print out sent and received HTTP traffic
    default_search:    Prepend this string if an input URL is not valid.
                       'auto' for elaborate guessing
    encoding:          Use this encoding instead of the system-specified.
    extract_flat:      Do not resolve URLs, return the immediate result.
                       Pass in 'in_playlist' to only show this behavior for
                       playlist items.
    wait_for_video:    If given, wait for scheduled streams to become available.
                       The value should be a tuple containing the range
                       (min_secs, max_secs) to wait between retries
    postprocessors:    A list of dictionaries, each with an entry
                       * key:  The name of the postprocessor. See
                               yt_dlp/postprocessor/__init__.py for a list.
                       * when: When to run the postprocessor. Allowed values are
                               the entries of utils.POSTPROCESS_WHEN
                               Assumed to be 'post_process' if not given
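                       A sketch of a single entry (FFmpegExtractAudio and its
                       'preferredcodec' argument exist, but treat the exact
                       values as placeholders):
                       {'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3'}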
    progress_hooks:    A list of functions that get called on download
                       progress, with a dictionary with the entries
                       * status: One of "downloading", "error", or "finished".
                                 Check this first and ignore unknown values.
                       * info_dict: The extracted info_dict

                       If status is one of "downloading" or "finished", the
                       following properties may also be present:
                       * filename: The final filename (always present)
                       * tmpfilename: The filename we're currently writing to
                       * downloaded_bytes: Bytes on disk
                       * total_bytes: Size of the whole file, None if unknown
                       * total_bytes_estimate: Guess of the eventual file size,
                                               None if unavailable.
                       * elapsed: The number of seconds since download started.
                       * eta: The estimated time in seconds, None if unknown
                       * speed: The download speed in bytes/second, None if
                                unknown
                       * fragment_index: The counter of the currently
                                         downloaded video fragment.
                       * fragment_count: The number of fragments (= individual
                                         files that will be merged)

                       Progress hooks are guaranteed to be called at least once
                       (with status "finished") if the download is successful.
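                       A minimal hook sketch:

                           def my_hook(d):
                               if d['status'] == 'finished':
                                   print('Download finished; post-processing next')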
    postprocessor_hooks: A list of functions that get called on postprocessing
                       progress, with a dictionary with the entries
                       * status: One of "started", "processing", or "finished".
                                 Check this first and ignore unknown values.
                       * postprocessor: Name of the postprocessor
                       * info_dict: The extracted info_dict

                       Progress hooks are guaranteed to be called at least twice
                       (with status "started" and "finished") if the processing is successful.
    merge_output_format: Extension to use when merging formats.
    final_ext:         Expected final extension; used to detect when the file was
                       already downloaded and converted
    fixup:             Automatically correct known faults of the file.
                       One of:
                       - "never": do nothing
                       - "warn": only emit a warning
                       - "detect_or_warn": check whether we can do anything
                         about it, warn otherwise (default)
    source_address:    Client-side IP address to bind to.
    sleep_interval_requests: Number of seconds to sleep between requests
                       during extraction
    sleep_interval:    Number of seconds to sleep before each download when
                       used alone or a lower bound of a range for randomized
                       sleep before each download (minimum possible number
                       of seconds to sleep) when used along with
                       max_sleep_interval.
    max_sleep_interval: Upper bound of a range for randomized sleep before each
                       download (maximum possible number of seconds to sleep).
                       Must only be used along with sleep_interval.
                       Actual sleep time will be a random float from range
                       [sleep_interval; max_sleep_interval].
    sleep_interval_subtitles: Number of seconds to sleep before each subtitle download
    listformats:       Print an overview of available video formats and exit.
    list_thumbnails:   Print a table of all thumbnails and exit.
    match_filter:      A function that gets called for every video with the signature
                       (info_dict, *, incomplete: bool) -> Optional[str]
                       For backward compatibility with youtube-dl, the signature
                       (info_dict) -> Optional[str] is also allowed.
                       - If it returns a message, the video is ignored.
                       - If it returns None, the video is downloaded.
                       - If it returns utils.NO_DEFAULT, the user is interactively
                         asked whether to download the video.
                       match_filter_func in utils.py is one example for this.
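                       A minimal filter sketch (the one-hour cutoff is an
                       arbitrary example):

                           def skip_long(info_dict, *, incomplete):
                               duration = info_dict.get('duration')
                               if duration and duration > 3600:
                                   return 'Longer than an hour'  # skipped
                               return None  # downloaded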
    no_color:          Do not emit color codes in output.
    geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
                       HTTP header
    geo_bypass_country:
                       Two-letter ISO 3166-1 alpha-2 country code that will be used for
                       explicit geographic restriction bypassing via faking
                       X-Forwarded-For HTTP header
    geo_bypass_ip_block:
                       IP range in CIDR notation that will be used similarly to
                       geo_bypass_country
    external_downloader: A dictionary of protocol keys and the executable of the
                       external downloader to use for it. The allowed protocols
                       are default|http|ftp|m3u8|dash|rtsp|rtmp|mms.
                       Set the value to 'native' to use the native downloader
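                       Eg (illustrative): {'m3u8': 'ffmpeg', 'default': 'aria2c'}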
    compat_opts:       Compatibility options. See "Differences in default behavior".
                       The following options do not work when used through the API:
                       filename, abort-on-error, multistreams, no-live-chat, format-sort
                       no-clean-infojson, no-playlist-metafiles, no-keep-subs, no-attach-info-json.
                       Refer to __init__.py for their implementation
    progress_template: Dictionary of templates for progress outputs.
                       Allowed keys are 'download', 'postprocess',
                       'download-title' (console title) and 'postprocess-title'.
                       The template is mapped on a dictionary with keys 'progress' and 'info'
    retry_sleep_functions: Dictionary of functions that take the number of attempts
                       as argument and return the time to sleep in seconds.
                       Allowed keys are 'http', 'fragment', 'file_access'
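                       Eg (an illustrative exponential backoff):
                       {'http': lambda n: 2 ** n, 'fragment': lambda n: n * 3}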
    download_ranges:   A function that gets called for every video with the signature
                       (info_dict, *, ydl) -> Iterable[Section].
                       Only the returned sections will be downloaded. Each Section contains:
                       * start_time: Start time of the section in seconds
                       * end_time: End time of the section in seconds
                       * title: Section title (Optional)
                       * index: Section number (Optional)
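                       A minimal callback sketch (assuming plain dicts are
                       acceptable as Sections):

                           def first_minute(info_dict, *, ydl):
                               yield {'start_time': 0, 'end_time': 60}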

    The following parameters are not used by YoutubeDL itself; they are used by
    the downloader (see yt_dlp/downloader/common.py):
    nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize,
    max_filesize, test, noresizebuffer, retries, file_access_retries, fragment_retries,
    continuedl, noprogress, xattr_set_filesize, hls_use_mpegts, http_chunk_size,
    external_downloader_args, concurrent_fragment_downloads.

    The following options are used by the post processors:
    ffmpeg_location:   Location of the ffmpeg/avconv binary; either the path
                       to the binary or its containing directory.
    postprocessor_args: A dictionary of postprocessor/executable keys (in lower case)
                       and a list of additional command-line arguments for the
                       postprocessor/executable. The dict can also have "PP+EXE" keys
                       which are used when the given exe is used by the given PP.
                       Use 'default' as the name for arguments to be passed to all PP
                       For compatibility with youtube-dl, a single list of args
                       can also be used

    The following options are used by the extractors:
    extractor_retries: Number of times to retry for known errors
    dynamic_mpd:       Whether to process dynamic DASH manifests (default: True)
    hls_split_discontinuity: Split HLS playlists to different formats at
                       discontinuities such as ad breaks (default: False)
    extractor_args:    A dictionary of arguments to be passed to the extractors.
                       See "EXTRACTOR ARGUMENTS" for details.
                       Eg: {'youtube': {'skip': ['dash', 'hls']}}
    mark_watched:      Mark videos watched (even with --simulate). Only for YouTube

    The following options are deprecated and may be removed in the future:

    playliststart:     - Use playlist_items
                       Playlist item to start at.
    playlistend:       - Use playlist_items
                       Playlist item to end at.
    playlistreverse:   - Use playlist_items
                       Download playlist items in reverse order.
    forceurl:          - Use forceprint
                       Force printing final URL.
    forcetitle:        - Use forceprint
                       Force printing title.
    forceid:           - Use forceprint
                       Force printing ID.
    forcethumbnail:    - Use forceprint
                       Force printing thumbnail URL.
    forcedescription:  - Use forceprint
                       Force printing description.
    forcefilename:     - Use forceprint
                       Force printing final filename.
    forceduration:     - Use forceprint
                       Force printing duration.
    allsubtitles:      - Use subtitleslangs = ['all']
                       Downloads all the subtitles of the video
                       (requires writesubtitles or writeautomaticsub)
    include_ads:       - Doesn't work
                       Download ads as well
    call_home:         - Not implemented
                       Boolean, true iff we are allowed to contact the
                       yt-dlp servers for debugging.
    post_hooks:        - Register a custom postprocessor
                       A list of functions that get called as the final step
                       for each video file, after all postprocessors have been
                       called. The filename will be passed as the only argument.
    hls_prefer_native: - Use external_downloader = {'m3u8': 'native'} or {'m3u8': 'ffmpeg'}.
                       Use the native HLS downloader instead of ffmpeg/avconv
                       if True, otherwise use ffmpeg/avconv if False, otherwise
                       use downloader suggested by extractor if None.
    prefer_ffmpeg:     - avconv support is deprecated
                       If False, use avconv instead of ffmpeg if both are available,
                       otherwise prefer ffmpeg.
    youtube_include_dash_manifest: - Use extractor_args
                       If True (default), DASH manifests and related
                       data will be downloaded and processed by extractor.
                       You can reduce network I/O by disabling it if you don't
                       care about DASH. (only for youtube)
    youtube_include_hls_manifest: - Use extractor_args
                       If True (default), HLS manifests and related
                       data will be downloaded and processed by extractor.
                       You can reduce network I/O by disabling it if you don't
                       care about HLS. (only for youtube)
    """

    _NUMERIC_FIELDS = {
        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
        'timestamp', 'release_timestamp',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
    }

    _format_fields = {
        # NB: Keep in sync with the docstring of extractor/common.py
        'url', 'manifest_url', 'manifest_stream_number', 'ext', 'format', 'format_id', 'format_note',
        'width', 'height', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr',
        'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx',
        'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start',
        'preference', 'language', 'language_preference', 'quality', 'source_preference',
        'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'downloader_options',
        'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time'
    }
    _format_selection_exts = {
        'audio': {'m4a', 'mp3', 'ogg', 'aac'},
        'video': {'mp4', 'flv', 'webm', '3gp'},
        'storyboards': {'mhtml'},
    }

    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options.
        @param auto_init    Whether to load the default extractors and print header (if verbose).
                            Set to 'no_verbose_header' to not print the header
        """
        if params is None:
            params = {}
        self.params = params
        self._ies = {}
        self._ies_instances = {}
        self._pps = {k: [] for k in POSTPROCESS_WHEN}
        self._printed_messages = set()
        self._first_webpage_request = True
        self._post_hooks = []
        self._progress_hooks = []
        self._postprocessor_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        self._num_videos = 0
        self._playlist_level = 0
        self._playlist_urls = set()
        self.cache = Cache(self)

        windows_enable_vt_mode()
        stdout = sys.stderr if self.params.get('logtostderr') else sys.stdout
        self._out_files = Namespace(
            out=stdout,
            error=sys.stderr,
            screen=sys.stderr if self.params.get('quiet') else stdout,
            console=None if compat_os_name == 'nt' else next(
                filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None)
        )
        self._allow_colors = Namespace(**{
            type_: not self.params.get('no_color') and supports_terminal_sequences(stream)
            for type_, stream in self._out_files.items_ if type_ != 'console'
        })

        if sys.version_info < (3, 6):
            self.report_warning(
                'Python version %d.%d is not supported! Please update to Python 3.6 or above' % sys.version_info[:2])

        if self.params.get('allow_unplayable_formats'):
            self.report_warning(
                f'You have asked for {self._format_err("UNPLAYABLE", self.Styles.EMPHASIS)} formats to be listed/downloaded. '
                'This is a developer option intended for debugging. \n'
                ' If you experience any issues while using this option, '
                f'{self._format_err("DO NOT", self.Styles.ERROR)} open a bug report')

        def check_deprecated(param, option, suggestion):
            if self.params.get(param) is not None:
                self.report_warning(f'{option} is deprecated. Use {suggestion} instead')
                return True
            return False

        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
        check_deprecated('useid', '--id', '-o "%(id)s.%(ext)s"')

        for msg in self.params.get('_warnings', []):
            self.report_warning(msg)
        for msg in self.params.get('_deprecation_warnings', []):
            self.deprecation_warning(msg)

        self.params['compat_opts'] = set(self.params.get('compat_opts', ()))
        if not compat_has_legacy:
            self.params['compat_opts'].add('no-compat-legacy')
        if 'list-formats' in self.params['compat_opts']:
            self.params['listformats_table'] = False

        if 'overwrites' not in self.params and self.params.get('nooverwrites') is not None:
            # nooverwrites was unnecessarily changed to overwrites
            # in 0c3d0f51778b153f65c21906031c2e091fcfb641
            # This ensures compatibility with both keys
            self.params['overwrites'] = not self.params['nooverwrites']
        elif self.params.get('overwrites') is None:
            self.params.pop('overwrites', None)
        else:
            self.params['nooverwrites'] = not self.params['overwrites']

        self.params.setdefault('forceprint', {})
        self.params.setdefault('print_to_file', {})

        # Compatibility with older syntax
        if not isinstance(params['forceprint'], dict):
            self.params['forceprint'] = {'video': params['forceprint']}

        if self.params.get('bidi_workaround', False):
            try:
                import pty
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                width_args = [] if width is None else ['-w', str(width)]
                sp_kwargs = {'stdin': subprocess.PIPE, 'stdout': slave, 'stderr': self._out_files.error}
                try:
                    self._output_process = Popen(['bidiv'] + width_args, **sp_kwargs)
                except OSError:
                    self._output_process = Popen(['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning(
                        'Could not find fribidi executable, ignoring --bidi-workaround. '
                        'Make sure that fribidi is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if auto_init:
            if auto_init != 'no_verbose_header':
                self.print_debug_header()
            self.add_default_info_extractors()

        if (sys.platform != 'win32'
                and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
                and not self.params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        self._parse_outtmpl()

        # Creating format selector here allows us to catch syntax errors before the extraction
        self.format_selector = (
            self.params.get('format') if self.params.get('format') in (None, '-')
            else self.params['format'] if callable(self.params['format'])
            else self.build_format_selector(self.params['format']))

        # Set http_headers defaults according to std_headers
        self.params['http_headers'] = merge_headers(std_headers, self.params.get('http_headers', {}))

        hooks = {
            'post_hooks': self.add_post_hook,
            'progress_hooks': self.add_progress_hook,
            'postprocessor_hooks': self.add_postprocessor_hook,
        }
        for opt, fn in hooks.items():
            for ph in self.params.get(opt, []):
                fn(ph)

        for pp_def_raw in self.params.get('postprocessors', []):
            pp_def = dict(pp_def_raw)
            when = pp_def.pop('when', 'post_process')
            self.add_post_processor(
                get_postprocessor(pp_def.pop('key'))(self, **pp_def),
                when=when)

        self._setup_opener()
        register_socks_protocols()

        def preload_download_archive(fn):
            """Preload the archive, if any is specified"""
            if fn is None:
                return False
            self.write_debug(f'Loading archive file {fn!r}')
            try:
                with locked_file(fn, 'r', encoding='utf-8') as archive_file:
                    for line in archive_file:
                        self.archive.add(line.strip())
            except OSError as ioe:
                if ioe.errno != errno.ENOENT:
                    raise
                return False
            return True

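        # NB: each archive line is assumed to have the form
        # '<extractor key, lowercased> <video id>', eg: 'youtube dQw4w9WgXcQ'
        # (an illustrative entry matching what in_download_archive checks)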
        self.archive = set()
        preload_download_archive(self.params.get('download_archive'))

    def warn_if_short_id(self, argv):
        # short YouTube ID starting with dash?
        idxs = [
            i for i, a in enumerate(argv)
            if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
        if idxs:
            correct_argv = (
                ['yt-dlp']
                + [a for i, a in enumerate(argv) if i not in idxs]
                + ['--'] + [argv[i] for i in idxs]
            )
            self.report_warning(
                'Long argument string detected. '
                'Use -- to separate parameters and URLs, like this:\n%s' %
                args_to_str(correct_argv))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        ie_key = ie.ie_key()
        self._ies[ie_key] = ie
        if not isinstance(ie, type):
            self._ies_instances[ie_key] = ie
            ie.set_downloader(self)

    def _get_info_extractor_class(self, ie_key):
        ie = self._ies.get(ie_key)
        if ie is None:
            ie = get_info_extractor(ie_key)
            self.add_info_extractor(ie)
        return ie

    def get_info_extractor(self, ie_key):
        """
        Get an instance of an IE with name ie_key. It will try to get one from
        the _ies list; if there is no instance, it will create a new one and add
        it to the extractor list.
        """
        ie = self._ies_instances.get(ie_key)
        if ie is None:
            ie = get_info_extractor(ie_key)()
            self.add_info_extractor(ie)
        return ie

    def add_default_info_extractors(self):
        """
        Add the InfoExtractors returned by gen_extractors to the end of the list
        """
        for ie in gen_extractor_classes():
            self.add_info_extractor(ie)

    def add_post_processor(self, pp, when='post_process'):
        """Add a PostProcessor object to the end of the chain."""
        self._pps[when].append(pp)
        pp.set_downloader(self)

    def add_post_hook(self, ph):
        """Add the post hook"""
        self._post_hooks.append(ph)

    def add_progress_hook(self, ph):
        """Add the download progress hook"""
        self._progress_hooks.append(ph)

    def add_postprocessor_hook(self, ph):
        """Add the postprocessing progress hook"""
        self._postprocessor_hooks.append(ph)
        for pps in self._pps.values():
            for pp in pps:
                pp.add_progress_hook(ph)

    def _bidi_workaround(self, message):
        if not hasattr(self, '_output_channel'):
            return message

        assert hasattr(self, '_output_process')
        assert isinstance(message, compat_str)
        line_count = message.count('\n') + 1
        self._output_process.stdin.write((message + '\n').encode())
        self._output_process.stdin.flush()
        res = ''.join(self._output_channel.readline().decode()
                      for _ in range(line_count))
        return res[:-len('\n')]

    def _write_string(self, message, out=None, only_once=False):
        if only_once:
            if message in self._printed_messages:
                return
            self._printed_messages.add(message)
        write_string(message, out=out, encoding=self.params.get('encoding'))

    def to_stdout(self, message, skip_eol=False, quiet=None):
        """Print message to stdout"""
        if quiet is not None:
            self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument quiet. Use "YoutubeDL.to_screen" instead')
        if skip_eol is not False:
            self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument skip_eol. Use "YoutubeDL.to_screen" instead')
        self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.out)

    def to_screen(self, message, skip_eol=False, quiet=None):
        """Print message to screen if not in quiet mode"""
        if self.params.get('logger'):
            self.params['logger'].debug(message)
            return
        if (self.params.get('quiet') if quiet is None else quiet) and not self.params.get('verbose'):
            return
        self._write_string(
            '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')),
            self._out_files.screen)

    def to_stderr(self, message, only_once=False):
        """Print message to stderr"""
        assert isinstance(message, compat_str)
        if self.params.get('logger'):
            self.params['logger'].error(message)
        else:
            self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.error, only_once=only_once)

    def _send_console_code(self, code):
        if compat_os_name == 'nt' or not self._out_files.console:
            return
        self._write_string(code, self._out_files.console)

    def to_console_title(self, message):
        if not self.params.get('consoletitle', False):
            return
        message = remove_terminal_sequences(message)
        if compat_os_name == 'nt':
            if ctypes.windll.kernel32.GetConsoleWindow():
                # c_wchar_p() might not be necessary if `message` is
                # already of type unicode()
                ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        else:
            self._send_console_code(f'\033]0;{message}\007')

    def save_console_title(self):
        if not self.params.get('consoletitle') or self.params.get('simulate'):
            return
        self._send_console_code('\033[22;0t')  # Save the title on stack

    def restore_console_title(self):
        if not self.params.get('consoletitle') or self.params.get('simulate'):
            return
        self._send_console_code('\033[23;0t')  # Restore the title from stack

    def __enter__(self):
        self.save_console_title()
        return self

    def __exit__(self, *args):
        self.restore_console_title()

        if self.params.get('cookiefile') is not None:
            self.cookiejar.save(ignore_discard=True, ignore_expires=True)

    def trouble(self, message=None, tb=None, is_error=True):
        """Determine action to take when a download problem appears.

        Depending on whether the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        @param tb          If given, is additional traceback information
        @param is_error    Whether to raise error according to ignoreerrors
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            if tb:
                self.to_stderr(tb)
        if not is_error:
            return
        if not self.params.get('ignoreerrors'):
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        self._download_retcode = 1

    Styles = Namespace(
        HEADERS='yellow',
        EMPHASIS='light blue',
        FILENAME='green',
        ID='green',
        DELIM='blue',
        ERROR='red',
        WARNING='yellow',
        SUPPRESS='light black',
    )

    def _format_text(self, handle, allow_colors, text, f, fallback=None, *, test_encoding=False):
        text = str(text)
        if test_encoding:
            original_text = text
            # handle.encoding can be None. See https://github.com/yt-dlp/yt-dlp/issues/2711
            encoding = self.params.get('encoding') or getattr(handle, 'encoding', None) or 'ascii'
            text = text.encode(encoding, 'ignore').decode(encoding)
            if fallback is not None and text != original_text:
                text = fallback
        return format_text(text, f) if allow_colors else text if fallback is None else fallback

    def _format_out(self, *args, **kwargs):
        return self._format_text(self._out_files.out, self._allow_colors.out, *args, **kwargs)

    def _format_screen(self, *args, **kwargs):
        return self._format_text(self._out_files.screen, self._allow_colors.screen, *args, **kwargs)

    def _format_err(self, *args, **kwargs):
        return self._format_text(self._out_files.error, self._allow_colors.error, *args, **kwargs)

    def report_warning(self, message, only_once=False):
        '''
        Print the message to stderr; it will be prefixed with 'WARNING:'.
        If stderr is a tty file, the 'WARNING:' will be colored.
        '''
        if self.params.get('logger') is not None:
            self.params['logger'].warning(message)
        else:
            if self.params.get('no_warnings'):
                return
            self.to_stderr(f'{self._format_err("WARNING:", self.Styles.WARNING)} {message}', only_once)

    def deprecation_warning(self, message):
        if self.params.get('logger') is not None:
            self.params['logger'].warning(f'DeprecationWarning: {message}')
        else:
            self.to_stderr(f'{self._format_err("DeprecationWarning:", self.Styles.ERROR)} {message}', True)

    def report_error(self, message, *args, **kwargs):
        '''
        Do the same as trouble, but prefixes the message with 'ERROR:', colored
        in red if stderr is a tty file.
        '''
        self.trouble(f'{self._format_err("ERROR:", self.Styles.ERROR)} {message}', *args, **kwargs)

    def write_debug(self, message, only_once=False):
        '''Log debug message or print message to stderr'''
        if not self.params.get('verbose', False):
            return
        message = f'[debug] {message}'
        if self.params.get('logger'):
            self.params['logger'].debug(message)
        else:
            self.to_stderr(message, only_once)

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen('[download] %s has already been downloaded' % file_name)
        except UnicodeEncodeError:
            self.to_screen('[download] The file has already been downloaded')

    def report_file_delete(self, file_name):
        """Report that existing file will be deleted."""
        try:
            self.to_screen('Deleting existing file %s' % file_name)
        except UnicodeEncodeError:
            self.to_screen('Deleting existing file')

    def raise_no_formats(self, info, forced=False, *, msg=None):
        has_drm = info.get('_has_drm')
        ignored, expected = self.params.get('ignore_no_formats_error'), bool(msg)
        msg = msg or has_drm and 'This video is DRM protected' or 'No video formats found!'
        if forced or not ignored:
            raise ExtractorError(msg, video_id=info['id'], ie=info['extractor'],
                                 expected=has_drm or ignored or expected)
        else:
            self.report_warning(msg)

    def parse_outtmpl(self):
        self.deprecation_warning('"YoutubeDL.parse_outtmpl" is deprecated and may be removed in a future version')
        self._parse_outtmpl()
        return self.params['outtmpl']

    def _parse_outtmpl(self):
        sanitize = lambda x: x
        if self.params.get('restrictfilenames'):  # Remove spaces in the default template
            sanitize = lambda x: x.replace(' - ', ' ').replace(' ', '-')

        outtmpl = self.params.setdefault('outtmpl', {})
        if not isinstance(outtmpl, dict):
            self.params['outtmpl'] = outtmpl = {'default': outtmpl}
        outtmpl.update({k: sanitize(v) for k, v in DEFAULT_OUTTMPL.items() if outtmpl.get(k) is None})

    def get_output_path(self, dir_type='', filename=None):
        paths = self.params.get('paths', {})
        assert isinstance(paths, dict)
        path = os.path.join(
            expand_path(paths.get('home', '').strip()),
            expand_path(paths.get(dir_type, '').strip()) if dir_type else '',
            filename or '')
        return sanitize_path(path, force=self.params.get('windowsfilenames'))

    @staticmethod
    def _outtmpl_expandpath(outtmpl):
        # expand_path translates '%%' into '%' and '$$' into '$'
        # correspondingly that is not what we want since we need to keep
        # '%%' intact for template dict substitution step. Working around
        # with boundary-alike separator hack.
        sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
        outtmpl = outtmpl.replace('%%', f'%{sep}%').replace('$$', f'${sep}$')

        # outtmpl should be expand_path'ed before template dict substitution
        # because meta fields may contain env variables we don't want to
        # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
        # title "Hello $PATH", we don't want `$PATH` to be expanded.
        return expand_path(outtmpl).replace(sep, '')

    @staticmethod
    def escape_outtmpl(outtmpl):
        ''' Escape any remaining strings like %s, %abc% etc. '''
        return re.sub(
            STR_FORMAT_RE_TMPL.format('', '(?![%(\0])'),
            lambda mobj: ('' if mobj.group('has_key') else '%') + mobj.group(0),
            outtmpl)

    @classmethod
    def validate_outtmpl(cls, outtmpl):
        ''' @return None or Exception object '''
        outtmpl = re.sub(
            STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBUDS]'),
            lambda mobj: f'{mobj.group(0)[:-1]}s',
            cls._outtmpl_expandpath(outtmpl))
        try:
            cls.escape_outtmpl(outtmpl) % collections.defaultdict(int)
            return None
        except ValueError as err:
            return err

    @staticmethod
    def _copy_infodict(info_dict):
        info_dict = dict(info_dict)
        info_dict.pop('__postprocessors', None)
        info_dict.pop('__pending_error', None)
        return info_dict

    def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False):
        """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict
        @param sanitize    Whether to sanitize the output as a filename.
                           For backward compatibility, a function can also be passed
        """

        info_dict.setdefault('epoch', int(time.time()))  # keep epoch consistent once set

        info_dict = self._copy_infodict(info_dict)
        info_dict['duration_string'] = (  # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
            formatSeconds(info_dict['duration'], '-' if sanitize else ':')
            if info_dict.get('duration', None) is not None
            else None)
        info_dict['autonumber'] = int(self.params.get('autonumber_start', 1) - 1 + self._num_downloads)
        info_dict['video_autonumber'] = self._num_videos
        if info_dict.get('resolution') is None:
            info_dict['resolution'] = self.format_resolution(info_dict, default=None)

        # For fields playlist_index, playlist_autonumber and autonumber convert all occurrences
        # of %(field)s to %(field)0Nd for backward compatibility
        field_size_compat_map = {
            'playlist_index': number_of_digits(info_dict.get('__last_playlist_index') or 0),
            'playlist_autonumber': number_of_digits(info_dict.get('n_entries') or 0),
            'autonumber': self.params.get('autonumber_size') or 5,
        }

        TMPL_DICT = {}
        EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBUDS]'))
        MATH_FUNCTIONS = {
            '+': float.__add__,
            '-': float.__sub__,
        }
        # Field is of the form key1.key2...
        # where keys (except first) can be string, int or slice
        FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)')
        MATH_FIELD_RE = rf'(?:{FIELD_RE}|-?{NUMBER_RE})'
        MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys()))
        INTERNAL_FORMAT_RE = re.compile(rf'''(?x)
            (?P<negate>-)?
            (?P<fields>{FIELD_RE})
            (?P<maths>(?:{MATH_OPERATORS_RE}{MATH_FIELD_RE})*)
            (?:>(?P<strf_format>.+?))?
            (?P<remaining>
                (?P<alternate>(?<!\\),[^|&)]+)?
                (?:&(?P<replacement>.*?))?
                (?:\|(?P<default>.*?))?
            )$''')
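
        # Illustrative parses of the mini-language above (assumed examples, not exhaustive):
        # '%(duration>%H-%M-%S)s'          -> fields='duration', strf_format='%H-%M-%S'
        # '%(playlist_index|0)s'           -> fields='playlist_index', default='0'
        # '%(chapters&has chapters|none)s' -> replacement='has chapters', default='none'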

        def _traverse_infodict(k):
            k = k.split('.')
            if k[0] == '':
                k.pop(0)
            return traverse_obj(info_dict, k, is_user_input=True, traverse_string=True)

        def get_value(mdict):
            # Object traversal
            value = _traverse_infodict(mdict['fields'])
            # Negative
            if mdict['negate']:
                value = float_or_none(value)
                if value is not None:
                    value *= -1
            # Do maths
            offset_key = mdict['maths']
            if offset_key:
                value = float_or_none(value)
                operator = None
                while offset_key:
                    item = re.match(
                        MATH_FIELD_RE if operator else MATH_OPERATORS_RE,
                        offset_key).group(0)
                    offset_key = offset_key[len(item):]
                    if operator is None:
                        operator = MATH_FUNCTIONS[item]
                        continue
                    item, multiplier = (item[1:], -1) if item[0] == '-' else (item, 1)
                    offset = float_or_none(item)
                    if offset is None:
                        offset = float_or_none(_traverse_infodict(item))
                    try:
                        value = operator(value, multiplier * offset)
                    except (TypeError, ZeroDivisionError):
                        return None
                    operator = None
            # Datetime formatting
            if mdict['strf_format']:
                value = strftime_or_none(value, mdict['strf_format'].replace('\\,', ','))

            return value

        na = self.params.get('outtmpl_na_placeholder', 'NA')

        def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')):
            return sanitize_filename(str(value), restricted=restricted, is_id=(
                bool(re.search(r'(^|[_.])id(\.|$)', key))
                if 'filename-sanitization' in self.params['compat_opts']
                else NO_DEFAULT))

        sanitizer = sanitize if callable(sanitize) else filename_sanitizer
        sanitize = bool(sanitize)

        def _dumpjson_default(obj):
            if isinstance(obj, (set, LazyList)):
                return list(obj)
            return repr(obj)

        def create_key(outer_mobj):
            if not outer_mobj.group('has_key'):
                return outer_mobj.group(0)
            key = outer_mobj.group('key')
            mobj = re.match(INTERNAL_FORMAT_RE, key)
            initial_field = mobj.group('fields') if mobj else ''
            value, replacement, default = None, None, na
            while mobj:
                mobj = mobj.groupdict()
                default = mobj['default'] if mobj['default'] is not None else default
                value = get_value(mobj)
                replacement = mobj['replacement']
                if value is None and mobj['alternate']:
                    mobj = re.match(INTERNAL_FORMAT_RE, mobj['remaining'][1:])
                else:
                    break

            fmt = outer_mobj.group('format')
            if fmt == 's' and value is not None and key in field_size_compat_map.keys():
                fmt = f'0{field_size_compat_map[key]:d}d'

            value = default if value is None else value if replacement is None else replacement

            flags = outer_mobj.group('conversion') or ''
            str_fmt = f'{fmt[:-1]}s'
            if fmt[-1] == 'l':  # list
                delim = '\n' if '#' in flags else ', '
                value, fmt = delim.join(map(str, variadic(value, allowed_types=(str, bytes)))), str_fmt
            elif fmt[-1] == 'j':  # json
                value, fmt = json.dumps(value, default=_dumpjson_default, indent=4 if '#' in flags else None), str_fmt
            elif fmt[-1] == 'q':  # quoted
                value = map(str, variadic(value) if '#' in flags else [value])
                value, fmt = ' '.join(map(compat_shlex_quote, value)), str_fmt
            elif fmt[-1] == 'B':  # bytes
                value = f'%{str_fmt}'.encode() % str(value).encode()
                value, fmt = value.decode('utf-8', 'ignore'), 's'
            elif fmt[-1] == 'U':  # unicode normalized
                value, fmt = unicodedata.normalize(
                    # "+" = compatibility equivalence, "#" = NFD
                    'NF%s%s' % ('K' if '+' in flags else '', 'D' if '#' in flags else 'C'),
                    value), str_fmt
            elif fmt[-1] == 'D':  # decimal suffix
                num_fmt, fmt = fmt[:-1].replace('#', ''), 's'
                value = format_decimal_suffix(value, f'%{num_fmt}f%s' if num_fmt else '%d%s',
                                              factor=1024 if '#' in flags else 1000)
            elif fmt[-1] == 'S':  # filename sanitization
                value, fmt = filename_sanitizer(initial_field, value, restricted='#' in flags), str_fmt
            elif fmt[-1] == 'c':
                if value:
                    value = str(value)[0]
                else:
                    fmt = str_fmt
            elif fmt[-1] not in 'rs':  # numeric
                value = float_or_none(value)
                if value is None:
                    value, fmt = default, 's'

            if sanitize:
                if fmt[-1] == 'r':
                    # If value is an object, sanitize might convert it to a string
                    # So we convert it to repr first
                    value, fmt = repr(value), str_fmt
                if fmt[-1] in 'csr':
                    value = sanitizer(initial_field, value)

            key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format'))
            TMPL_DICT[key] = value
            return '{prefix}%({key}){fmt}'.format(key=key, fmt=fmt, prefix=outer_mobj.group('prefix'))

        return EXTERNAL_FORMAT_RE.sub(create_key, outtmpl), TMPL_DICT

    def evaluate_outtmpl(self, outtmpl, info_dict, *args, **kwargs):
        outtmpl, info_dict = self.prepare_outtmpl(outtmpl, info_dict, *args, **kwargs)
        return self.escape_outtmpl(outtmpl) % info_dict

    def _prepare_filename(self, info_dict, *, outtmpl=None, tmpl_type=None):
        assert None in (outtmpl, tmpl_type), 'outtmpl and tmpl_type are mutually exclusive'
        if outtmpl is None:
            outtmpl = self.params['outtmpl'].get(tmpl_type or 'default', self.params['outtmpl']['default'])
        try:
            outtmpl = self._outtmpl_expandpath(outtmpl)
            filename = self.evaluate_outtmpl(outtmpl, info_dict, True)
            if not filename:
                return None

            if tmpl_type in ('', 'temp'):
                final_ext, ext = self.params.get('final_ext'), info_dict.get('ext')
                if final_ext and ext and final_ext != ext and filename.endswith(f'.{final_ext}'):
                    filename = replace_extension(filename, ext, final_ext)
            elif tmpl_type:
                force_ext = OUTTMPL_TYPES[tmpl_type]
                if force_ext:
                    filename = replace_extension(filename, force_ext, info_dict.get('ext'))

            # https://github.com/blackjack4494/youtube-dlc/issues/85
            trim_file_name = self.params.get('trim_file_name', False)
            if trim_file_name:
                no_ext, *ext = filename.rsplit('.', 2)
                filename = join_nonempty(no_ext[:trim_file_name], *ext, delim='.')

            return filename
        except ValueError as err:
            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
            return None

    def prepare_filename(self, info_dict, dir_type='', *, outtmpl=None, warn=False):
        """Generate the output filename"""
        if outtmpl:
            assert not dir_type, 'outtmpl and dir_type are mutually exclusive'
            dir_type = None
        filename = self._prepare_filename(info_dict, tmpl_type=dir_type, outtmpl=outtmpl)
        if not filename and dir_type not in ('', 'temp'):
            return ''

        if warn:
            if not self.params.get('paths'):
                pass
            elif filename == '-':
                self.report_warning('--paths is ignored when outputting to stdout', only_once=True)
1290 elif os.path.isabs(filename):
1291 self.report_warning('--paths is ignored since an absolute path is given in output template', only_once=True)
1292 if filename == '-' or not filename:
1293 return filename
1294
1295 return self.get_output_path(dir_type, filename)
1296
1297 def _match_entry(self, info_dict, incomplete=False, silent=False):
1298 """ Returns None if the file should be downloaded """
1299
1300 video_title = info_dict.get('title', info_dict.get('id', 'video'))
1301
1302 def check_filter():
1303 if 'title' in info_dict:
1304 # This can happen when we're just evaluating the playlist
1305 title = info_dict['title']
1306 matchtitle = self.params.get('matchtitle', False)
1307 if matchtitle:
1308 if not re.search(matchtitle, title, re.IGNORECASE):
1309 return '"' + title + '" title did not match pattern "' + matchtitle + '"'
1310 rejecttitle = self.params.get('rejecttitle', False)
1311 if rejecttitle:
1312 if re.search(rejecttitle, title, re.IGNORECASE):
1313 return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
1314 date = info_dict.get('upload_date')
1315 if date is not None:
1316 dateRange = self.params.get('daterange', DateRange())
1317 if date not in dateRange:
1318 return f'{date_from_str(date).isoformat()} upload date is not in range {dateRange}'
1319 view_count = info_dict.get('view_count')
1320 if view_count is not None:
1321 min_views = self.params.get('min_views')
1322 if min_views is not None and view_count < min_views:
1323 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
1324 max_views = self.params.get('max_views')
1325 if max_views is not None and view_count > max_views:
1326 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
1327 if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
1328 return 'Skipping "%s" because it is age restricted' % video_title
1329
1330 match_filter = self.params.get('match_filter')
1331 if match_filter is not None:
1332 try:
1333 ret = match_filter(info_dict, incomplete=incomplete)
1334 except TypeError:
1335 # For backward compatibility
1336 ret = None if incomplete else match_filter(info_dict)
1337 if ret is NO_DEFAULT:
1338 while True:
1339 filename = self._format_screen(self.prepare_filename(info_dict), self.Styles.FILENAME)
1340 reply = input(self._format_screen(
1341 f'Download "{filename}"? (Y/n): ', self.Styles.EMPHASIS)).lower().strip()
1342 if reply in {'y', ''}:
1343 return None
1344 elif reply == 'n':
1345 return f'Skipping {video_title}'
1346 elif ret is not None:
1347 return ret
1348 return None
1349
1350 if self.in_download_archive(info_dict):
1351 reason = '%s has already been recorded in the archive' % video_title
1352 break_opt, break_err = 'break_on_existing', ExistingVideoReached
1353 else:
1354 reason = check_filter()
1355 break_opt, break_err = 'break_on_reject', RejectedVideoReached
1356 if reason is not None:
1357 if not silent:
1358 self.to_screen('[download] ' + reason)
1359 if self.params.get(break_opt, False):
1360 raise break_err()
1361 return reason
1362
1363 @staticmethod
1364 def add_extra_info(info_dict, extra_info):
1365 '''Set the keys from extra_info in info dict if they are missing'''
1366 for key, value in extra_info.items():
1367 info_dict.setdefault(key, value)
1368
1369 def extract_info(self, url, download=True, ie_key=None, extra_info=None,
1370 process=True, force_generic_extractor=False):
1371 """
1372 Return a list with a dictionary for each video extracted.
1373
1374 Arguments:
1375 url -- URL to extract
1376
1377 Keyword arguments:
1378 download -- whether to download videos during extraction
1379 ie_key -- extractor key hint
1380 extra_info -- dictionary containing the extra values to add to each result
1381 process -- whether to resolve all unresolved references (URLs, playlist items),
1382 must be True for download to work.
1383 force_generic_extractor -- force using the generic extractor
1384 """
1385
1386 if extra_info is None:
1387 extra_info = {}
1388
1389 if not ie_key and force_generic_extractor:
1390 ie_key = 'Generic'
1391
1392 if ie_key:
1393 ies = {ie_key: self._get_info_extractor_class(ie_key)}
1394 else:
1395 ies = self._ies
1396
1397 for ie_key, ie in ies.items():
1398 if not ie.suitable(url):
1399 continue
1400
1401 if not ie.working():
1402 self.report_warning('Support for this site has been marked as broken, '
1403 'and will probably not work.')
1404
1405 temp_id = ie.get_temp_id(url)
1406 if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
1407 self.to_screen(f'[{ie_key}] {temp_id}: has already been recorded in the archive')
1408 if self.params.get('break_on_existing', False):
1409 raise ExistingVideoReached()
1410 break
1411 return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process)
1412 else:
1413 self.report_error('no suitable InfoExtractor for URL %s' % url)
1414
1415 def _handle_extraction_exceptions(func):
1416 @functools.wraps(func)
1417 def wrapper(self, *args, **kwargs):
1418 while True:
1419 try:
1420 return func(self, *args, **kwargs)
1421 except (DownloadCancelled, LazyList.IndexError, PagedList.IndexError):
1422 raise
1423 except ReExtractInfo as e:
1424 if e.expected:
1425 self.to_screen(f'{e}; Re-extracting data')
1426 else:
1427 self.to_stderr('\r')
1428 self.report_warning(f'{e}; Re-extracting data')
1429 continue
1430 except GeoRestrictedError as e:
1431 msg = e.msg
1432 if e.countries:
1433 msg += '\nThis video is available in %s.' % ', '.join(
1434 map(ISO3166Utils.short2full, e.countries))
1435 msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to work around this.'
1436 self.report_error(msg)
1437 except ExtractorError as e: # An error we somewhat expected
1438 self.report_error(str(e), e.format_traceback())
1439 except Exception as e:
1440 if self.params.get('ignoreerrors'):
1441 self.report_error(str(e), tb=encode_compat_str(traceback.format_exc()))
1442 else:
1443 raise
1444 break
1445 return wrapper
1446
1447 def _wait_for_video(self, ie_result):
1448 if (not self.params.get('wait_for_video')
1449 or ie_result.get('_type', 'video') != 'video'
1450 or ie_result.get('formats') or ie_result.get('url')):
1451 return
1452
1453 format_dur = lambda dur: '%02d:%02d:%02d' % timetuple_from_msec(dur * 1000)[:-1]
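# e.g. format_dur(3723) == '01:02:03' - timetuple_from_msec yields
# (hours, minutes, seconds, milliseconds) and the last field is dropped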
1454 last_msg = ''
1455
1456 def progress(msg):
1457 nonlocal last_msg
1458 self.to_screen(msg + ' ' * (len(last_msg) - len(msg)) + '\r', skip_eol=True)
1459 last_msg = msg
1460
1461 min_wait, max_wait = self.params.get('wait_for_video')
1462 diff = try_get(ie_result, lambda x: x['release_timestamp'] - time.time())
1463 if diff is None and ie_result.get('live_status') == 'is_upcoming':
1464 diff = round(random.uniform(min_wait, max_wait) if (max_wait and min_wait) else (max_wait or min_wait), 0)
1465 self.report_warning('Release time of video is not known')
1466 elif (diff or 0) <= 0:
1467 self.report_warning('Video should already be available according to extracted info')
1468 diff = min(max(diff or 0, min_wait or 0), max_wait or float('inf'))
1469 self.to_screen(f'[wait] Waiting for {format_dur(diff)} - Press Ctrl+C to try now')
1470
1471 wait_till = time.time() + diff
1472 try:
1473 while True:
1474 diff = wait_till - time.time()
1475 if diff <= 0:
1476 progress('')
1477 raise ReExtractInfo('[wait] Wait period ended', expected=True)
1478 progress(f'[wait] Remaining time until next attempt: {self._format_screen(format_dur(diff), self.Styles.EMPHASIS)}')
1479 time.sleep(1)
1480 except KeyboardInterrupt:
1481 progress('')
1482 raise ReExtractInfo('[wait] Interrupted by user', expected=True)
1483 except BaseException as e:
1484 if not isinstance(e, ReExtractInfo):
1485 self.to_screen('')
1486 raise
1487
1488 @_handle_extraction_exceptions
1489 def __extract_info(self, url, ie, download, extra_info, process):
1490 ie_result = ie.extract(url)
1491 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
1492 return
1493 if isinstance(ie_result, list):
1494 # Backwards compatibility: old IE result format
1495 ie_result = {
1496 '_type': 'compat_list',
1497 'entries': ie_result,
1498 }
1499 if extra_info.get('original_url'):
1500 ie_result.setdefault('original_url', extra_info['original_url'])
1501 self.add_default_extra_info(ie_result, ie, url)
1502 if process:
1503 self._wait_for_video(ie_result)
1504 return self.process_ie_result(ie_result, download, extra_info)
1505 else:
1506 return ie_result
1507
1508 def add_default_extra_info(self, ie_result, ie, url):
1509 if url is not None:
1510 self.add_extra_info(ie_result, {
1511 'webpage_url': url,
1512 'original_url': url,
1513 })
1514 webpage_url = ie_result.get('webpage_url')
1515 if webpage_url:
1516 self.add_extra_info(ie_result, {
1517 'webpage_url_basename': url_basename(webpage_url),
1518 'webpage_url_domain': get_domain(webpage_url),
1519 })
1520 if ie is not None:
1521 self.add_extra_info(ie_result, {
1522 'extractor': ie.IE_NAME,
1523 'extractor_key': ie.ie_key(),
1524 })
1525
1526 def process_ie_result(self, ie_result, download=True, extra_info=None):
1527 """
1528 Take the result of the ie (may be modified) and resolve all unresolved
1529 references (URLs, playlist items).
1530
1531 It will also download the videos if 'download' is true.
1532 Returns the resolved ie_result.
1533 """
1534 if extra_info is None:
1535 extra_info = {}
1536 result_type = ie_result.get('_type', 'video')
1537
1538 if result_type in ('url', 'url_transparent'):
1539 ie_result['url'] = sanitize_url(ie_result['url'])
1540 if ie_result.get('original_url'):
1541 extra_info.setdefault('original_url', ie_result['original_url'])
1542
1543 extract_flat = self.params.get('extract_flat', False)
1544 if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
1545 or extract_flat is True):
1546 info_copy = ie_result.copy()
1547 ie = try_get(ie_result.get('ie_key'), self.get_info_extractor)
1548 if ie and not ie_result.get('id'):
1549 info_copy['id'] = ie.get_temp_id(ie_result['url'])
1550 self.add_default_extra_info(info_copy, ie, ie_result['url'])
1551 self.add_extra_info(info_copy, extra_info)
1552 info_copy, _ = self.pre_process(info_copy)
1553 self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True)
1554 self._raise_pending_errors(info_copy)
1555 if self.params.get('force_write_download_archive', False):
1556 self.record_download_archive(info_copy)
1557 return ie_result
1558
1559 if result_type == 'video':
1560 self.add_extra_info(ie_result, extra_info)
1561 ie_result = self.process_video_result(ie_result, download=download)
1562 self._raise_pending_errors(ie_result)
1563 additional_urls = (ie_result or {}).get('additional_urls')
1564 if additional_urls:
1565 # TODO: Improve MetadataParserPP to allow setting a list
1566 if isinstance(additional_urls, compat_str):
1567 additional_urls = [additional_urls]
1568 self.to_screen(
1569 '[info] %s: %d additional URL(s) requested' % (ie_result['id'], len(additional_urls)))
1570 self.write_debug('Additional URLs: "%s"' % '", "'.join(additional_urls))
1571 ie_result['additional_entries'] = [
1572 self.extract_info(
1573 url, download, extra_info=extra_info,
1574 force_generic_extractor=self.params.get('force_generic_extractor'))
1575 for url in additional_urls
1576 ]
1577 return ie_result
1578 elif result_type == 'url':
1579 # We have to add extra_info to the results because it may be
1580 # contained in a playlist
1581 return self.extract_info(
1582 ie_result['url'], download,
1583 ie_key=ie_result.get('ie_key'),
1584 extra_info=extra_info)
1585 elif result_type == 'url_transparent':
1586 # Use the information from the embedding page
1587 info = self.extract_info(
1588 ie_result['url'], ie_key=ie_result.get('ie_key'),
1589 extra_info=extra_info, download=False, process=False)
1590
1591 # extract_info may return None when ignoreerrors is enabled and
1592 # extraction failed with an error, don't crash and return early
1593 # in this case
1594 if not info:
1595 return info
1596
1597 new_result = info.copy()
1598 new_result.update(filter_dict(ie_result, lambda k, v: (
1599 v is not None and k not in {'_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'})))
1600
1601 # Extracted info may not be a video result (i.e.
1602 # info.get('_type', 'video') != 'video') but rather a url or
1603 # url_transparent. In such cases, outer metadata (from ie_result)
1604 # should be propagated to the inner one (info). For this to happen,
1605 # the _type of info should be overridden with url_transparent. This
1606 # fixes the issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
1607 if new_result.get('_type') == 'url':
1608 new_result['_type'] = 'url_transparent'
1609
1610 return self.process_ie_result(
1611 new_result, download=download, extra_info=extra_info)
1612 elif result_type in ('playlist', 'multi_video'):
1613 # Protect from infinite recursion due to recursively nested playlists
1614 # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
1615 webpage_url = ie_result['webpage_url']
1616 if webpage_url in self._playlist_urls:
1617 self.to_screen(
1618 '[download] Skipping already downloaded playlist: %s'
1619 % (ie_result.get('title') or ie_result.get('id')))
1620 return
1621
1622 self._playlist_level += 1
1623 self._playlist_urls.add(webpage_url)
1624 self._fill_common_fields(ie_result, False)
1625 self._sanitize_thumbnails(ie_result)
1626 try:
1627 return self.__process_playlist(ie_result, download)
1628 finally:
1629 self._playlist_level -= 1
1630 if not self._playlist_level:
1631 self._playlist_urls.clear()
1632 elif result_type == 'compat_list':
1633 self.report_warning(
1634 'Extractor %s returned a compat_list result. '
1635 'It needs to be updated.' % ie_result.get('extractor'))
1636
1637 def _fixup(r):
1638 self.add_extra_info(r, {
1639 'extractor': ie_result['extractor'],
1640 'webpage_url': ie_result['webpage_url'],
1641 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1642 'webpage_url_domain': get_domain(ie_result['webpage_url']),
1643 'extractor_key': ie_result['extractor_key'],
1644 })
1645 return r
1646 ie_result['entries'] = [
1647 self.process_ie_result(_fixup(r), download, extra_info)
1648 for r in ie_result['entries']
1649 ]
1650 return ie_result
1651 else:
1652 raise Exception('Invalid result type: %s' % result_type)
1653
1654 def _ensure_dir_exists(self, path):
1655 return make_dir(path, self.report_error)
1656
1657 @staticmethod
1658 def _playlist_infodict(ie_result, **kwargs):
1659 return {
1660 **ie_result,
1661 'playlist': ie_result.get('title') or ie_result.get('id'),
1662 'playlist_id': ie_result.get('id'),
1663 'playlist_title': ie_result.get('title'),
1664 'playlist_uploader': ie_result.get('uploader'),
1665 'playlist_uploader_id': ie_result.get('uploader_id'),
1666 'playlist_index': 0,
1667 **kwargs,
1668 }
1669
1670 def __process_playlist(self, ie_result, download):
1671 """Process each entry in the playlist"""
1672 title = ie_result.get('title') or ie_result.get('id') or '<Untitled>'
1673 self.to_screen(f'[download] Downloading playlist: {title}')
1674
1675 all_entries = PlaylistEntries(self, ie_result)
1676 entries = orderedSet(all_entries.get_requested_items(), lazy=True)
1677
1678 lazy = self.params.get('lazy_playlist')
1679 if lazy:
1680 resolved_entries, n_entries = [], 'N/A'
1681 ie_result['requested_entries'], ie_result['entries'] = None, None
1682 else:
1683 entries = resolved_entries = list(entries)
1684 n_entries = len(resolved_entries)
1685 ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries)) or ([], [])
1686 if not ie_result.get('playlist_count'):
1687 # Better to do this after potentially exhausting entries
1688 ie_result['playlist_count'] = all_entries.get_full_count()
1689
1690 _infojson_written = False
1691 write_playlist_files = self.params.get('allow_playlist_files', True)
1692 if write_playlist_files and self.params.get('list_thumbnails'):
1693 self.list_thumbnails(ie_result)
1694 if write_playlist_files and not self.params.get('simulate'):
1695 ie_copy = self._playlist_infodict(ie_result, n_entries=int_or_none(n_entries))
1696 _infojson_written = self._write_info_json(
1697 'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson'))
1698 if _infojson_written is None:
1699 return
1700 if self._write_description('playlist', ie_result,
1701 self.prepare_filename(ie_copy, 'pl_description')) is None:
1702 return
1703 # TODO: This should be passed to ThumbnailsConvertor if necessary
1704 self._write_thumbnails('playlist', ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail'))
1705
1706 if lazy:
1707 if self.params.get('playlistreverse') or self.params.get('playlistrandom'):
1708 self.report_warning('playlistreverse and playlistrandom are not supported with lazy_playlist', only_once=True)
1709 elif self.params.get('playlistreverse'):
1710 entries.reverse()
1711 elif self.params.get('playlistrandom'):
1712 random.shuffle(entries)
1713
1714 self.to_screen(f'[{ie_result["extractor"]}] Playlist {title}: Downloading {n_entries} videos'
1715 f'{format_field(ie_result, "playlist_count", " of %s")}')
1716
1717 failures = 0
1718 max_failures = self.params.get('skip_playlist_after_errors') or float('inf')
1719 for i, (playlist_index, entry) in enumerate(entries):
1720 if lazy:
1721 resolved_entries.append((playlist_index, entry))
1722
1723 # TODO: Add auto-generated fields
1724 if self._match_entry(entry, incomplete=True) is not None:
1725 continue
1726
1727 self.to_screen('[download] Downloading video %s of %s' % (
1728 self._format_screen(i + 1, self.Styles.ID), self._format_screen(n_entries, self.Styles.EMPHASIS)))
1729
1730 entry['__x_forwarded_for_ip'] = ie_result.get('__x_forwarded_for_ip')
1731 if not lazy and 'playlist-index' in self.params.get('compat_opts', []):
1732 playlist_index = ie_result['requested_entries'][i]
1733
1734 entry_result = self.__process_iterable_entry(entry, download, {
1735 'n_entries': int_or_none(n_entries),
1736 '__last_playlist_index': max(ie_result['requested_entries'] or (0, 0)),
1737 'playlist_count': ie_result.get('playlist_count'),
1738 'playlist_index': playlist_index,
1739 'playlist_autonumber': i + 1,
1740 'playlist': title,
1741 'playlist_id': ie_result.get('id'),
1742 'playlist_title': ie_result.get('title'),
1743 'playlist_uploader': ie_result.get('uploader'),
1744 'playlist_uploader_id': ie_result.get('uploader_id'),
1745 'extractor': ie_result['extractor'],
1746 'webpage_url': ie_result['webpage_url'],
1747 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1748 'webpage_url_domain': get_domain(ie_result['webpage_url']),
1749 'extractor_key': ie_result['extractor_key'],
1750 })
1751 if not entry_result:
1752 failures += 1
1753 if failures >= max_failures:
1754 self.report_error(
1755 f'Skipping the remaining entries in playlist "{title}" since {failures} items failed extraction')
1756 break
1757 resolved_entries[i] = (playlist_index, entry_result)
1758
1759 # Update with processed data
1760 ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries)) or ([], [])
1761
1762 # Write the updated info to json
1763 if _infojson_written is True and self._write_info_json(
1764 'updated playlist', ie_result,
1765 self.prepare_filename(ie_copy, 'pl_infojson'), overwrite=True) is None:
1766 return
1767
1768 ie_result = self.run_all_pps('playlist', ie_result)
1769 self.to_screen(f'[download] Finished downloading playlist: {title}')
1770 return ie_result
1771
1772 @_handle_extraction_exceptions
1773 def __process_iterable_entry(self, entry, download, extra_info):
1774 return self.process_ie_result(
1775 entry, download=download, extra_info=extra_info)
1776
1777 def _build_format_filter(self, filter_spec):
1778 " Returns a function to filter the formats according to the filter_spec "
1779
1780 OPERATORS = {
1781 '<': operator.lt,
1782 '<=': operator.le,
1783 '>': operator.gt,
1784 '>=': operator.ge,
1785 '=': operator.eq,
1786 '!=': operator.ne,
1787 }
1788 operator_rex = re.compile(r'''(?x)\s*
1789 (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)\s*
1790 (?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1791 (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s*
1792 ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1793 m = operator_rex.fullmatch(filter_spec)
1794 if m:
1795 try:
1796 comparison_value = int(m.group('value'))
1797 except ValueError:
1798 comparison_value = parse_filesize(m.group('value'))
1799 if comparison_value is None:
1800 comparison_value = parse_filesize(m.group('value') + 'B')
1801 if comparison_value is None:
1802 raise ValueError(
1803 'Invalid value %r in format specification %r' % (
1804 m.group('value'), filter_spec))
1805 op = OPERATORS[m.group('op')]
1806
1807 if not m:
1808 STR_OPERATORS = {
1809 '=': operator.eq,
1810 '^=': lambda attr, value: attr.startswith(value),
1811 '$=': lambda attr, value: attr.endswith(value),
1812 '*=': lambda attr, value: value in attr,
1813 '~=': lambda attr, value: value.search(attr) is not None
1814 }
1815 str_operator_rex = re.compile(r'''(?x)\s*
1816 (?P<key>[a-zA-Z0-9._-]+)\s*
1817 (?P<negation>!\s*)?(?P<op>%s)\s*(?P<none_inclusive>\?\s*)?
1818 (?P<quote>["'])?
1819 (?P<value>(?(quote)(?:(?!(?P=quote))[^\\]|\\.)+|[\w.-]+))
1820 (?(quote)(?P=quote))\s*
1821 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1822 m = str_operator_rex.fullmatch(filter_spec)
1823 if m:
1824 if m.group('op') == '~=':
1825 comparison_value = re.compile(m.group('value'))
1826 else:
1827 comparison_value = re.sub(r'''\\([\\"'])''', r'\1', m.group('value'))
1828 str_op = STR_OPERATORS[m.group('op')]
1829 if m.group('negation'):
1830 op = lambda attr, value: not str_op(attr, value)
1831 else:
1832 op = str_op
1833
1834 if not m:
1835 raise SyntaxError('Invalid filter specification %r' % filter_spec)
1836
1837 def _filter(f):
1838 actual_value = f.get(m.group('key'))
1839 if actual_value is None:
1840 return m.group('none_inclusive')
1841 return op(actual_value, comparison_value)
1842 return _filter
1843
1844 def _check_formats(self, formats):
1845 for f in formats:
1846 self.to_screen('[info] Testing format %s' % f['format_id'])
1847 path = self.get_output_path('temp')
1848 if not self._ensure_dir_exists(f'{path}/'):
1849 continue
1850 temp_file = tempfile.NamedTemporaryFile(suffix='.tmp', delete=False, dir=path or None)
1851 temp_file.close()
1852 try:
1853 success, _ = self.dl(temp_file.name, f, test=True)
1854 except (DownloadError, OSError, ValueError) + network_exceptions:
1855 success = False
1856 finally:
1857 if os.path.exists(temp_file.name):
1858 try:
1859 os.remove(temp_file.name)
1860 except OSError:
1861 self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
1862 if success:
1863 yield f
1864 else:
1865 self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])
1866
1867 def _default_format_spec(self, info_dict, download=True):
1868
1869 def can_merge():
1870 merger = FFmpegMergerPP(self)
1871 return merger.available and merger.can_merge()
1872
1873 prefer_best = (
1874 not self.params.get('simulate')
1875 and download
1876 and (
1877 not can_merge()
1878 or info_dict.get('is_live') and not self.params.get('live_from_start')
1879 or self.params['outtmpl']['default'] == '-'))
1880 compat = (
1881 prefer_best
1882 or self.params.get('allow_multiple_audio_streams', False)
1883 or 'format-spec' in self.params['compat_opts'])
1884
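# e.g. with a working ffmpeg and output not going to stdout, this returns
# 'bestvideo*+bestaudio/best'; when merging is not possible, it falls
# back to 'best/bestvideo+bestaudio'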
1885 return (
1886 'best/bestvideo+bestaudio' if prefer_best
1887 else 'bestvideo*+bestaudio/best' if not compat
1888 else 'bestvideo+bestaudio/best')
1889
1890 def build_format_selector(self, format_spec):
1891 def syntax_error(note, start):
1892 message = (
1893 'Invalid format specification: '
1894 '{}\n\t{}\n\t{}^'.format(note, format_spec, ' ' * start[1]))
1895 return SyntaxError(message)
1896
1897 PICKFIRST = 'PICKFIRST'
1898 MERGE = 'MERGE'
1899 SINGLE = 'SINGLE'
1900 GROUP = 'GROUP'
1901 FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
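# How a spec decomposes (illustrative): 'bv*[height<=720]+ba/b' parses to
# PICKFIRST(MERGE(SINGLE 'bv*' with filter 'height<=720', SINGLE 'ba'), SINGLE 'b')
# - '/' picks the first alternative that yields formats, '+' merges
# video+audio, '[...]' attaches filters and '(...)' groups selectors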
1902
1903 allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False),
1904 'video': self.params.get('allow_multiple_video_streams', False)}
1905
1906 check_formats = self.params.get('check_formats') == 'selected'
1907
1908 def _parse_filter(tokens):
1909 filter_parts = []
1910 for type, string, start, _, _ in tokens:
1911 if type == tokenize.OP and string == ']':
1912 return ''.join(filter_parts)
1913 else:
1914 filter_parts.append(string)
1915
1916 def _remove_unused_ops(tokens):
1917 # Remove operators that we don't use and join them with the surrounding strings
1918 # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1919 ALLOWED_OPS = ('/', '+', ',', '(', ')')
1920 last_string, last_start, last_end, last_line = None, None, None, None
1921 for type, string, start, end, line in tokens:
1922 if type == tokenize.OP and string == '[':
1923 if last_string:
1924 yield tokenize.NAME, last_string, last_start, last_end, last_line
1925 last_string = None
1926 yield type, string, start, end, line
1927 # everything inside brackets will be handled by _parse_filter
1928 for type, string, start, end, line in tokens:
1929 yield type, string, start, end, line
1930 if type == tokenize.OP and string == ']':
1931 break
1932 elif type == tokenize.OP and string in ALLOWED_OPS:
1933 if last_string:
1934 yield tokenize.NAME, last_string, last_start, last_end, last_line
1935 last_string = None
1936 yield type, string, start, end, line
1937 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1938 if not last_string:
1939 last_string = string
1940 last_start = start
1941 last_end = end
1942 else:
1943 last_string += string
1944 if last_string:
1945 yield tokenize.NAME, last_string, last_start, last_end, last_line
1946
1947 def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1948 selectors = []
1949 current_selector = None
1950 for type, string, start, _, _ in tokens:
1951 # ENCODING is only defined in python 3.x
1952 if type == getattr(tokenize, 'ENCODING', None):
1953 continue
1954 elif type in [tokenize.NAME, tokenize.NUMBER]:
1955 current_selector = FormatSelector(SINGLE, string, [])
1956 elif type == tokenize.OP:
1957 if string == ')':
1958 if not inside_group:
1959 # ')' will be handled by the parentheses group
1960 tokens.restore_last_token()
1961 break
1962 elif inside_merge and string in ['/', ',']:
1963 tokens.restore_last_token()
1964 break
1965 elif inside_choice and string == ',':
1966 tokens.restore_last_token()
1967 break
1968 elif string == ',':
1969 if not current_selector:
1970 raise syntax_error('"," must follow a format selector', start)
1971 selectors.append(current_selector)
1972 current_selector = None
1973 elif string == '/':
1974 if not current_selector:
1975 raise syntax_error('"/" must follow a format selector', start)
1976 first_choice = current_selector
1977 second_choice = _parse_format_selection(tokens, inside_choice=True)
1978 current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1979 elif string == '[':
1980 if not current_selector:
1981 current_selector = FormatSelector(SINGLE, 'best', [])
1982 format_filter = _parse_filter(tokens)
1983 current_selector.filters.append(format_filter)
1984 elif string == '(':
1985 if current_selector:
1986 raise syntax_error('Unexpected "("', start)
1987 group = _parse_format_selection(tokens, inside_group=True)
1988 current_selector = FormatSelector(GROUP, group, [])
1989 elif string == '+':
1990 if not current_selector:
1991 raise syntax_error('Unexpected "+"', start)
1992 selector_1 = current_selector
1993 selector_2 = _parse_format_selection(tokens, inside_merge=True)
1994 if not selector_2:
1995 raise syntax_error('Expected a selector', start)
1996 current_selector = FormatSelector(MERGE, (selector_1, selector_2), [])
1997 else:
1998 raise syntax_error(f'Operator not recognized: "{string}"', start)
1999 elif type == tokenize.ENDMARKER:
2000 break
2001 if current_selector:
2002 selectors.append(current_selector)
2003 return selectors
2004
2005 def _merge(formats_pair):
2006 format_1, format_2 = formats_pair
2007
2008 formats_info = []
2009 formats_info.extend(format_1.get('requested_formats', (format_1,)))
2010 formats_info.extend(format_2.get('requested_formats', (format_2,)))
2011
2012 if not allow_multiple_streams['video'] or not allow_multiple_streams['audio']:
2013 get_no_more = {'video': False, 'audio': False}
2014 for (i, fmt_info) in enumerate(formats_info):
2015 if fmt_info.get('acodec') == fmt_info.get('vcodec') == 'none':
2016 formats_info.pop(i)
2017 continue
2018 for aud_vid in ['audio', 'video']:
2019 if not allow_multiple_streams[aud_vid] and fmt_info.get(aud_vid[0] + 'codec') != 'none':
2020 if get_no_more[aud_vid]:
2021 formats_info.pop(i)
2022 break
2023 get_no_more[aud_vid] = True
2024
2025 if len(formats_info) == 1:
2026 return formats_info[0]
2027
2028 video_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('vcodec') != 'none']
2029 audio_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('acodec') != 'none']
2030
2031 the_only_video = video_fmts[0] if len(video_fmts) == 1 else None
2032 the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None
2033
2034 output_ext = self.params.get('merge_output_format')
2035 if not output_ext:
2036 if the_only_video:
2037 output_ext = the_only_video['ext']
2038 elif the_only_audio and not video_fmts:
2039 output_ext = the_only_audio['ext']
2040 else:
2041 output_ext = 'mkv'
2042
2043 filtered = lambda *keys: filter(None, (traverse_obj(fmt, *keys) for fmt in formats_info))
2044
2045 new_dict = {
2046 'requested_formats': formats_info,
2047 'format': '+'.join(filtered('format')),
2048 'format_id': '+'.join(filtered('format_id')),
2049 'ext': output_ext,
2050 'protocol': '+'.join(map(determine_protocol, formats_info)),
2051 'language': '+'.join(orderedSet(filtered('language'))) or None,
2052 'format_note': '+'.join(orderedSet(filtered('format_note'))) or None,
2053 'filesize_approx': sum(filtered('filesize', 'filesize_approx')) or None,
2054 'tbr': sum(filtered('tbr', 'vbr', 'abr')),
2055 }
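# e.g. merging video format_id '137' (mp4) with audio '140' (m4a) - ids
# illustrative - yields format_id '137+140', protocol 'https+https' and,
# with no merge_output_format set, the video's ext 'mp4'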
2056
2057 if the_only_video:
2058 new_dict.update({
2059 'width': the_only_video.get('width'),
2060 'height': the_only_video.get('height'),
2061 'resolution': the_only_video.get('resolution') or self.format_resolution(the_only_video),
2062 'fps': the_only_video.get('fps'),
2063 'dynamic_range': the_only_video.get('dynamic_range'),
2064 'vcodec': the_only_video.get('vcodec'),
2065 'vbr': the_only_video.get('vbr'),
2066 'stretched_ratio': the_only_video.get('stretched_ratio'),
2067 })
2068
2069 if the_only_audio:
2070 new_dict.update({
2071 'acodec': the_only_audio.get('acodec'),
2072 'abr': the_only_audio.get('abr'),
2073 'asr': the_only_audio.get('asr'),
2074 })
2075
2076 return new_dict
2077
2078 def _check_formats(formats):
2079 if not check_formats:
2080 yield from formats
2081 return
2082 yield from self._check_formats(formats)
2083
2084 def _build_selector_function(selector):
2085 if isinstance(selector, list): # ,
2086 fs = [_build_selector_function(s) for s in selector]
2087
2088 def selector_function(ctx):
2089 for f in fs:
2090 yield from f(ctx)
2091 return selector_function
2092
2093 elif selector.type == GROUP: # ()
2094 selector_function = _build_selector_function(selector.selector)
2095
2096 elif selector.type == PICKFIRST: # /
2097 fs = [_build_selector_function(s) for s in selector.selector]
2098
2099 def selector_function(ctx):
2100 for f in fs:
2101 picked_formats = list(f(ctx))
2102 if picked_formats:
2103 return picked_formats
2104 return []
2105
2106 elif selector.type == MERGE: # +
2107 selector_1, selector_2 = map(_build_selector_function, selector.selector)
2108
2109 def selector_function(ctx):
2110 for pair in itertools.product(selector_1(ctx), selector_2(ctx)):
2111 yield _merge(pair)
2112
2113 elif selector.type == SINGLE: # atom
2114 format_spec = selector.selector or 'best'
2115
2116 # TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector
2117 if format_spec == 'all':
2118 def selector_function(ctx):
2119 yield from _check_formats(ctx['formats'][::-1])
2120 elif format_spec == 'mergeall':
2121 def selector_function(ctx):
2122 formats = list(_check_formats(
2123 f for f in ctx['formats'] if f.get('vcodec') != 'none' or f.get('acodec') != 'none'))
2124 if not formats:
2125 return
2126 merged_format = formats[-1]
2127 for f in formats[-2::-1]:
2128 merged_format = _merge((merged_format, f))
2129 yield merged_format
2130
2131 else:
2132 format_fallback, separate_fallback, format_reverse, format_idx = False, None, True, 1
2133 mobj = re.match(
2134 r'(?P<bw>best|worst|b|w)(?P<type>video|audio|v|a)?(?P<mod>\*)?(?:\.(?P<n>[1-9]\d*))?$',
2135 format_spec)
2136 if mobj is not None:
2137 format_idx = int_or_none(mobj.group('n'), default=1)
2138 format_reverse = mobj.group('bw')[0] == 'b'
2139 format_type = (mobj.group('type') or [None])[0]
2140 not_format_type = {'v': 'a', 'a': 'v'}.get(format_type)
2141 format_modified = mobj.group('mod') is not None
2142
2143 format_fallback = not format_type and not format_modified # for b, w
2144 _filter_f = (
2145 (lambda f: f.get('%scodec' % format_type) != 'none')
2146 if format_type and format_modified # bv*, ba*, wv*, wa*
2147 else (lambda f: f.get('%scodec' % not_format_type) == 'none')
2148 if format_type # bv, ba, wv, wa
2149 else (lambda f: f.get('vcodec') != 'none' and f.get('acodec') != 'none')
2150 if not format_modified # b, w
2151 else lambda f: True) # b*, w*
2152 filter_f = lambda f: _filter_f(f) and (
2153 f.get('vcodec') != 'none' or f.get('acodec') != 'none')
2154 else:
2155 if format_spec in self._format_selection_exts['audio']:
2156 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none'
2157 elif format_spec in self._format_selection_exts['video']:
2158 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none' and f.get('vcodec') != 'none'
2159 separate_fallback = lambda f: f.get('ext') == format_spec and f.get('vcodec') != 'none'
2160 elif format_spec in self._format_selection_exts['storyboards']:
2161 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') == 'none' and f.get('vcodec') == 'none'
2162 else:
2163 filter_f = lambda f: f.get('format_id') == format_spec # id
2164
2165 def selector_function(ctx):
2166 formats = list(ctx['formats'])
2167 matches = list(filter(filter_f, formats)) if filter_f is not None else formats
2168 if not matches:
2169 if format_fallback and ctx['incomplete_formats']:
2170 # for extractors with incomplete formats (audio only (soundcloud)
2171 # or video only (imgur)) best/worst will fallback to
2172 # best/worst {video,audio}-only format
2173 matches = formats
2174 elif separate_fallback and not ctx['has_merged_format']:
2175 # for compatibility with youtube-dl when there is no pre-merged format
2176 matches = list(filter(separate_fallback, formats))
2177 matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1]))
2178 try:
2179 yield matches[format_idx - 1]
2180 except LazyList.IndexError:
2181 return
2182
2183 filters = [self._build_format_filter(f) for f in selector.filters]
2184
2185 def final_selector(ctx):
2186 ctx_copy = dict(ctx)
2187 for _filter in filters:
2188 ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
2189 return selector_function(ctx_copy)
2190 return final_selector
2191
2192 stream = io.BytesIO(format_spec.encode())
2193 try:
2194 tokens = list(_remove_unused_ops(tokenize.tokenize(stream.readline)))
2195 except tokenize.TokenError:
2196 raise syntax_error('Missing closing/opening brackets or parentheses', (0, len(format_spec)))
2197
2198 class TokenIterator:
2199 def __init__(self, tokens):
2200 self.tokens = tokens
2201 self.counter = 0
2202
2203 def __iter__(self):
2204 return self
2205
2206 def __next__(self):
2207 if self.counter >= len(self.tokens):
2208 raise StopIteration()
2209 value = self.tokens[self.counter]
2210 self.counter += 1
2211 return value
2212
2213 next = __next__
2214
2215 def restore_last_token(self):
2216 self.counter -= 1
2217
2218 parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
2219 return _build_selector_function(parsed_selector)
2220
2221 def _calc_headers(self, info_dict):
2222 res = merge_headers(self.params['http_headers'], info_dict.get('http_headers') or {})
2223
2224 cookies = self._calc_cookies(info_dict['url'])
2225 if cookies:
2226 res['Cookie'] = cookies
2227
2228 if 'X-Forwarded-For' not in res:
2229 x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
2230 if x_forwarded_for_ip:
2231 res['X-Forwarded-For'] = x_forwarded_for_ip
2232
2233 return res
2234
2235 def _calc_cookies(self, url):
2236 pr = sanitized_Request(url)
2237 self.cookiejar.add_cookie_header(pr)
2238 return pr.get_header('Cookie')
2239
2240 def _sort_thumbnails(self, thumbnails):
2241 thumbnails.sort(key=lambda t: (
2242 t.get('preference') if t.get('preference') is not None else -1,
2243 t.get('width') if t.get('width') is not None else -1,
2244 t.get('height') if t.get('height') is not None else -1,
2245 t.get('id') if t.get('id') is not None else '',
2246 t.get('url')))
2247
2248 def _sanitize_thumbnails(self, info_dict):
2249 thumbnails = info_dict.get('thumbnails')
2250 if thumbnails is None:
2251 thumbnail = info_dict.get('thumbnail')
2252 if thumbnail:
2253 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
2254 if not thumbnails:
2255 return
2256
2257 def check_thumbnails(thumbnails):
2258 for t in thumbnails:
2259 self.to_screen(f'[info] Testing thumbnail {t["id"]}')
2260 try:
2261 self.urlopen(HEADRequest(t['url']))
2262 except network_exceptions as err:
2263 self.to_screen(f'[info] Unable to connect to thumbnail {t["id"]} URL {t["url"]!r} - {err}. Skipping...')
2264 continue
2265 yield t
2266
2267 self._sort_thumbnails(thumbnails)
2268 for i, t in enumerate(thumbnails):
2269 if t.get('id') is None:
2270 t['id'] = '%d' % i
2271 if t.get('width') and t.get('height'):
2272 t['resolution'] = '%dx%d' % (t['width'], t['height'])
2273 t['url'] = sanitize_url(t['url'])
2274
2275 if self.params.get('check_formats') is True:
2276 info_dict['thumbnails'] = LazyList(check_thumbnails(thumbnails[::-1]), reverse=True)
2277 else:
2278 info_dict['thumbnails'] = thumbnails
2279
2280 def _fill_common_fields(self, info_dict, is_video=True):
2281 # TODO: move sanitization here
2282 if is_video:
2283 # playlists are allowed to lack "title"
2284 title = info_dict.get('title', NO_DEFAULT)
2285 if title is NO_DEFAULT:
2286 raise ExtractorError('Missing "title" field in extractor result',
2287 video_id=info_dict['id'], ie=info_dict['extractor'])
2288 info_dict['fulltitle'] = title
2289 if not title:
2290 if title == '':
2291 self.write_debug('Extractor gave empty title. Creating a generic title')
2292 else:
2293 self.report_warning('Extractor failed to obtain "title". Creating a generic title instead')
2294 info_dict['title'] = f'{info_dict["extractor"].replace(":", "-")} video #{info_dict["id"]}'
2295
2296 if info_dict.get('duration') is not None:
2297 info_dict['duration_string'] = formatSeconds(info_dict['duration'])
2298
2299 for ts_key, date_key in (
2300 ('timestamp', 'upload_date'),
2301 ('release_timestamp', 'release_date'),
2302 ('modified_timestamp', 'modified_date'),
2303 ):
2304 if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
2305 # Working around out-of-range timestamp values (e.g. negative ones on Windows,
2306 # see http://bugs.python.org/issue1646728)
2307 with contextlib.suppress(ValueError, OverflowError, OSError):
2308 upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
2309 info_dict[date_key] = upload_date.strftime('%Y%m%d')
2310
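# e.g. a 'timestamp' of 1656633600 (2022-07-01 00:00:00 UTC) fills in a
# missing 'upload_date' as '20220701'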
2311 live_keys = ('is_live', 'was_live')
2312 live_status = info_dict.get('live_status')
2313 if live_status is None:
2314 for key in live_keys:
2315 if info_dict.get(key) is False:
2316 continue
2317 if info_dict.get(key):
2318 live_status = key
2319 break
2320 if all(info_dict.get(key) is False for key in live_keys):
2321 live_status = 'not_live'
2322 if live_status:
2323 info_dict['live_status'] = live_status
2324 for key in live_keys:
2325 if info_dict.get(key) is None:
2326 info_dict[key] = (live_status == key)
2327
2328 # Auto-generate title fields corresponding to the *_number fields when missing
2329 # in order to always have clean titles. This is very common for TV series.
2330 for field in ('chapter', 'season', 'episode'):
2331 if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
2332 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
2333
2334 def _raise_pending_errors(self, info):
2335 err = info.pop('__pending_error', None)
2336 if err:
2337 self.report_error(err, tb=False)
2338
2339 def process_video_result(self, info_dict, download=True):
2340 assert info_dict.get('_type', 'video') == 'video'
2341 self._num_videos += 1
2342
2343 if 'id' not in info_dict:
2344 raise ExtractorError('Missing "id" field in extractor result', ie=info_dict['extractor'])
2345 elif not info_dict.get('id'):
2346 raise ExtractorError('Extractor failed to obtain "id"', ie=info_dict['extractor'])
2347
2348 def report_force_conversion(field, field_not, conversion):
2349 self.report_warning(
2350 '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
2351 % (field, field_not, conversion))
2352
2353 def sanitize_string_field(info, string_field):
2354 field = info.get(string_field)
2355 if field is None or isinstance(field, compat_str):
2356 return
2357 report_force_conversion(string_field, 'a string', 'string')
2358 info[string_field] = compat_str(field)
2359
2360 def sanitize_numeric_fields(info):
2361 for numeric_field in self._NUMERIC_FIELDS:
2362 field = info.get(numeric_field)
2363 if field is None or isinstance(field, (int, float)):
2364 continue
2365 report_force_conversion(numeric_field, 'numeric', 'int')
2366 info[numeric_field] = int_or_none(field)
2367
2368 sanitize_string_field(info_dict, 'id')
2369 sanitize_numeric_fields(info_dict)
2370 if (info_dict.get('duration') or 0) <= 0 and info_dict.pop('duration', None):
2371 self.report_warning('"duration" field is negative, there is an error in extractor')
2372
2373 if 'playlist' not in info_dict:
2374 # It isn't part of a playlist
2375 info_dict['playlist'] = None
2376 info_dict['playlist_index'] = None
2377
2378 self._sanitize_thumbnails(info_dict)
2379
2380 thumbnail = info_dict.get('thumbnail')
2381 thumbnails = info_dict.get('thumbnails')
2382 if thumbnail:
2383 info_dict['thumbnail'] = sanitize_url(thumbnail)
2384 elif thumbnails:
2385 info_dict['thumbnail'] = thumbnails[-1]['url']
2386
2387 if info_dict.get('display_id') is None and 'id' in info_dict:
2388 info_dict['display_id'] = info_dict['id']
2389
2390 self._fill_common_fields(info_dict)
2391
2392 for cc_kind in ('subtitles', 'automatic_captions'):
2393 cc = info_dict.get(cc_kind)
2394 if cc:
2395 for _, subtitle in cc.items():
2396 for subtitle_format in subtitle:
2397 if subtitle_format.get('url'):
2398 subtitle_format['url'] = sanitize_url(subtitle_format['url'])
2399 if subtitle_format.get('ext') is None:
2400 subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
2401
2402 automatic_captions = info_dict.get('automatic_captions')
2403 subtitles = info_dict.get('subtitles')
2404
2405 info_dict['requested_subtitles'] = self.process_subtitles(
2406 info_dict['id'], subtitles, automatic_captions)
2407
2408 if info_dict.get('formats') is None:
2409 # There's only one format available
2410 formats = [info_dict]
2411 else:
2412 formats = info_dict['formats']
2413
2414 # or None ensures --clean-infojson removes it
2415 info_dict['_has_drm'] = any(f.get('has_drm') for f in formats) or None
2416 if not self.params.get('allow_unplayable_formats'):
2417 formats = [f for f in formats if not f.get('has_drm')]
2418 if info_dict['_has_drm'] and all(
2419 f.get('acodec') == f.get('vcodec') == 'none' for f in formats):
2420 self.report_warning(
2421 'This video is DRM protected and only images are available for download. '
2422 'Use --list-formats to see them')
2423
2424 get_from_start = not info_dict.get('is_live') or bool(self.params.get('live_from_start'))
2425 if not get_from_start:
2426 info_dict['title'] += ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
2427 if info_dict.get('is_live') and formats:
2428 formats = [f for f in formats if bool(f.get('is_from_start')) == get_from_start]
2429 if get_from_start and not formats:
2430 self.raise_no_formats(info_dict, msg=(
2431 '--live-from-start is passed, but there are no formats that can be downloaded from the start. '
2432 'If you want to download from the current time, use --no-live-from-start'))
2433
2434 if not formats:
2435 self.raise_no_formats(info_dict)
2436
2437 def is_wellformed(f):
2438 url = f.get('url')
2439 if not url:
2440 self.report_warning(
2441 '"url" field is missing or empty - skipping format, '
2442 'there is an error in extractor')
2443 return False
2444 if isinstance(url, bytes):
2445 sanitize_string_field(f, 'url')
2446 return True
2447
2448 # Filter out malformed formats for better extraction robustness
2449 formats = list(filter(is_wellformed, formats))
2450
2451 formats_dict = {}
2452
2453 # We check that all the formats have the format and format_id fields
2454 for i, format in enumerate(formats):
2455 sanitize_string_field(format, 'format_id')
2456 sanitize_numeric_fields(format)
2457 format['url'] = sanitize_url(format['url'])
2458 if not format.get('format_id'):
2459 format['format_id'] = compat_str(i)
2460 else:
2461 # Sanitize format_id from characters used in format selector expression
2462 format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
2463 format_id = format['format_id']
2464 if format_id not in formats_dict:
2465 formats_dict[format_id] = []
2466 formats_dict[format_id].append(format)
2467
2468 # Make sure all formats have unique format_id
2469 common_exts = set(itertools.chain(*self._format_selection_exts.values()))
2470 for format_id, ambiguous_formats in formats_dict.items():
2471 ambiguous_id = len(ambiguous_formats) > 1
2472 for i, format in enumerate(ambiguous_formats):
2473 if ambiguous_id:
2474 format['format_id'] = '%s-%d' % (format_id, i)
2475 if format.get('ext') is None:
2476 format['ext'] = determine_ext(format['url']).lower()
2477 # Ensure there is no conflict between id and ext in format selection
2478 # See https://github.com/yt-dlp/yt-dlp/issues/1282
2479 if format['format_id'] != format['ext'] and format['format_id'] in common_exts:
2480 format['format_id'] = 'f%s' % format['format_id']
2481
2482 for i, format in enumerate(formats):
2483 if format.get('format') is None:
2484 format['format'] = '{id} - {res}{note}'.format(
2485 id=format['format_id'],
2486 res=self.format_resolution(format),
2487 note=format_field(format, 'format_note', ' (%s)'),
2488 )
2489 if format.get('protocol') is None:
2490 format['protocol'] = determine_protocol(format)
2491 if format.get('resolution') is None:
2492 format['resolution'] = self.format_resolution(format, default=None)
2493 if format.get('dynamic_range') is None and format.get('vcodec') != 'none':
2494 format['dynamic_range'] = 'SDR'
2495 if (info_dict.get('duration') and format.get('tbr')
2496 and not format.get('filesize') and not format.get('filesize_approx')):
2497 format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8))
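# rough estimate: duration (s) x tbr (KBit/s) x 128 bytes per KBit-second;
# only used when no real filesize is available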
2498
2499 # Add HTTP headers, so that external programs can use them from the
2500 # json output
2501 full_format_info = info_dict.copy()
2502 full_format_info.update(format)
2503 format['http_headers'] = self._calc_headers(full_format_info)
2504 # Remove private housekeeping stuff
2505 if '__x_forwarded_for_ip' in info_dict:
2506 del info_dict['__x_forwarded_for_ip']
2507
2508 if self.params.get('check_formats') is True:
2509 formats = LazyList(self._check_formats(formats[::-1]), reverse=True)
2510
2511 if not formats or formats[0] is not info_dict:
2512 # only set the 'formats' field if the original info_dict lists them;
2513 # otherwise we would end up with a circular reference: the first (and only)
2514 # element of the 'formats' field in info_dict would be info_dict itself,
2515 # which can't be exported to json
2516 info_dict['formats'] = formats
2517
2518 info_dict, _ = self.pre_process(info_dict)
2519
2520 if self._match_entry(info_dict, incomplete=self._format_fields) is not None:
2521 return info_dict
2522
2523 self.post_extract(info_dict)
2524 info_dict, _ = self.pre_process(info_dict, 'after_filter')
2525
2526 # The pre-processors may have modified the formats
2527 formats = info_dict.get('formats', [info_dict])
2528
2529 list_only = self.params.get('simulate') is None and (
2530 self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles'))
2531 interactive_format_selection = not list_only and self.format_selector == '-'
2532 if self.params.get('list_thumbnails'):
2533 self.list_thumbnails(info_dict)
2534 if self.params.get('listsubtitles'):
2535 if 'automatic_captions' in info_dict:
2536 self.list_subtitles(
2537 info_dict['id'], automatic_captions, 'automatic captions')
2538 self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
2539 if self.params.get('listformats') or interactive_format_selection:
2540 self.list_formats(info_dict)
2541 if list_only:
2542 # Without this printing, -F --print-json will not work
2543 self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True)
2544 return info_dict
2545
2546 format_selector = self.format_selector
2547 if format_selector is None:
2548 req_format = self._default_format_spec(info_dict, download=download)
2549 self.write_debug('Default format spec: %s' % req_format)
2550 format_selector = self.build_format_selector(req_format)
2551
2552 while True:
2553 if interactive_format_selection:
2554 req_format = input(
2555 self._format_screen('\nEnter format selector: ', self.Styles.EMPHASIS))
2556 try:
2557 format_selector = self.build_format_selector(req_format)
2558 except SyntaxError as err:
2559 self.report_error(err, tb=False, is_error=False)
2560 continue
2561
2562 formats_to_download = list(format_selector({
2563 'formats': formats,
2564 'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
2565 'incomplete_formats': (
2566 # All formats are video-only or
2567 all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
2568 # all formats are audio-only
2569 or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)),
2570 }))
2571 if interactive_format_selection and not formats_to_download:
2572 self.report_error('Requested format is not available', tb=False, is_error=False)
2573 continue
2574 break
2575
2576 if not formats_to_download:
2577 if not self.params.get('ignore_no_formats_error'):
2578 raise ExtractorError(
2579 'Requested format is not available. Use --list-formats for a list of available formats',
2580 expected=True, video_id=info_dict['id'], ie=info_dict['extractor'])
2581 self.report_warning('Requested format is not available')
2582 # Process what we can, even without any available formats.
2583 formats_to_download = [{}]
2584
2585 requested_ranges = self.params.get('download_ranges')
2586 if requested_ranges:
2587 requested_ranges = tuple(requested_ranges(info_dict, self))
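# 'download_ranges' is a callable (info_dict, ydl) -> iterable of
# chapter-like dicts; a minimal sketch (hypothetical values):
#   lambda info, ydl: [{'start_time': 0, 'end_time': 30, 'title': 'intro'}]
# Each range becomes the section_start/section_end/... fields below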
2588
2589 best_format, downloaded_formats = formats_to_download[-1], []
2590 if download:
2591 if best_format:
2592 def to_screen(*msg):
2593 self.to_screen(f'[info] {info_dict["id"]}: {" ".join(", ".join(variadic(m)) for m in msg)}')
2594
2595 to_screen(f'Downloading {len(formats_to_download)} format(s):',
2596 (f['format_id'] for f in formats_to_download))
2597 if requested_ranges:
2598 to_screen(f'Downloading {len(requested_ranges)} time ranges:',
2599 (f'{int(c["start_time"])}-{int(c["end_time"])}' for c in requested_ranges))
2600 max_downloads_reached = False
2601
2602 for fmt, chapter in itertools.product(formats_to_download, requested_ranges or [{}]):
2603 new_info = self._copy_infodict(info_dict)
2604 new_info.update(fmt)
2605 if chapter:
2606 new_info.update({
2607 'section_start': chapter.get('start_time'),
2608 'section_end': chapter.get('end_time', 0),
2609 'section_title': chapter.get('title'),
2610 'section_number': chapter.get('index'),
2611 })
2612 downloaded_formats.append(new_info)
2613 try:
2614 self.process_info(new_info)
2615 except MaxDownloadsReached:
2616 max_downloads_reached = True
2617 self._raise_pending_errors(new_info)
2618 # Remove copied info
2619 for key, val in tuple(new_info.items()):
2620 if info_dict.get(key) == val:
2621 new_info.pop(key)
2622 if max_downloads_reached:
2623 break
2624
2625 write_archive = {f.get('__write_download_archive', False) for f in downloaded_formats}
2626 assert write_archive.issubset({True, False, 'ignore'})
2627 if True in write_archive and False not in write_archive:
2628 self.record_download_archive(info_dict)
2629
2630 info_dict['requested_downloads'] = downloaded_formats
2631 info_dict = self.run_all_pps('after_video', info_dict)
2632 if max_downloads_reached:
2633 raise MaxDownloadsReached()
2634
2635 # We update the info dict with the selected best quality format (backwards compatibility)
2636 info_dict.update(best_format)
2637 return info_dict
2638
2639 def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
2640 """Select the requested subtitles and their format"""
2641 available_subs, normal_sub_langs = {}, []
2642 if normal_subtitles and self.params.get('writesubtitles'):
2643 available_subs.update(normal_subtitles)
2644 normal_sub_langs = tuple(normal_subtitles.keys())
2645 if automatic_captions and self.params.get('writeautomaticsub'):
2646 for lang, cap_info in automatic_captions.items():
2647 if lang not in available_subs:
2648 available_subs[lang] = cap_info
2649
2650 if (not self.params.get('writesubtitles')
2651 and not self.params.get('writeautomaticsub')
2652 or not available_subs):
2653 return None
2654
2655 all_sub_langs = tuple(available_subs.keys())
2656 if self.params.get('allsubtitles', False):
2657 requested_langs = all_sub_langs
2658 elif self.params.get('subtitleslangs', False):
2659 # A list is used so that the order of languages will be the same as
2660 # given in subtitleslangs. See https://github.com/yt-dlp/yt-dlp/issues/1041
2661 requested_langs = []
2662 for lang_re in self.params.get('subtitleslangs'):
2663 discard = lang_re[0] == '-'
2664 if discard:
2665 lang_re = lang_re[1:]
2666 if lang_re == 'all':
2667 if discard:
2668 requested_langs = []
2669 else:
2670 requested_langs.extend(all_sub_langs)
2671 continue
2672 current_langs = filter(re.compile(lang_re + '$').match, all_sub_langs)
2673 if discard:
2674 for lang in current_langs:
2675 while lang in requested_langs:
2676 requested_langs.remove(lang)
2677 else:
2678 requested_langs.extend(current_langs)
2679 requested_langs = orderedSet(requested_langs)
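# e.g. subtitleslangs ['all', '-live_chat'] requests every language and
# then discards 'live_chat'; each entry is a regex matched against the
# full language code, so 'en.*' covers 'en', 'en-US', ...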
2680 elif normal_sub_langs:
2681 requested_langs = ['en'] if 'en' in normal_sub_langs else normal_sub_langs[:1]
2682 else:
2683 requested_langs = ['en'] if 'en' in all_sub_langs else all_sub_langs[:1]
2684 if requested_langs:
2685 self.write_debug('Downloading subtitles: %s' % ', '.join(requested_langs))
2686
2687 formats_query = self.params.get('subtitlesformat', 'best')
2688 formats_preference = formats_query.split('/') if formats_query else []
2689 subs = {}
2690 for lang in requested_langs:
2691 formats = available_subs.get(lang)
2692 if formats is None:
2693 self.report_warning(f'{lang} subtitles not available for {video_id}')
2694 continue
2695 for ext in formats_preference:
2696 if ext == 'best':
2697 f = formats[-1]
2698 break
2699 matches = list(filter(lambda f: f['ext'] == ext, formats))
2700 if matches:
2701 f = matches[-1]
2702 break
2703 else:
2704 f = formats[-1]
2705 self.report_warning(
2706 'No subtitle format found matching "%s" for language %s, '
2707 'using %s' % (formats_query, lang, f['ext']))
2708 subs[lang] = f
2709 return subs
2710
2711 def _forceprint(self, key, info_dict):
2712 if info_dict is None:
2713 return
2714 info_copy = info_dict.copy()
2715 info_copy['formats_table'] = self.render_formats_table(info_dict)
2716 info_copy['thumbnails_table'] = self.render_thumbnails_table(info_dict)
2717 info_copy['subtitles_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('subtitles'))
2718 info_copy['automatic_captions_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('automatic_captions'))
2719
2720 def format_tmpl(tmpl):
2721 mobj = re.match(r'\w+(=?)$', tmpl)
2722 if mobj and mobj.group(1):
2723 return f'{tmpl[:-1]} = %({tmpl[:-1]})r'
2724 elif mobj:
2725 return f'%({tmpl})s'
2726 return tmpl
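# e.g. format_tmpl('title') -> '%(title)s' and format_tmpl('title=') ->
# 'title = %(title)r'; anything else is passed through unchanged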
2727
2728 for tmpl in self.params['forceprint'].get(key, []):
2729 self.to_stdout(self.evaluate_outtmpl(format_tmpl(tmpl), info_copy))
2730
2731 for tmpl, file_tmpl in self.params['print_to_file'].get(key, []):
2732 filename = self.prepare_filename(info_dict, outtmpl=file_tmpl)
2733 tmpl = format_tmpl(tmpl)
2734 self.to_screen(f'[info] Writing {tmpl!r} to: {filename}')
2735 if self._ensure_dir_exists(filename):
2736 with open(filename, 'a', encoding='utf-8') as f:
2737 f.write(self.evaluate_outtmpl(tmpl, info_copy) + '\n')
2738
2739 def __forced_printings(self, info_dict, filename, incomplete):
2740 def print_mandatory(field, actual_field=None):
2741 if actual_field is None:
2742 actual_field = field
2743 if (self.params.get('force%s' % field, False)
2744 and (not incomplete or info_dict.get(actual_field) is not None)):
2745 self.to_stdout(info_dict[actual_field])
2746
2747 def print_optional(field):
2748 if (self.params.get('force%s' % field, False)
2749 and info_dict.get(field) is not None):
2750 self.to_stdout(info_dict[field])
2751
2752 info_dict = info_dict.copy()
2753 if filename is not None:
2754 info_dict['filename'] = filename
2755 if info_dict.get('requested_formats') is not None:
2756 # For RTMP URLs, also include the playpath
2757 info_dict['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats'])
2758 elif info_dict.get('url'):
2759 info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '')
2760
2761 if (self.params.get('forcejson')
2762 or self.params['forceprint'].get('video')
2763 or self.params['print_to_file'].get('video')):
2764 self.post_extract(info_dict)
2765 self._forceprint('video', info_dict)
2766
2767 print_mandatory('title')
2768 print_mandatory('id')
2769 print_mandatory('url', 'urls')
2770 print_optional('thumbnail')
2771 print_optional('description')
2772 print_optional('filename')
2773 if self.params.get('forceduration') and info_dict.get('duration') is not None:
2774 self.to_stdout(formatSeconds(info_dict['duration']))
2775 print_mandatory('format')
2776
2777 if self.params.get('forcejson'):
2778 self.to_stdout(json.dumps(self.sanitize_info(info_dict)))
2779
2780 def dl(self, name, info, subtitle=False, test=False):
2781 if not info.get('url'):
2782 self.raise_no_formats(info, True)
2783
2784 if test:
2785 verbose = self.params.get('verbose')
2786 params = {
2787 'test': True,
2788 'quiet': self.params.get('quiet') or not verbose,
2789 'verbose': verbose,
2790 'noprogress': not verbose,
2791 'nopart': True,
2792 'skip_unavailable_fragments': False,
2793 'keep_fragments': False,
2794 'overwrites': True,
2795 '_no_ytdl_file': True,
2796 }
2797 else:
2798 params = self.params
2799 fd = get_suitable_downloader(info, params, to_stdout=(name == '-'))(self, params)
2800 if not test:
2801 for ph in self._progress_hooks:
2802 fd.add_progress_hook(ph)
2803 urls = '", "'.join(
2804 (f['url'].split(',')[0] + ',<data>' if f['url'].startswith('data:') else f['url'])
2805 for f in info.get('requested_formats', []) or [info])
2806 self.write_debug(f'Invoking {fd.FD_NAME} downloader on "{urls}"')
2807
2808 # Note: Ideally, info should be deep-copied so that hooks cannot modify it,
2809 # but it may contain objects that are not deep-copyable
2810 new_info = self._copy_infodict(info)
2811 if new_info.get('http_headers') is None:
2812 new_info['http_headers'] = self._calc_headers(new_info)
2813 return fd.download(name, new_info, subtitle)
2814
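# For illustration: dl() is called with a prepared info dict and returns the
# downloader's (success, real_download) pair; real_download is False when an
# already-complete file was reused. A hypothetical call from process_info():
#   success, real_download = self.dl(temp_filename, info_dict)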
2815 def existing_file(self, filepaths, *, default_overwrite=True):
2816 existing_files = list(filter(os.path.exists, orderedSet(filepaths)))
2817 if existing_files and not self.params.get('overwrites', default_overwrite):
2818 return existing_files[0]
2819
2820 for file in existing_files:
2821 self.report_file_delete(file)
2822 os.remove(file)
2823 return None
2824
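# For illustration (hypothetical paths): with overwrites disabled,
# existing_file() returns the first path that already exists so the download
# can be skipped; otherwise it deletes any existing candidates and returns
# None so the download proceeds:
#   self.existing_file(('video.mkv', 'video.f137.mp4'))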
2825 def process_info(self, info_dict):
2826 """Process a single resolved IE result. (Modifies it in-place)"""
2827
2828 assert info_dict.get('_type', 'video') == 'video'
2829 original_infodict = info_dict
2830
2831 if 'format' not in info_dict and 'ext' in info_dict:
2832 info_dict['format'] = info_dict['ext']
2833
2834 # This is mostly just for backward compatibility of process_info
2835 # As a side-effect, this allows for format-specific filters
2836 if self._match_entry(info_dict) is not None:
2837 info_dict['__write_download_archive'] = 'ignore'
2838 return
2839
2840 # Does nothing under normal operation - for backward compatibility of process_info
2841 self.post_extract(info_dict)
2842 self._num_downloads += 1
2843
2844 # info_dict['_filename'] needs to be set for backward compatibility
2845 info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
2846 temp_filename = self.prepare_filename(info_dict, 'temp')
2847 files_to_move = {}
2848
2849 # Forced printings
2850 self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))
2851
2852 def check_max_downloads():
2853 if self._num_downloads >= float(self.params.get('max_downloads') or 'inf'):
2854 raise MaxDownloadsReached()
2855
2856 if self.params.get('simulate'):
2857 info_dict['__write_download_archive'] = self.params.get('force_write_download_archive')
2858 check_max_downloads()
2859 return
2860
2861 if full_filename is None:
2862 return
2863 if not self._ensure_dir_exists(encodeFilename(full_filename)):
2864 return
2865 if not self._ensure_dir_exists(encodeFilename(temp_filename)):
2866 return
2867
2868 if self._write_description('video', info_dict,
2869 self.prepare_filename(info_dict, 'description')) is None:
2870 return
2871
2872 sub_files = self._write_subtitles(info_dict, temp_filename)
2873 if sub_files is None:
2874 return
2875 files_to_move.update(dict(sub_files))
2876
2877 thumb_files = self._write_thumbnails(
2878 'video', info_dict, temp_filename, self.prepare_filename(info_dict, 'thumbnail'))
2879 if thumb_files is None:
2880 return
2881 files_to_move.update(dict(thumb_files))
2882
2883 infofn = self.prepare_filename(info_dict, 'infojson')
2884 _infojson_written = self._write_info_json('video', info_dict, infofn)
2885 if _infojson_written:
2886 info_dict['infojson_filename'] = infofn
2887 # For backward compatibility, even though it was a private field
2888 info_dict['__infojson_filename'] = infofn
2889 elif _infojson_written is None:
2890 return
2891
2892 # Note: Annotations are deprecated
2893 annofn = None
2894 if self.params.get('writeannotations', False):
2895 annofn = self.prepare_filename(info_dict, 'annotation')
2896 if annofn:
2897 if not self._ensure_dir_exists(encodeFilename(annofn)):
2898 return
2899 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(annofn)):
2900 self.to_screen('[info] Video annotations are already present')
2901 elif not info_dict.get('annotations'):
2902 self.report_warning('There are no annotations to write.')
2903 else:
2904 try:
2905 self.to_screen('[info] Writing video annotations to: ' + annofn)
2906 with open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
2907 annofile.write(info_dict['annotations'])
2908 except (KeyError, TypeError):
2909 self.report_warning('There are no annotations to write.')
2910 except OSError:
2911 self.report_error('Cannot write annotations file: ' + annofn)
2912 return
2913
2914 # Write internet shortcut files
2915 def _write_link_file(link_type):
2916 url = try_get(info_dict['webpage_url'], iri_to_uri)
2917 if not url:
2918 self.report_warning(
2919 f'Cannot write internet shortcut file because the actual URL of "{info_dict["webpage_url"]}" is unknown')
2920 return True
2921 linkfn = replace_extension(self.prepare_filename(info_dict, 'link'), link_type, info_dict.get('ext'))
2922 if not self._ensure_dir_exists(encodeFilename(linkfn)):
2923 return False
2924 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)):
2925 self.to_screen(f'[info] Internet shortcut (.{link_type}) is already present')
2926 return True
2927 try:
2928 self.to_screen(f'[info] Writing internet shortcut (.{link_type}) to: {linkfn}')
2929 with open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8',
2930 newline='\r\n' if link_type == 'url' else '\n') as linkfile:
2931 template_vars = {'url': url}
2932 if link_type == 'desktop':
2933 template_vars['filename'] = linkfn[:-(len(link_type) + 1)]
2934 linkfile.write(LINK_TEMPLATES[link_type] % template_vars)
2935 except OSError:
2936 self.report_error(f'Cannot write internet shortcut {linkfn}')
2937 return False
2938 return True
2939
2940 write_links = {
2941 'url': self.params.get('writeurllink'),
2942 'webloc': self.params.get('writewebloclink'),
2943 'desktop': self.params.get('writedesktoplink'),
2944 }
2945 if self.params.get('writelink'):
2946 link_type = ('webloc' if sys.platform == 'darwin'
2947 else 'desktop' if sys.platform.startswith('linux')
2948 else 'url')
2949 write_links[link_type] = True
2950
2951 if any(should_write and not _write_link_file(link_type)
2952 for link_type, should_write in write_links.items()):
2953 return
2954
2955 def replace_info_dict(new_info):
2956 nonlocal info_dict
2957 if new_info == info_dict:
2958 return
2959 info_dict.clear()
2960 info_dict.update(new_info)
2961
2962 new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
2963 replace_info_dict(new_info)
2964
2965 if self.params.get('skip_download'):
2966 info_dict['filepath'] = temp_filename
2967 info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
2968 info_dict['__files_to_move'] = files_to_move
2969 replace_info_dict(self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict))
2970 info_dict['__write_download_archive'] = self.params.get('force_write_download_archive')
2971 else:
2972 # Download
2973 info_dict.setdefault('__postprocessors', [])
2974 try:
2975
2976 def existing_video_file(*filepaths):
2977 ext = info_dict.get('ext')
2978 converted = lambda file: replace_extension(file, self.params.get('final_ext') or ext, ext)
2979 file = self.existing_file(itertools.chain(*zip(map(converted, filepaths), filepaths)),
2980 default_overwrite=False)
2981 if file:
2982 info_dict['ext'] = os.path.splitext(file)[1][1:]
2983 return file
2984
2985 success = True
2986 merger, fd = FFmpegMergerPP(self), None
2987 if info_dict.get('protocol') or info_dict.get('url'):
2988 fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
2989 if fd is not FFmpegFD and (
2990 info_dict.get('section_start') or info_dict.get('section_end')):
2991 msg = ('This format cannot be partially downloaded' if merger.available
2992 else 'You have requested downloading the video partially, but ffmpeg is not installed')
2993 self.report_error(f'{msg}. Aborting')
2994 return
2995
2996 if info_dict.get('requested_formats') is not None:
2997
2998 def compatible_formats(formats):
2999 # TODO: some formats actually allow this (mkv, webm, ogg, mp4), but not all of them.
3000 video_formats = [format for format in formats if format.get('vcodec') != 'none']
3001 audio_formats = [format for format in formats if format.get('acodec') != 'none']
3002 if len(video_formats) > 2 or len(audio_formats) > 2:
3003 return False
3004
3005 # Check extension
3006 exts = {format.get('ext') for format in formats}
3007 COMPATIBLE_EXTS = (
3008 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'},
3009 {'webm'},
3010 )
3011 for ext_sets in COMPATIBLE_EXTS:
3012 if ext_sets.issuperset(exts):
3013 return True
3014 # TODO: Check acodec/vcodec
3015 return False
3016
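# For illustration (hypothetical extensions): {'mp4', 'm4a'} falls within the
# first compatible set and merges as-is, while {'mp4', 'webm'} matches
# neither set, so the merge below falls back to an mkv container.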
3017 requested_formats = info_dict['requested_formats']
3018 old_ext = info_dict['ext']
3019 if self.params.get('merge_output_format') is None:
3020 if not compatible_formats(requested_formats):
3021 info_dict['ext'] = 'mkv'
3022 self.report_warning(
3023 'Requested formats are incompatible for merge and will be merged into mkv')
3024 if (info_dict['ext'] == 'webm'
3025 and info_dict.get('thumbnails')
3026 # check with type instead of pp_key, __name__, or isinstance
3027 # since we don't want any custom PPs to trigger this
3028 and any(type(pp) == EmbedThumbnailPP for pp in self._pps['post_process'])): # noqa: E721
3029 info_dict['ext'] = 'mkv'
3030 self.report_warning(
3031 'webm doesn\'t support embedding a thumbnail, mkv will be used')
3032 new_ext = info_dict['ext']
3033
3034 def correct_ext(filename, ext=new_ext):
3035 if filename == '-':
3036 return filename
3037 filename_real_ext = os.path.splitext(filename)[1][1:]
3038 filename_wo_ext = (
3039 os.path.splitext(filename)[0]
3040 if filename_real_ext in (old_ext, new_ext)
3041 else filename)
3042 return f'{filename_wo_ext}.{ext}'
3043
3044 # Ensure filename always has a correct extension for successful merge
3045 full_filename = correct_ext(full_filename)
3046 temp_filename = correct_ext(temp_filename)
3047 dl_filename = existing_video_file(full_filename, temp_filename)
3048 info_dict['__real_download'] = False
3049
3050 downloaded = []
3051 if dl_filename is not None:
3052 self.report_file_already_downloaded(dl_filename)
3053 elif fd:
3054 for f in requested_formats if fd != FFmpegFD else []:
3055 f['filepath'] = fname = prepend_extension(
3056 correct_ext(temp_filename, info_dict['ext']),
3057 'f%s' % f['format_id'], info_dict['ext'])
3058 downloaded.append(fname)
3059 info_dict['url'] = '\n'.join(f['url'] for f in requested_formats)
3060 success, real_download = self.dl(temp_filename, info_dict)
3061 info_dict['__real_download'] = real_download
3062 else:
3063 if self.params.get('allow_unplayable_formats'):
3064 self.report_warning(
3065 'You have requested merging of multiple formats '
3066 'while also allowing unplayable formats to be downloaded. '
3067 'The formats won\'t be merged to prevent data corruption.')
3068 elif not merger.available:
3069 msg = 'You have requested merging of multiple formats but ffmpeg is not installed'
3070 if not self.params.get('ignoreerrors'):
3071 self.report_error(f'{msg}. Aborting due to --abort-on-error')
3072 return
3073 self.report_warning(f'{msg}. The formats won\'t be merged')
3074
3075 if temp_filename == '-':
3076 reason = ('using a downloader other than ffmpeg' if FFmpegFD.can_merge_formats(info_dict, self.params)
3077 else 'but the formats are incompatible for simultaneous download' if merger.available
3078 else 'but ffmpeg is not installed')
3079 self.report_warning(
3080 f'You have requested downloading multiple formats to stdout {reason}. '
3081 'The formats will be streamed one after the other')
3082 fname = temp_filename
3083 for f in requested_formats:
3084 new_info = dict(info_dict)
3085 del new_info['requested_formats']
3086 new_info.update(f)
3087 if temp_filename != '-':
3088 fname = prepend_extension(
3089 correct_ext(temp_filename, new_info['ext']),
3090 'f%s' % f['format_id'], new_info['ext'])
3091 if not self._ensure_dir_exists(fname):
3092 return
3093 f['filepath'] = fname
3094 downloaded.append(fname)
3095 partial_success, real_download = self.dl(fname, new_info)
3096 info_dict['__real_download'] = info_dict['__real_download'] or real_download
3097 success = success and partial_success
3098
3099 if downloaded and merger.available and not self.params.get('allow_unplayable_formats'):
3100 info_dict['__postprocessors'].append(merger)
3101 info_dict['__files_to_merge'] = downloaded
3102 # Even if no files needed downloading, the merge itself happens only now
3103 info_dict['__real_download'] = True
3104 else:
3105 for file in downloaded:
3106 files_to_move[file] = None
3107 else:
3108 # Just a single file
3109 dl_filename = existing_video_file(full_filename, temp_filename)
3110 if dl_filename is None or dl_filename == temp_filename:
3111 # dl_filename == temp_filename could mean that the file was partially downloaded with --no-part.
3112 # So we should try to resume the download
3113 success, real_download = self.dl(temp_filename, info_dict)
3114 info_dict['__real_download'] = real_download
3115 else:
3116 self.report_file_already_downloaded(dl_filename)
3117
3118 dl_filename = dl_filename or temp_filename
3119 info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
3120
3121 except network_exceptions as err:
3122 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
3123 return
3124 except OSError as err:
3125 raise UnavailableVideoError(err)
3126 except (ContentTooShortError, ) as err:
3127 self.report_error(f'content too short (expected {err.expected} bytes and served {err.downloaded})')
3128 return
3129
3130 self._raise_pending_errors(info_dict)
3131 if success and full_filename != '-':
3132
3133 def fixup():
3134 do_fixup = True
3135 fixup_policy = self.params.get('fixup')
3136 vid = info_dict['id']
3137
3138 if fixup_policy in ('ignore', 'never'):
3139 return
3140 elif fixup_policy == 'warn':
3141 do_fixup = 'warn'
3142 elif fixup_policy != 'force':
3143 assert fixup_policy in ('detect_or_warn', None)
3144 if not info_dict.get('__real_download'):
3145 do_fixup = False
3146
3147 def ffmpeg_fixup(cndn, msg, cls):
3148 if not (do_fixup and cndn):
3149 return
3150 elif do_fixup == 'warn':
3151 self.report_warning(f'{vid}: {msg}')
3152 return
3153 pp = cls(self)
3154 if pp.available:
3155 info_dict['__postprocessors'].append(pp)
3156 else:
3157 self.report_warning(f'{vid}: {msg}. Install ffmpeg to fix this automatically')
3158
3159 stretched_ratio = info_dict.get('stretched_ratio')
3160 ffmpeg_fixup(
3161 stretched_ratio not in (1, None),
3162 f'Non-uniform pixel ratio {stretched_ratio}',
3163 FFmpegFixupStretchedPP)
3164
3165 ffmpeg_fixup(
3166 (info_dict.get('requested_formats') is None
3167 and info_dict.get('container') == 'm4a_dash'
3168 and info_dict.get('ext') == 'm4a'),
3169 'writing DASH m4a. Only some players support this container',
3170 FFmpegFixupM4aPP)
3171
3172 downloader = get_suitable_downloader(info_dict, self.params) if 'protocol' in info_dict else None
3173 downloader = downloader.FD_NAME if downloader else None
3174
3175 if info_dict.get('requested_formats') is None: # Not necessary if doing merger
3176 ffmpeg_fixup((downloader == 'hlsnative' and not self.params.get('hls_use_mpegts'))
3177 or (info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None),
3178 'Possible MPEG-TS in MP4 container or malformed AAC timestamps',
3179 FFmpegFixupM3u8PP)
3180 ffmpeg_fixup(info_dict.get('is_live') and downloader == 'DashSegmentsFD',
3181 'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP)
3182
3183 ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed timestamps detected', FFmpegFixupTimestampPP)
3184 ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed duration detected', FFmpegFixupDurationPP)
3185
3186 fixup()
3187 try:
3188 replace_info_dict(self.post_process(dl_filename, info_dict, files_to_move))
3189 except PostProcessingError as err:
3190 self.report_error('Postprocessing: %s' % str(err))
3191 return
3192 try:
3193 for ph in self._post_hooks:
3194 ph(info_dict['filepath'])
3195 except Exception as err:
3196 self.report_error('post hooks: %s' % str(err))
3197 return
3198 info_dict['__write_download_archive'] = True
3199
3200 assert info_dict is original_infodict # Make sure the info_dict was modified in-place
3201 if self.params.get('force_write_download_archive'):
3202 info_dict['__write_download_archive'] = True
3203 check_max_downloads()
3204
3205 def __download_wrapper(self, func):
3206 @functools.wraps(func)
3207 def wrapper(*args, **kwargs):
3208 try:
3209 res = func(*args, **kwargs)
3210 except UnavailableVideoError as e:
3211 self.report_error(e)
3212 except DownloadCancelled as e:
3213 self.to_screen(f'[info] {e}')
3214 if not self.params.get('break_per_url'):
3215 raise
3216 else:
3217 if self.params.get('dump_single_json', False):
3218 self.post_extract(res)
3219 self.to_stdout(json.dumps(self.sanitize_info(res)))
3220 return wrapper
3221
3222 def download(self, url_list):
3223 """Download a given list of URLs."""
3224 url_list = variadic(url_list) # Passing a single URL is a common mistake
3225 outtmpl = self.params['outtmpl']['default']
3226 if (len(url_list) > 1
3227 and outtmpl != '-'
3228 and '%' not in outtmpl
3229 and self.params.get('max_downloads') != 1):
3230 raise SameFileError(outtmpl)
3231
3232 for url in url_list:
3233 self.__download_wrapper(self.extract_info)(
3234 url, force_generic_extractor=self.params.get('force_generic_extractor', False))
3235
3236 return self._download_retcode
3237
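# Illustrative usage (hypothetical URL and options):
#   with YoutubeDL({'outtmpl': '%(title)s.%(ext)s'}) as ydl:
#       retcode = ydl.download(['https://example.com/video'])
# A single URL string is also accepted, since the list is variadic()-wrapped.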
3238 def download_with_info_file(self, info_filename):
3239 with contextlib.closing(fileinput.FileInput(
3240 [info_filename], mode='r',
3241 openhook=fileinput.hook_encoded('utf-8'))) as f:
3242 # FileInput doesn't have a read method, so we can't call json.load
3243 info = self.sanitize_info(json.loads('\n'.join(f)), self.params.get('clean_infojson', True))
3244 try:
3245 self.__download_wrapper(self.process_ie_result)(info, download=True)
3246 except (DownloadError, EntryNotInPlaylist, ReExtractInfo) as e:
3247 if not isinstance(e, EntryNotInPlaylist):
3248 self.to_stderr('\r')
3249 webpage_url = info.get('webpage_url')
3250 if webpage_url is not None:
3251 self.report_warning(f'The info failed to download: {e}; trying with URL {webpage_url}')
3252 return self.download([webpage_url])
3253 else:
3254 raise
3255 return self._download_retcode
3256
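# For illustration (hypothetical filename): re-run a download from a
# previously saved info JSON, e.g. one written by --write-info-json:
#   retcode = ydl.download_with_info_file('NA.info.json')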
3257 @staticmethod
3258 def sanitize_info(info_dict, remove_private_keys=False):
3259 ''' Sanitize the infodict for converting to JSON '''
3260 if info_dict is None:
3261 return info_dict
3262 info_dict.setdefault('epoch', int(time.time()))
3263 info_dict.setdefault('_type', 'video')
3264
3265 if remove_private_keys:
3266 reject = lambda k, v: v is None or k.startswith('__') or k in {
3267 'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries',
3268 'entries', 'filepath', '_filename', 'infojson_filename', 'original_url', 'playlist_autonumber',
3269 }
3270 else:
3271 reject = lambda k, v: False
3272
3273 def filter_fn(obj):
3274 if isinstance(obj, dict):
3275 return {k: filter_fn(v) for k, v in obj.items() if not reject(k, v)}
3276 elif isinstance(obj, (list, tuple, set, LazyList)):
3277 return list(map(filter_fn, obj))
3278 elif obj is None or isinstance(obj, (str, int, float, bool)):
3279 return obj
3280 else:
3281 return repr(obj)
3282
3283 return filter_fn(info_dict)
3284
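# For illustration (hypothetical dict): with remove_private_keys=True,
# dunder and bookkeeping fields are dropped and non-serializable values
# are repr()'d:
#   YoutubeDL.sanitize_info({'id': 'x', 'filepath': 'x.mp4',
#                            '__real_download': True}, True)
#   # -> {'id': 'x', 'epoch': ..., '_type': 'video'}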
3285 @staticmethod
3286 def filter_requested_info(info_dict, actually_filter=True):
3287 ''' Alias of sanitize_info for backward compatibility '''
3288 return YoutubeDL.sanitize_info(info_dict, actually_filter)
3289
3290 def _delete_downloaded_files(self, *files_to_delete, info={}, msg=None):
3291 for filename in set(filter(None, files_to_delete)):
3292 if msg:
3293 self.to_screen(msg % filename)
3294 try:
3295 os.remove(filename)
3296 except OSError:
3297 self.report_warning(f'Unable to delete file {filename}')
3298 if filename in info.get('__files_to_move', []): # NB: Delete even if None
3299 del info['__files_to_move'][filename]
3300
3301 @staticmethod
3302 def post_extract(info_dict):
3303 def actual_post_extract(info_dict):
3304 if info_dict.get('_type') in ('playlist', 'multi_video'):
3305 for video_dict in info_dict.get('entries', {}):
3306 actual_post_extract(video_dict or {})
3307 return
3308
3309 post_extractor = info_dict.pop('__post_extractor', None) or (lambda: {})
3310 info_dict.update(post_extractor())
3311
3312 actual_post_extract(info_dict or {})
3313
3314 def run_pp(self, pp, infodict):
3315 files_to_delete = []
3316 if '__files_to_move' not in infodict:
3317 infodict['__files_to_move'] = {}
3318 try:
3319 files_to_delete, infodict = pp.run(infodict)
3320 except PostProcessingError as e:
3321 # Must be True and not 'only_download'
3322 if self.params.get('ignoreerrors') is True:
3323 self.report_error(e)
3324 return infodict
3325 raise
3326
3327 if not files_to_delete:
3328 return infodict
3329 if self.params.get('keepvideo', False):
3330 for f in files_to_delete:
3331 infodict['__files_to_move'].setdefault(f, '')
3332 else:
3333 self._delete_downloaded_files(
3334 *files_to_delete, info=infodict, msg='Deleting original file %s (pass -k to keep)')
3335 return infodict
3336
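# For illustration: a minimal custom postprocessor sketch, registered through
# the pre-existing add_post_processor(); run() must return (files_to_delete, info):
#   class MyPP(PostProcessor):  # hypothetical
#       def run(self, info):
#           return [], info
#   ydl.add_post_processor(MyPP(ydl), when='post_process')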
3337 def run_all_pps(self, key, info, *, additional_pps=None):
3338 self._forceprint(key, info)
3339 for pp in (additional_pps or []) + self._pps[key]:
3340 info = self.run_pp(pp, info)
3341 return info
3342
3343 def pre_process(self, ie_info, key='pre_process', files_to_move=None):
3344 info = dict(ie_info)
3345 info['__files_to_move'] = files_to_move or {}
3346 try:
3347 info = self.run_all_pps(key, info)
3348 except PostProcessingError as err:
3349 msg = f'Preprocessing: {err}'
3350 info.setdefault('__pending_error', msg)
3351 self.report_error(msg, is_error=False)
3352 return info, info.pop('__files_to_move', None)
3353
3354 def post_process(self, filename, info, files_to_move=None):
3355 """Run all the postprocessors on the given file."""
3356 info['filepath'] = filename
3357 info['__files_to_move'] = files_to_move or {}
3358 info = self.run_all_pps('post_process', info, additional_pps=info.get('__postprocessors'))
3359 info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
3360 del info['__files_to_move']
3361 return self.run_all_pps('after_move', info)
3362
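# For illustration, the overall postprocessing lifecycle for a single video is
# roughly: pre_process -> before_dl -> (download) -> post_process
# -> MoveFilesAfterDownloadPP -> after_move.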
3363 def _make_archive_id(self, info_dict):
3364 video_id = info_dict.get('id')
3365 if not video_id:
3366 return
3367 # Future-proof against any change in case
3368 # and backwards compatibility with prior versions
3369 extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist
3370 if extractor is None:
3371 url = str_or_none(info_dict.get('url'))
3372 if not url:
3373 return
3374 # Try to find matching extractor for the URL and take its ie_key
3375 for ie_key, ie in self._ies.items():
3376 if ie.suitable(url):
3377 extractor = ie_key
3378 break
3379 else:
3380 return
3381 return f'{extractor.lower()} {video_id}'
3382
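# For illustration (hypothetical values): archive entries have the form
# '<extractor_key in lowercase> <video id>', e.g.
#   self._make_archive_id({'extractor_key': 'Youtube', 'id': 'abc123'})
#   # -> 'youtube abc123'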
3383 def in_download_archive(self, info_dict):
3384 fn = self.params.get('download_archive')
3385 if fn is None:
3386 return False
3387
3388 vid_id = self._make_archive_id(info_dict)
3389 if not vid_id:
3390 return False # Incomplete video information
3391
3392 return vid_id in self.archive
3393
3394 def record_download_archive(self, info_dict):
3395 fn = self.params.get('download_archive')
3396 if fn is None:
3397 return
3398 vid_id = self._make_archive_id(info_dict)
3399 assert vid_id
3400 self.write_debug(f'Adding to archive: {vid_id}')
3401 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
3402 archive_file.write(vid_id + '\n')
3403 self.archive.add(vid_id)
3404
3405 @staticmethod
3406 def format_resolution(format, default='unknown'):
3407 if format.get('vcodec') == 'none' and format.get('acodec') != 'none':
3408 return 'audio only'
3409 if format.get('resolution') is not None:
3410 return format['resolution']
3411 if format.get('width') and format.get('height'):
3412 return '%dx%d' % (format['width'], format['height'])
3413 elif format.get('height'):
3414 return '%sp' % format['height']
3415 elif format.get('width'):
3416 return '%dx?' % format['width']
3417 return default
3418
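# For illustration (hypothetical format dicts):
#   {'width': 1920, 'height': 1080}      -> '1920x1080'
#   {'height': 720}                      -> '720p'
#   {'vcodec': 'none', 'acodec': 'mp4a'} -> 'audio only'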
3419 def _list_format_headers(self, *headers):
3420 if self.params.get('listformats_table', True) is not False:
3421 return [self._format_out(header, self.Styles.HEADERS) for header in headers]
3422 return headers
3423
3424 def _format_note(self, fdict):
3425 res = ''
3426 if fdict.get('ext') in ['f4f', 'f4m']:
3427 res += '(unsupported)'
3428 if fdict.get('language'):
3429 if res:
3430 res += ' '
3431 res += '[%s]' % fdict['language']
3432 if fdict.get('format_note') is not None:
3433 if res:
3434 res += ' '
3435 res += fdict['format_note']
3436 if fdict.get('tbr') is not None:
3437 if res:
3438 res += ', '
3439 res += '%4dk' % fdict['tbr']
3440 if fdict.get('container') is not None:
3441 if res:
3442 res += ', '
3443 res += '%s container' % fdict['container']
3444 if (fdict.get('vcodec') is not None
3445 and fdict.get('vcodec') != 'none'):
3446 if res:
3447 res += ', '
3448 res += fdict['vcodec']
3449 if fdict.get('vbr') is not None:
3450 res += '@'
3451 elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
3452 res += 'video@'
3453 if fdict.get('vbr') is not None:
3454 res += '%4dk' % fdict['vbr']
3455 if fdict.get('fps') is not None:
3456 if res:
3457 res += ', '
3458 res += '%sfps' % fdict['fps']
3459 if fdict.get('acodec') is not None:
3460 if res:
3461 res += ', '
3462 if fdict['acodec'] == 'none':
3463 res += 'video only'
3464 else:
3465 res += '%-5s' % fdict['acodec']
3466 elif fdict.get('abr') is not None:
3467 if res:
3468 res += ', '
3469 res += 'audio'
3470 if fdict.get('abr') is not None:
3471 res += '@%3dk' % fdict['abr']
3472 if fdict.get('asr') is not None:
3473 res += ' (%5dHz)' % fdict['asr']
3474 if fdict.get('filesize') is not None:
3475 if res:
3476 res += ', '
3477 res += format_bytes(fdict['filesize'])
3478 elif fdict.get('filesize_approx') is not None:
3479 if res:
3480 res += ', '
3481 res += '~' + format_bytes(fdict['filesize_approx'])
3482 return res
3483
3484 def render_formats_table(self, info_dict):
3485 if not info_dict.get('formats') and not info_dict.get('url'):
3486 return None
3487
3488 formats = info_dict.get('formats', [info_dict])
3489 if self.params.get('listformats_table', True) is False:
3490 table = [
3491 [
3492 format_field(f, 'format_id'),
3493 format_field(f, 'ext'),
3494 self.format_resolution(f),
3495 self._format_note(f)
3496 ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
3497 return render_table(['format code', 'extension', 'resolution', 'note'], table, extra_gap=1)
3498
3499 delim = self._format_out('\u2502', self.Styles.DELIM, '|', test_encoding=True)
3500 table = [
3501 [
3502 self._format_out(format_field(f, 'format_id'), self.Styles.ID),
3503 format_field(f, 'ext'),
3504 format_field(f, func=self.format_resolution, ignore=('audio only', 'images')),
3505 format_field(f, 'fps', '\t%d'),
3506 format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''),
3507 delim,
3508 format_field(f, 'filesize', ' \t%s', func=format_bytes) + format_field(f, 'filesize_approx', '~\t%s', func=format_bytes),
3509 format_field(f, 'tbr', '\t%dk'),
3510 shorten_protocol_name(f.get('protocol', '')),
3511 delim,
3512 format_field(f, 'vcodec', default='unknown').replace(
3513 'none', 'images' if f.get('acodec') == 'none'
3514 else self._format_out('audio only', self.Styles.SUPPRESS)),
3515 format_field(f, 'vbr', '\t%dk'),
3516 format_field(f, 'acodec', default='unknown').replace(
3517 'none', '' if f.get('vcodec') == 'none'
3518 else self._format_out('video only', self.Styles.SUPPRESS)),
3519 format_field(f, 'abr', '\t%dk'),
3520 format_field(f, 'asr', '\t%dHz'),
3521 join_nonempty(
3522 self._format_out('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None,
3523 format_field(f, 'language', '[%s]'),
3524 join_nonempty(format_field(f, 'format_note'),
3525 format_field(f, 'container', ignore=(None, f.get('ext'))),
3526 delim=', '),
3527 delim=' '),
3528 ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
3529 header_line = self._list_format_headers(
3530 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', delim, '\tFILESIZE', '\tTBR', 'PROTO',
3531 delim, 'VCODEC', '\tVBR', 'ACODEC', '\tABR', '\tASR', 'MORE INFO')
3532
3533 return render_table(
3534 header_line, table, hide_empty=True,
3535 delim=self._format_out('\u2500', self.Styles.DELIM, '-', test_encoding=True))
3536
3537 def render_thumbnails_table(self, info_dict):
3538 thumbnails = list(info_dict.get('thumbnails') or [])
3539 if not thumbnails:
3540 return None
3541 return render_table(
3542 self._list_format_headers('ID', 'Width', 'Height', 'URL'),
3543 [[t.get('id'), t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])
3544
3545 def render_subtitles_table(self, video_id, subtitles):
3546 def _row(lang, formats):
3547 exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats)))
3548 if len(set(names)) == 1:
3549 names = [] if names[0] == 'unknown' else names[:1]
3550 return [lang, ', '.join(names), ', '.join(exts)]
3551
3552 if not subtitles:
3553 return None
3554 return render_table(
3555 self._list_format_headers('Language', 'Name', 'Formats'),
3556 [_row(lang, formats) for lang, formats in subtitles.items()],
3557 hide_empty=True)
3558
3559 def __list_table(self, video_id, name, func, *args):
3560 table = func(*args)
3561 if not table:
3562 self.to_screen(f'{video_id} has no {name}')
3563 return
3564 self.to_screen(f'[info] Available {name} for {video_id}:')
3565 self.to_stdout(table)
3566
3567 def list_formats(self, info_dict):
3568 self.__list_table(info_dict['id'], 'formats', self.render_formats_table, info_dict)
3569
3570 def list_thumbnails(self, info_dict):
3571 self.__list_table(info_dict['id'], 'thumbnails', self.render_thumbnails_table, info_dict)
3572
3573 def list_subtitles(self, video_id, subtitles, name='subtitles'):
3574 self.__list_table(video_id, name, self.render_subtitles_table, video_id, subtitles)
3575
3576 def urlopen(self, req):
3577 """ Start an HTTP download """
3578 if isinstance(req, str):
3579 req = sanitized_Request(req)
3580 return self._opener.open(req, timeout=self._socket_timeout)
3581
3582 def print_debug_header(self):
3583 if not self.params.get('verbose'):
3584 return
3585
3586 # These imports can be slow. So import them only as needed
3587 from .extractor.extractors import _LAZY_LOADER
3588 from .extractor.extractors import _PLUGIN_CLASSES as plugin_extractors
3589
3590 def get_encoding(stream):
3591 ret = str(getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__))
3592 if not supports_terminal_sequences(stream):
3593 from .utils import WINDOWS_VT_MODE # Must be imported locally
3594 ret += ' (No VT)' if WINDOWS_VT_MODE is False else ' (No ANSI)'
3595 return ret
3596
3597 encoding_str = 'Encodings: locale %s, fs %s, pref %s, %s' % (
3598 locale.getpreferredencoding(),
3599 sys.getfilesystemencoding(),
3600 self.get_encoding(),
3601 ', '.join(
3602 f'{key} {get_encoding(stream)}' for key, stream in self._out_files.items_
3603 if stream is not None and key != 'console')
3604 )
3605
3606 logger = self.params.get('logger')
3607 if logger:
3608 write_debug = lambda msg: logger.debug(f'[debug] {msg}')
3609 write_debug(encoding_str)
3610 else:
3611 write_string(f'[debug] {encoding_str}\n', encoding=None)
3612 write_debug = lambda msg: self._write_string(f'[debug] {msg}\n')
3613
3614 source = detect_variant()
3615 write_debug(join_nonempty(
3616 'yt-dlp version', __version__,
3617 f'[{RELEASE_GIT_HEAD}]' if RELEASE_GIT_HEAD else '',
3618 '' if source == 'unknown' else f'({source})',
3619 delim=' '))
3620 if not _LAZY_LOADER:
3621 if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'):
3622 write_debug('Lazy loading extractors is forcibly disabled')
3623 else:
3624 write_debug('Lazy loading extractors is disabled')
3625 if plugin_extractors or plugin_postprocessors:
3626 write_debug('Plugins: %s' % [
3627 '%s%s' % (klass.__name__, '' if klass.__name__ == name else f' as {name}')
3628 for name, klass in itertools.chain(plugin_extractors.items(), plugin_postprocessors.items())])
3629 if self.params['compat_opts']:
3630 write_debug('Compatibility options: %s' % ', '.join(self.params['compat_opts']))
3631
3632 if source == 'source':
3633 try:
3634 stdout, _, _ = Popen.run(
3635 ['git', 'rev-parse', '--short', 'HEAD'],
3636 text=True, cwd=os.path.dirname(os.path.abspath(__file__)),
3637 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3638 if re.fullmatch('[0-9a-f]+', stdout.strip()):
3639 write_debug(f'Git HEAD: {stdout.strip()}')
3640 except Exception:
3641 with contextlib.suppress(Exception):
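# sys.exc_clear() exists only on Python 2; on Python 3 the
# NameError it raises is swallowed by this suppress()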
3642 sys.exc_clear()
3643
3644 def python_implementation():
3645 impl_name = platform.python_implementation()
3646 if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'):
3647 return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3]
3648 return impl_name
3649
3650 write_debug('Python version %s (%s %s) - %s' % (
3651 platform.python_version(),
3652 python_implementation(),
3653 platform.architecture()[0],
3654 platform_name()))
3655
3656 exe_versions, ffmpeg_features = FFmpegPostProcessor.get_versions_and_features(self)
3657 ffmpeg_features = {key for key, val in ffmpeg_features.items() if val}
3658 if ffmpeg_features:
3659 exe_versions['ffmpeg'] += ' (%s)' % ','.join(sorted(ffmpeg_features))
3660
3661 exe_versions['rtmpdump'] = rtmpdump_version()
3662 exe_versions['phantomjs'] = PhantomJSwrapper._version()
3663 exe_str = ', '.join(
3664 f'{exe} {v}' for exe, v in sorted(exe_versions.items()) if v
3665 ) or 'none'
3666 write_debug('exe versions: %s' % exe_str)
3667
3668 from .compat.compat_utils import get_package_info
3669 from .dependencies import available_dependencies
3670
3671 write_debug('Optional libraries: %s' % (', '.join(sorted({
3672 join_nonempty(*get_package_info(m)) for m in available_dependencies.values()
3673 })) or 'none'))
3674
3675 self._setup_opener()
3676 proxy_map = {}
3677 for handler in self._opener.handlers:
3678 if hasattr(handler, 'proxies'):
3679 proxy_map.update(handler.proxies)
3680 write_debug(f'Proxy map: {proxy_map}')
3681
3682 # Not implemented
3683 if False and self.params.get('call_home'):
3684 ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode()
3685 write_debug('Public IP address: %s' % ipaddr)
3686 latest_version = self.urlopen(
3687 'https://yt-dl.org/latest/version').read().decode()
3688 if version_tuple(latest_version) > version_tuple(__version__):
3689 self.report_warning(
3690 'You are using an outdated version (newest version: %s)! '
3691 'See https://yt-dl.org/update if you need help updating.' %
3692 latest_version)
3693
3694 def _setup_opener(self):
3695 if hasattr(self, '_opener'):
3696 return
3697 timeout_val = self.params.get('socket_timeout')
3698 self._socket_timeout = 20 if timeout_val is None else float(timeout_val)
3699
3700 opts_cookiesfrombrowser = self.params.get('cookiesfrombrowser')
3701 opts_cookiefile = self.params.get('cookiefile')
3702 opts_proxy = self.params.get('proxy')
3703
3704 self.cookiejar = load_cookies(opts_cookiefile, opts_cookiesfrombrowser, self)
3705
3706 cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
3707 if opts_proxy is not None:
3708 if opts_proxy == '':
3709 proxies = {}
3710 else:
3711 proxies = {'http': opts_proxy, 'https': opts_proxy}
3712 else:
3713 proxies = compat_urllib_request.getproxies()
3714 # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805)
3715 if 'http' in proxies and 'https' not in proxies:
3716 proxies['https'] = proxies['http']
3717 proxy_handler = PerRequestProxyHandler(proxies)
3718
3719 debuglevel = 1 if self.params.get('debug_printtraffic') else 0
3720 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
3721 ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
3722 redirect_handler = YoutubeDLRedirectHandler()
3723 data_handler = urllib.request.DataHandler()
3724
3725 # When passing our own FileHandler instance, build_opener won't add the
3726 # default FileHandler and allows us to disable the file protocol, which
3727 # can be used for malicious purposes (see
3728 # https://github.com/ytdl-org/youtube-dl/issues/8227)
3729 file_handler = compat_urllib_request.FileHandler()
3730
3731 def file_open(*args, **kwargs):
3732 raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in yt-dlp for security reasons')
3733 file_handler.file_open = file_open
3734
3735 opener = compat_urllib_request.build_opener(
3736 proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)
3737
3738 # Delete the default user-agent header, which would otherwise apply in
3739 # cases where our custom HTTP handler doesn't come into play
3740 # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
3741 opener.addheaders = []
3742 self._opener = opener
3743
3744 def encode(self, s):
3745 if isinstance(s, bytes):
3746 return s # Already encoded
3747
3748 try:
3749 return s.encode(self.get_encoding())
3750 except UnicodeEncodeError as err:
3751 err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
3752 raise
3753
3754 def get_encoding(self):
3755 encoding = self.params.get('encoding')
3756 if encoding is None:
3757 encoding = preferredencoding()
3758 return encoding
3759
3760 def _write_info_json(self, label, ie_result, infofn, overwrite=None):
3761 ''' Write infojson and return True = written, 'exists' = already exists, False = skip, None = error '''
3762 if overwrite is None:
3763 overwrite = self.params.get('overwrites', True)
3764 if not self.params.get('writeinfojson'):
3765 return False
3766 elif not infofn:
3767 self.write_debug(f'Skipping writing {label} infojson')
3768 return False
3769 elif not self._ensure_dir_exists(infofn):
3770 return None
3771 elif not overwrite and os.path.exists(infofn):
3772 self.to_screen(f'[info] {label.title()} metadata is already present')
3773 return 'exists'
3774
3775 self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}')
3776 try:
3777 write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn)
3778 return True
3779 except OSError:
3780 self.report_error(f'Cannot write {label} metadata to JSON file {infofn}')
3781 return None
3782
3783 def _write_description(self, label, ie_result, descfn):
3784 ''' Write description and return True = written, False = skip, None = error '''
3785 if not self.params.get('writedescription'):
3786 return False
3787 elif not descfn:
3788 self.write_debug(f'Skipping writing {label} description')
3789 return False
3790 elif not self._ensure_dir_exists(descfn):
3791 return None
3792 elif not self.params.get('overwrites', True) and os.path.exists(descfn):
3793 self.to_screen(f'[info] {label.title()} description is already present')
3794 elif ie_result.get('description') is None:
3795 self.report_warning(f'There\'s no {label} description to write')
3796 return False
3797 else:
3798 try:
3799 self.to_screen(f'[info] Writing {label} description to: {descfn}')
3800 with open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
3801 descfile.write(ie_result['description'])
3802 except OSError:
3803 self.report_error(f'Cannot write {label} description file {descfn}')
3804 return None
3805 return True
3806
3807 def _write_subtitles(self, info_dict, filename):
3808 ''' Write subtitles to file and return a list of (sub_filename, final_sub_filename); or None on error '''
3809 ret = []
3810 subtitles = info_dict.get('requested_subtitles')
3811 if not subtitles or not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')):
3812 # Subtitle download errors are already handled in the relevant IE,
3813 # so this silently continues when used with an IE lacking subtitle support
3814 return ret
3815
3816 sub_filename_base = self.prepare_filename(info_dict, 'subtitle')
3817 if not sub_filename_base:
3818 self.to_screen('[info] Skipping writing video subtitles')
3819 return ret
3820 for sub_lang, sub_info in subtitles.items():
3821 sub_format = sub_info['ext']
3822 sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
3823 sub_filename_final = subtitles_filename(sub_filename_base, sub_lang, sub_format, info_dict.get('ext'))
3824 existing_sub = self.existing_file((sub_filename_final, sub_filename))
3825 if existing_sub:
3826 self.to_screen(f'[info] Video subtitle {sub_lang}.{sub_format} is already present')
3827 sub_info['filepath'] = existing_sub
3828 ret.append((existing_sub, sub_filename_final))
3829 continue
3830
3831 self.to_screen(f'[info] Writing video subtitles to: {sub_filename}')
3832 if sub_info.get('data') is not None:
3833 try:
3834 # Use newline='' to prevent conversion of newline characters
3835 # See https://github.com/ytdl-org/youtube-dl/issues/10268
3836 with open(sub_filename, 'w', encoding='utf-8', newline='') as subfile:
3837 subfile.write(sub_info['data'])
3838 sub_info['filepath'] = sub_filename
3839 ret.append((sub_filename, sub_filename_final))
3840 continue
3841 except OSError:
3842 self.report_error(f'Cannot write video subtitles file {sub_filename}')
3843 return None
3844
3845 try:
3846 sub_copy = sub_info.copy()
3847 sub_copy.setdefault('http_headers', info_dict.get('http_headers'))
3848 self.dl(sub_filename, sub_copy, subtitle=True)
3849 sub_info['filepath'] = sub_filename
3850 ret.append((sub_filename, sub_filename_final))
3851 except (DownloadError, ExtractorError, OSError, ValueError) + network_exceptions as err:
3852 msg = f'Unable to download video subtitles for {sub_lang!r}: {err}'
3853 if self.params.get('ignoreerrors') is not True: # False or 'only_download'
3854 if not self.params.get('ignoreerrors'):
3855 self.report_error(msg)
3856 raise DownloadError(msg)
3857 self.report_warning(msg)
3858 return ret
3859
3860 def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None):
3861 ''' Write thumbnails to file and return list of (thumb_filename, final_thumb_filename) '''
3862 write_all = self.params.get('write_all_thumbnails', False)
3863 thumbnails, ret = [], []
3864 if write_all or self.params.get('writethumbnail', False):
3865 thumbnails = info_dict.get('thumbnails') or []
3866 multiple = write_all and len(thumbnails) > 1
3867
3868 if thumb_filename_base is None:
3869 thumb_filename_base = filename
3870 if thumbnails and not thumb_filename_base:
3871 self.write_debug(f'Skipping writing {label} thumbnail')
3872 return ret
3873
3874 for idx, t in list(enumerate(thumbnails))[::-1]:
3875 thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg')
3876 thumb_display_id = f'{label} thumbnail {t["id"]}'
3877 thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext'))
3878 thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext'))
3879
3880 existing_thumb = self.existing_file((thumb_filename_final, thumb_filename))
3881 if existing_thumb:
3882 self.to_screen('[info] %s is already present' % (
3883 thumb_display_id if multiple else f'{label} thumbnail').capitalize())
3884 t['filepath'] = existing_thumb
3885 ret.append((existing_thumb, thumb_filename_final))
3886 else:
3887 self.to_screen(f'[info] Downloading {thumb_display_id} ...')
3888 try:
3889 uf = self.urlopen(sanitized_Request(t['url'], headers=t.get('http_headers', {})))
3890 self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}')
3891 with open(encodeFilename(thumb_filename), 'wb') as thumbf:
3892 shutil.copyfileobj(uf, thumbf)
3893 ret.append((thumb_filename, thumb_filename_final))
3894 t['filepath'] = thumb_filename
3895 except network_exceptions as err:
3896 thumbnails.pop(idx)
3897 self.report_warning(f'Unable to download {thumb_display_id}: {err}')
3898 if ret and not write_all:
3899 break
3900 return ret