#!/usr/bin/env python3
# coding: utf-8

from __future__ import absolute_import, unicode_literals

import collections
import contextlib
import datetime
import errno
import fileinput
import functools
import io
import itertools
import json
import locale
import operator
import os
import platform
import re
import shutil
import subprocess
import sys
import tempfile
import time
import tokenize
import traceback
import random
import unicodedata

from enum import Enum
from string import ascii_letters

from .compat import (
    compat_basestring,
    compat_get_terminal_size,
    compat_kwargs,
    compat_numeric_types,
    compat_os_name,
    compat_pycrypto_AES,
    compat_shlex_quote,
    compat_str,
    compat_tokenize_tokenize,
    compat_urllib_error,
    compat_urllib_request,
    compat_urllib_request_DataHandler,
    windows_enable_vt_mode,
)
from .cookies import load_cookies
from .utils import (
    age_restricted,
    args_to_str,
    ContentTooShortError,
    date_from_str,
    DateRange,
    DEFAULT_OUTTMPL,
    determine_ext,
    determine_protocol,
    DownloadCancelled,
    DownloadError,
    encode_compat_str,
    encodeFilename,
    EntryNotInPlaylist,
    error_to_compat_str,
    ExistingVideoReached,
    expand_path,
    ExtractorError,
    float_or_none,
    format_bytes,
    format_field,
    format_decimal_suffix,
    formatSeconds,
    GeoRestrictedError,
    get_domain,
    HEADRequest,
    int_or_none,
    iri_to_uri,
    ISO3166Utils,
    join_nonempty,
    LazyList,
    LINK_TEMPLATES,
    locked_file,
    make_dir,
    make_HTTPS_handler,
    MaxDownloadsReached,
    network_exceptions,
    number_of_digits,
    orderedSet,
    OUTTMPL_TYPES,
    PagedList,
    parse_filesize,
    PerRequestProxyHandler,
    platform_name,
    Popen,
    PostProcessingError,
    preferredencoding,
    prepend_extension,
    ReExtractInfo,
    register_socks_protocols,
    RejectedVideoReached,
    remove_terminal_sequences,
    render_table,
    replace_extension,
    SameFileError,
    sanitize_filename,
    sanitize_path,
    sanitize_url,
    sanitized_Request,
    std_headers,
    STR_FORMAT_RE_TMPL,
    STR_FORMAT_TYPES,
    str_or_none,
    strftime_or_none,
    subtitles_filename,
    supports_terminal_sequences,
    timetuple_from_msec,
    to_high_limit_path,
    traverse_obj,
    try_get,
    UnavailableVideoError,
    url_basename,
    variadic,
    version_tuple,
    write_json_file,
    write_string,
    YoutubeDLCookieProcessor,
    YoutubeDLHandler,
    YoutubeDLRedirectHandler,
)
from .cache import Cache
from .minicurses import format_text
from .extractor import (
    gen_extractor_classes,
    get_info_extractor,
    _LAZY_LOADER,
    _PLUGIN_CLASSES as plugin_extractors
)
from .extractor.openload import PhantomJSwrapper
from .downloader import (
    FFmpegFD,
    get_suitable_downloader,
    shorten_protocol_name
)
from .downloader.rtmp import rtmpdump_version
from .postprocessor import (
    get_postprocessor,
    EmbedThumbnailPP,
    FFmpegFixupDuplicateMoovPP,
    FFmpegFixupDurationPP,
    FFmpegFixupM3u8PP,
    FFmpegFixupM4aPP,
    FFmpegFixupStretchedPP,
    FFmpegFixupTimestampPP,
    FFmpegMergerPP,
    FFmpegPostProcessor,
    MoveFilesAfterDownloadPP,
    _PLUGIN_CLASSES as plugin_postprocessors
)
from .update import detect_variant
from .version import __version__, RELEASE_GIT_HEAD

if compat_os_name == 'nt':
    import ctypes

class YoutubeDL(object):
    """YoutubeDL class.

    YoutubeDL objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. Since, given a video URL, the downloader doesn't know how to
    extract all the needed information itself (a task that InfoExtractors
    handle), it has to pass the URL to one of them.

    For this, YoutubeDL objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    YoutubeDL processes the extracted information, possibly using a File
    Downloader to download the video.

    YoutubeDL objects accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The YoutubeDL also
    registers itself as the downloader in charge of the InfoExtractors
    that are added to it, so this is a "mutual registration".

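    A minimal usage sketch (the URL and the 'format' value here are
    illustrative, not defaults):

        from yt_dlp import YoutubeDL

        with YoutubeDL({'format': 'bestvideo+bestaudio/best'}) as ydl:
            ydl.download(['https://www.youtube.com/watch?v=...'])
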
    Available options:

    username:          Username for authentication purposes.
    password:          Password for authentication purposes.
    videopassword:     Password for accessing a video.
    ap_mso:            Adobe Pass multiple-system operator identifier.
    ap_username:       Multiple-system operator account username.
    ap_password:       Multiple-system operator account password.
    usenetrc:          Use netrc for authentication instead.
    verbose:           Print additional info to stdout.
    quiet:             Do not print messages to stdout.
    no_warnings:       Do not print out anything for warnings.
    forceprint:        A list of templates to force print
    forceurl:          Force printing final URL. (Deprecated)
    forcetitle:        Force printing title. (Deprecated)
    forceid:           Force printing ID. (Deprecated)
    forcethumbnail:    Force printing thumbnail URL. (Deprecated)
    forcedescription:  Force printing description. (Deprecated)
    forcefilename:     Force printing final filename. (Deprecated)
    forceduration:     Force printing duration. (Deprecated)
    forcejson:         Force printing info_dict as JSON.
    dump_single_json:  Force printing the info_dict of the whole playlist
                       (or video) as a single JSON line.
    force_write_download_archive: Force writing download archive regardless
                       of 'skip_download' or 'simulate'.
    simulate:          Do not download the video files. If unset (or None),
                       simulate only if listsubtitles, listformats or list_thumbnails is used
    format:            Video format code. see "FORMAT SELECTION" for more details.
                       You can also pass a function. The function takes 'ctx' as
                       argument and returns the formats to download.
                       See "build_format_selector" for an implementation
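                       A minimal sketch of such a function (illustrative; it
                       assumes the extracted formats are sorted worst-to-best):

                           def best_format_selector(ctx):
                               # ctx['formats'] holds the extracted formats
                               yield ctx['formats'][-1]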
    allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded.
    ignore_no_formats_error: Ignore "No video formats" error. Useful for
                       extracting metadata even if the video is not actually
                       available for download (experimental)
    format_sort:       A list of fields by which to sort the video formats.
                       See "Sorting Formats" for more details.
    format_sort_force: Force the given format_sort. see "Sorting Formats"
                       for more details.
    allow_multiple_video_streams: Allow multiple video streams to be merged
                       into a single file
    allow_multiple_audio_streams: Allow multiple audio streams to be merged
                       into a single file
    check_formats:     Whether to test if the formats are downloadable.
                       Can be True (check all), False (check none),
                       'selected' (check selected formats),
                       or None (check only if requested by extractor)
    paths:             Dictionary of output paths. The allowed keys are 'home',
                       'temp' and the keys of OUTTMPL_TYPES (in utils.py)
    outtmpl:           Dictionary of templates for output names. Allowed keys
                       are 'default' and the keys of OUTTMPL_TYPES (in utils.py).
                       For compatibility with youtube-dl, a single string can also be used
    outtmpl_na_placeholder: Placeholder for unavailable meta fields.
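                       An illustrative combination of 'paths' and 'outtmpl'
                       (hypothetical values):
                           'paths': {'home': '~/Videos', 'temp': '/tmp/ydl'},
                           'outtmpl': {'default': '%(title)s [%(id)s].%(ext)s'}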
    restrictfilenames: Do not allow "&" and spaces in file names
    trim_file_name:    Limit length of filename (extension excluded)
    windowsfilenames:  Force the filenames to be windows compatible
    ignoreerrors:      Do not stop on download/postprocessing errors.
                       Can be 'only_download' to ignore only download errors.
                       Default is 'only_download' for CLI, but False for API
    skip_playlist_after_errors: Number of allowed failures until the rest of
                       the playlist is skipped
    force_generic_extractor: Force downloader to use the generic extractor
    overwrites:        Overwrite all video and metadata files if True,
                       overwrite only non-video files if None
                       and don't overwrite any file if False
                       For compatibility with youtube-dl,
                       "nooverwrites" may also be used instead
    playliststart:     Playlist item to start at.
    playlistend:       Playlist item to end at.
    playlist_items:    Specific indices of playlist to download.
    playlistreverse:   Download playlist items in reverse order.
    playlistrandom:    Download playlist items in random order.
    matchtitle:        Download only matching titles.
    rejecttitle:       Reject downloads for matching titles.
    logger:            Log messages to a logging.Logger instance.
    logtostderr:       Log messages to stderr instead of stdout.
    consoletitle:      Display progress in console window's titlebar.
    writedescription:  Write the video description to a .description file
    writeinfojson:     Write the video metadata to a .info.json file
    clean_infojson:    Remove private fields from the infojson
    getcomments:       Extract video comments. This will not be written to disk
                       unless writeinfojson is also given
    writeannotations:  Write the video annotations to a .annotations.xml file
    writethumbnail:    Write the thumbnail image to a file
    allow_playlist_files: Whether to write playlists' description, infojson etc
                       also to disk when using the 'write*' options
    write_all_thumbnails: Write all thumbnail formats to files
    writelink:         Write an internet shortcut file, depending on the
                       current platform (.url/.webloc/.desktop)
    writeurllink:      Write a Windows internet shortcut file (.url)
    writewebloclink:   Write a macOS internet shortcut file (.webloc)
    writedesktoplink:  Write a Linux internet shortcut file (.desktop)
    writesubtitles:    Write the video subtitles to a file
    writeautomaticsub: Write the automatically generated subtitles to a file
    allsubtitles:      Deprecated - Use subtitleslangs = ['all']
                       Downloads all the subtitles of the video
                       (requires writesubtitles or writeautomaticsub)
    listsubtitles:     Lists all available subtitles for the video
    subtitlesformat:   The format code for subtitles
    subtitleslangs:    List of languages of the subtitles to download (can be regex).
                       The list may contain "all" to refer to all the available
                       subtitles. The language can be prefixed with a "-" to
                       exclude it from the requested languages. Eg: ['all', '-live_chat']
    keepvideo:         Keep the video file after post-processing
    daterange:         A DateRange object, download only if the upload_date is in the range.
    skip_download:     Skip the actual download of the video file
    cachedir:          Location of the cache files in the filesystem.
                       False to disable filesystem cache.
    noplaylist:        Download single video instead of a playlist if in doubt.
    age_limit:         An integer representing the user's age in years.
                       Unsuitable videos for the given age are skipped.
    min_views:         An integer representing the minimum view count the video
                       must have in order to not be skipped.
                       Videos without view count information are always
                       downloaded. None for no limit.
    max_views:         An integer representing the maximum view count.
                       Videos that are more popular than that are not
                       downloaded.
                       Videos without view count information are always
                       downloaded. None for no limit.
    download_archive:  File name of a file where all downloads are recorded.
                       Videos already present in the file are not downloaded
                       again.
    break_on_existing: Stop the download process after attempting to download a
                       file that is in the archive.
    break_on_reject:   Stop the download process when encountering a video that
                       has been filtered out.
    break_per_url:     Whether break_on_reject and break_on_existing
                       should act on each input URL as opposed to for the entire queue
    cookiefile:        File name where cookies should be read from and dumped to
    cookiesfrombrowser: A tuple containing the name of the browser and the profile
                       name/path from where cookies are loaded.
                       Eg: ('chrome', ) or ('vivaldi', 'default')
    nocheckcertificate:Do not verify SSL certificates
    prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
                       At the moment, this is only supported by YouTube.
    proxy:             URL of the proxy server to use
    geo_verification_proxy: URL of the proxy to use for IP address verification
                       on geo-restricted sites.
    socket_timeout:    Time to wait for unresponsive hosts, in seconds
    bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
    debug_printtraffic:Print out sent and received HTTP traffic
    include_ads:       Download ads as well (deprecated)
    default_search:    Prepend this string if an input url is not valid.
                       'auto' for elaborate guessing
    encoding:          Use this encoding instead of the system-specified.
    extract_flat:      Do not resolve URLs, return the immediate result.
                       Pass in 'in_playlist' to only show this behavior for
                       playlist items.
    wait_for_video:    If given, wait for scheduled streams to become available.
                       The value should be a tuple containing the range
                       (min_secs, max_secs) to wait between retries
    postprocessors:    A list of dictionaries, each with an entry
                       * key: The name of the postprocessor. See
                         yt_dlp/postprocessor/__init__.py for a list.
                       * when: When to run the postprocessor. Can be one of
                         pre_process|before_dl|post_process|after_move.
                         Assumed to be 'post_process' if not given
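                       An illustrative entry (the key and its arguments here
                       are examples, not defaults):
                           {'key': 'FFmpegExtractAudio',
                            'preferredcodec': 'mp3',
                            'when': 'post_process'}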
    post_hooks:        Deprecated - Register a custom postprocessor instead
                       A list of functions that get called as the final step
                       for each video file, after all postprocessors have been
                       called. The filename will be passed as the only argument.
    progress_hooks:    A list of functions that get called on download
                       progress, with a dictionary with the entries
                       * status: One of "downloading", "error", or "finished".
                         Check this first and ignore unknown values.
                       * info_dict: The extracted info_dict

                       If status is one of "downloading", or "finished", the
                       following properties may also be present:
                       * filename: The final filename (always present)
                       * tmpfilename: The filename we're currently writing to
                       * downloaded_bytes: Bytes on disk
                       * total_bytes: Size of the whole file, None if unknown
                       * total_bytes_estimate: Guess of the eventual file size,
                         None if unavailable.
                       * elapsed: The number of seconds since download started.
                       * eta: The estimated time in seconds, None if unknown
                       * speed: The download speed in bytes/second, None if
                         unknown
                       * fragment_index: The counter of the currently
                         downloaded video fragment.
                       * fragment_count: The number of fragments (= individual
                         files that will be merged)

                       Progress hooks are guaranteed to be called at least once
                       (with status "finished") if the download is successful.
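                       A minimal sketch of such a hook (illustrative):

                           def my_hook(d):
                               if d['status'] == 'finished':
                                   print('Downloaded', d['filename'])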
    postprocessor_hooks: A list of functions that get called on postprocessing
                       progress, with a dictionary with the entries
                       * status: One of "started", "processing", or "finished".
                         Check this first and ignore unknown values.
                       * postprocessor: Name of the postprocessor
                       * info_dict: The extracted info_dict

                       Progress hooks are guaranteed to be called at least twice
                       (with status "started" and "finished") if the processing is successful.
    merge_output_format: Extension to use when merging formats.
    final_ext:         Expected final extension; used to detect when the file was
                       already downloaded and converted
    fixup:             Automatically correct known faults of the file.
                       One of:
                       - "never": do nothing
                       - "warn": only emit a warning
                       - "detect_or_warn": check whether we can do anything
                         about it, warn otherwise (default)
    source_address:    Client-side IP address to bind to.
    call_home:         Boolean, true iff we are allowed to contact the
                       yt-dlp servers for debugging. (BROKEN)
    sleep_interval_requests: Number of seconds to sleep between requests
                       during extraction
    sleep_interval:    Number of seconds to sleep before each download when
                       used alone or a lower bound of a range for randomized
                       sleep before each download (minimum possible number
                       of seconds to sleep) when used along with
                       max_sleep_interval.
    max_sleep_interval:Upper bound of a range for randomized sleep before each
                       download (maximum possible number of seconds to sleep).
                       Must only be used along with sleep_interval.
                       Actual sleep time will be a random float from range
                       [sleep_interval; max_sleep_interval].
    sleep_interval_subtitles: Number of seconds to sleep before each subtitle download
    listformats:       Print an overview of available video formats and exit.
    list_thumbnails:   Print a table of all thumbnails and exit.
    match_filter:      A function that gets called with the info_dict of
                       every video.
                       If it returns a message, the video is ignored.
                       If it returns None, the video is downloaded.
                       match_filter_func in utils.py is one example for this.
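                       A minimal sketch (illustrative; the one-hour cutoff is
                       arbitrary):

                           def my_filter(info_dict, *, incomplete=False):
                               if (info_dict.get('duration') or 0) > 3600:
                                   return 'Skipping: longer than an hour'
                               return None  # download the video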
    no_color:          Do not emit color codes in output.
    geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
                       HTTP header
    geo_bypass_country:
                       Two-letter ISO 3166-1 alpha-2 country code that will be used for
                       explicit geographic restriction bypassing via faking
                       X-Forwarded-For HTTP header
    geo_bypass_ip_block:
                       IP range in CIDR notation that will be used similarly to
                       geo_bypass_country

    The following options determine which downloader is picked:
    external_downloader: A dictionary of protocol keys and the executable of the
                       external downloader to use for it. The allowed protocols
                       are default|http|ftp|m3u8|dash|rtsp|rtmp|mms.
                       Set the value to 'native' to use the native downloader
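                       Eg (illustrative): {'m3u8': 'ffmpeg', 'default': 'aria2c'}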
    hls_prefer_native: Deprecated - Use external_downloader = {'m3u8': 'native'}
                       or {'m3u8': 'ffmpeg'} instead.
                       Use the native HLS downloader instead of ffmpeg/avconv
                       if True, otherwise use ffmpeg/avconv if False, otherwise
                       use downloader suggested by extractor if None.
    compat_opts:       Compatibility options. See "Differences in default behavior".
                       The following options do not work when used through the API:
                       filename, abort-on-error, multistreams, no-live-chat, format-sort,
                       no-clean-infojson, no-playlist-metafiles, no-keep-subs, no-attach-info-json.
                       Refer to __init__.py for their implementation
    progress_template: Dictionary of templates for progress outputs.
                       Allowed keys are 'download', 'postprocess',
                       'download-title' (console title) and 'postprocess-title'.
                       The template is mapped on a dictionary with keys 'progress' and 'info'

    The following parameters are not used by YoutubeDL itself, they are used by
    the downloader (see yt_dlp/downloader/common.py):
    nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize,
    max_filesize, test, noresizebuffer, retries, fragment_retries, continuedl,
    noprogress, xattr_set_filesize, hls_use_mpegts, http_chunk_size,
    external_downloader_args, concurrent_fragment_downloads.

    The following options are used by the post processors:
    prefer_ffmpeg:     If False, use avconv instead of ffmpeg if both are available,
                       otherwise prefer ffmpeg. (avconv support is deprecated)
    ffmpeg_location:   Location of the ffmpeg/avconv binary; either the path
                       to the binary or its containing directory.
    postprocessor_args: A dictionary of postprocessor/executable keys (in lower case)
                       and a list of additional command-line arguments for the
                       postprocessor/executable. The dict can also have "PP+EXE" keys
                       which are used when the given exe is used by the given PP.
                       Use 'default' as the name for arguments to be passed to all PP
                       For compatibility with youtube-dl, a single list of args
                       can also be used

    The following options are used by the extractors:
    extractor_retries: Number of times to retry for known errors
    dynamic_mpd:       Whether to process dynamic DASH manifests (default: True)
    hls_split_discontinuity: Split HLS playlists to different formats at
                       discontinuities such as ad breaks (default: False)
    extractor_args:    A dictionary of arguments to be passed to the extractors.
                       See "EXTRACTOR ARGUMENTS" for details.
                       Eg: {'youtube': {'skip': ['dash', 'hls']}}
    youtube_include_dash_manifest: Deprecated - Use extractor_args instead.
                       If True (default), DASH manifests and related
                       data will be downloaded and processed by extractor.
                       You can reduce network I/O by disabling it if you don't
                       care about DASH. (only for youtube)
    youtube_include_hls_manifest: Deprecated - Use extractor_args instead.
                       If True (default), HLS manifests and related
                       data will be downloaded and processed by extractor.
                       You can reduce network I/O by disabling it if you don't
                       care about HLS. (only for youtube)
    """

    _NUMERIC_FIELDS = set((
        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
        'timestamp', 'release_timestamp',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
    ))

    _format_selection_exts = {
        'audio': {'m4a', 'mp3', 'ogg', 'aac'},
        'video': {'mp4', 'flv', 'webm', '3gp'},
        'storyboards': {'mhtml'},
    }

    params = None
    _ies = {}
    _pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
    _printed_messages = set()
    _first_webpage_request = True
    _download_retcode = None
    _num_downloads = None
    _playlist_level = 0
    _playlist_urls = set()
    _screen_file = None

    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options.
        @param auto_init    Whether to load the default extractors and print header (if verbose).
                            Set to 'no_verbose_header' to not print the header
        """
        if params is None:
            params = {}
        self._ies = {}
        self._ies_instances = {}
        self._pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
        self._printed_messages = set()
        self._first_webpage_request = True
        self._post_hooks = []
        self._progress_hooks = []
        self._postprocessor_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = params
        self.cache = Cache(self)

        windows_enable_vt_mode()
        self._allow_colors = {
            'screen': not self.params.get('no_color') and supports_terminal_sequences(self._screen_file),
            'err': not self.params.get('no_color') and supports_terminal_sequences(self._err_file),
        }

        if sys.version_info < (3, 6):
            self.report_warning(
                'Python version %d.%d is not supported! Please update to Python 3.6 or above' % sys.version_info[:2])

        if self.params.get('allow_unplayable_formats'):
            self.report_warning(
                f'You have asked for {self._format_err("UNPLAYABLE", self.Styles.EMPHASIS)} formats to be listed/downloaded. '
                'This is a developer option intended for debugging. \n'
                ' If you experience any issues while using this option, '
                f'{self._format_err("DO NOT", self.Styles.ERROR)} open a bug report')

        def check_deprecated(param, option, suggestion):
            if self.params.get(param) is not None:
                self.report_warning('%s is deprecated. Use %s instead' % (option, suggestion))
                return True
            return False

        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
        check_deprecated('useid', '--id', '-o "%(id)s.%(ext)s"')

        for msg in self.params.get('_warnings', []):
            self.report_warning(msg)
        for msg in self.params.get('_deprecation_warnings', []):
            self.deprecation_warning(msg)

        if 'list-formats' in self.params.get('compat_opts', []):
            self.params['listformats_table'] = False

        if 'overwrites' not in self.params and self.params.get('nooverwrites') is not None:
            # nooverwrites was unnecessarily changed to overwrites
            # in 0c3d0f51778b153f65c21906031c2e091fcfb641
            # This ensures compatibility with both keys
            self.params['overwrites'] = not self.params['nooverwrites']
        elif self.params.get('overwrites') is None:
            self.params.pop('overwrites', None)
        else:
            self.params['nooverwrites'] = not self.params['overwrites']

        if params.get('bidi_workaround', False):
            try:
                import pty
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    self._output_process = Popen(['bidiv'] + width_args, **sp_kwargs)
                except OSError:
                    self._output_process = Popen(['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning(
                        'Could not find fribidi executable, ignoring --bidi-workaround. '
                        'Make sure that fribidi is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.platform != 'win32'
                and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
                and not params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        self.outtmpl_dict = self.parse_outtmpl()

        # Creating format selector here allows us to catch syntax errors before the extraction
        self.format_selector = (
            self.params.get('format') if self.params.get('format') in (None, '-')
            else self.params['format'] if callable(self.params['format'])
            else self.build_format_selector(self.params['format']))

        self._setup_opener()

        if auto_init:
            if auto_init != 'no_verbose_header':
                self.print_debug_header()
            self.add_default_info_extractors()

        hooks = {
            'post_hooks': self.add_post_hook,
            'progress_hooks': self.add_progress_hook,
            'postprocessor_hooks': self.add_postprocessor_hook,
        }
        for opt, fn in hooks.items():
            for ph in self.params.get(opt, []):
                fn(ph)

        for pp_def_raw in self.params.get('postprocessors', []):
            pp_def = dict(pp_def_raw)
            when = pp_def.pop('when', 'post_process')
            self.add_post_processor(
                get_postprocessor(pp_def.pop('key'))(self, **compat_kwargs(pp_def)),
                when=when)

        register_socks_protocols()

        def preload_download_archive(fn):
            """Preload the archive, if any is specified"""
            if fn is None:
                return False
            self.write_debug(f'Loading archive file {fn!r}')
            try:
                with locked_file(fn, 'r', encoding='utf-8') as archive_file:
                    for line in archive_file:
                        self.archive.add(line.strip())
            except IOError as ioe:
                if ioe.errno != errno.ENOENT:
                    raise
                return False
            return True

        self.archive = set()
        preload_download_archive(self.params.get('download_archive'))

    def warn_if_short_id(self, argv):
        # short YouTube ID starting with dash?
        idxs = [
            i for i, a in enumerate(argv)
            if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
        if idxs:
            correct_argv = (
                ['yt-dlp']
                + [a for i, a in enumerate(argv) if i not in idxs]
                + ['--'] + [argv[i] for i in idxs]
            )
            self.report_warning(
                'Long argument string detected. '
                'Use -- to separate parameters and URLs, like this:\n%s' %
                args_to_str(correct_argv))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        ie_key = ie.ie_key()
        self._ies[ie_key] = ie
        if not isinstance(ie, type):
            self._ies_instances[ie_key] = ie
            ie.set_downloader(self)

    def _get_info_extractor_class(self, ie_key):
        ie = self._ies.get(ie_key)
        if ie is None:
            ie = get_info_extractor(ie_key)
            self.add_info_extractor(ie)
        return ie

    def get_info_extractor(self, ie_key):
        """
        Get an instance of an IE with name ie_key. It will try to get one from
        the _ies list; if there is no instance, it will create a new one and add
        it to the extractor list.
        """
        ie = self._ies_instances.get(ie_key)
        if ie is None:
            ie = get_info_extractor(ie_key)()
            self.add_info_extractor(ie)
        return ie

    def add_default_info_extractors(self):
        """
        Add the InfoExtractors returned by gen_extractors to the end of the list
        """
        for ie in gen_extractor_classes():
            self.add_info_extractor(ie)

    def add_post_processor(self, pp, when='post_process'):
        """Add a PostProcessor object to the end of the chain."""
        self._pps[when].append(pp)
        pp.set_downloader(self)

    def add_post_hook(self, ph):
        """Add the post hook"""
        self._post_hooks.append(ph)

    def add_progress_hook(self, ph):
        """Add the download progress hook"""
        self._progress_hooks.append(ph)

    def add_postprocessor_hook(self, ph):
        """Add the postprocessing progress hook"""
        self._postprocessor_hooks.append(ph)
        for pps in self._pps.values():
            for pp in pps:
                pp.add_progress_hook(ph)

    def _bidi_workaround(self, message):
        if not hasattr(self, '_output_channel'):
            return message

        assert hasattr(self, '_output_process')
        assert isinstance(message, compat_str)
        line_count = message.count('\n') + 1
        self._output_process.stdin.write((message + '\n').encode('utf-8'))
        self._output_process.stdin.flush()
        res = ''.join(self._output_channel.readline().decode('utf-8')
                      for _ in range(line_count))
        return res[:-len('\n')]

    def _write_string(self, message, out=None, only_once=False):
        if only_once:
            if message in self._printed_messages:
                return
            self._printed_messages.add(message)
        write_string(message, out=out, encoding=self.params.get('encoding'))

    def to_stdout(self, message, skip_eol=False, quiet=False):
        """Print message to stdout"""
        if self.params.get('logger'):
            self.params['logger'].debug(message)
        elif not quiet or self.params.get('verbose'):
            self._write_string(
                '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')),
                self._err_file if quiet else self._screen_file)

    def to_stderr(self, message, only_once=False):
        """Print message to stderr"""
        assert isinstance(message, compat_str)
        if self.params.get('logger'):
            self.params['logger'].error(message)
        else:
            self._write_string('%s\n' % self._bidi_workaround(message), self._err_file, only_once=only_once)

    def to_console_title(self, message):
        if not self.params.get('consoletitle', False):
            return
        message = remove_terminal_sequences(message)
        if compat_os_name == 'nt':
            if ctypes.windll.kernel32.GetConsoleWindow():
                # c_wchar_p() might not be necessary if `message` is
                # already of type unicode()
                ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            self._write_string('\033]0;%s\007' % message, self._screen_file)

    def save_console_title(self):
        if not self.params.get('consoletitle', False):
            return
        if self.params.get('simulate'):
            return
        if compat_os_name != 'nt' and 'TERM' in os.environ:
            # Save the title on stack
            self._write_string('\033[22;0t', self._screen_file)

    def restore_console_title(self):
        if not self.params.get('consoletitle', False):
            return
        if self.params.get('simulate'):
            return
        if compat_os_name != 'nt' and 'TERM' in os.environ:
            # Restore the title from stack
            self._write_string('\033[23;0t', self._screen_file)

    def __enter__(self):
        self.save_console_title()
        return self

    def __exit__(self, *args):
        self.restore_console_title()

        if self.params.get('cookiefile') is not None:
            self.cookiejar.save(ignore_discard=True, ignore_expires=True)

    def trouble(self, message=None, tb=None, is_error=True):
        """Determine action to take when a download problem appears.

        Depending on whether the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        @param tb          If given, is additional traceback information
        @param is_error    Whether to raise error according to ignoreerrors
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            if tb:
                self.to_stderr(tb)
        if not is_error:
            return
        if not self.params.get('ignoreerrors'):
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        self._download_retcode = 1

    def to_screen(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode"""
        self.to_stdout(
            message, skip_eol, quiet=self.params.get('quiet', False))

    class Styles(Enum):
        HEADERS = 'yellow'
        EMPHASIS = 'light blue'
        ID = 'green'
        DELIM = 'blue'
        ERROR = 'red'
        WARNING = 'yellow'
        SUPPRESS = 'light black'

    def _format_text(self, handle, allow_colors, text, f, fallback=None, *, test_encoding=False):
        if test_encoding:
            original_text = text
            encoding = self.params.get('encoding') or getattr(handle, 'encoding', 'ascii')
            text = text.encode(encoding, 'ignore').decode(encoding)
            if fallback is not None and text != original_text:
                text = fallback
        if isinstance(f, self.Styles):
            f = f.value
        return format_text(text, f) if allow_colors else text if fallback is None else fallback

    def _format_screen(self, *args, **kwargs):
        return self._format_text(
            self._screen_file, self._allow_colors['screen'], *args, **kwargs)

    def _format_err(self, *args, **kwargs):
        return self._format_text(
            self._err_file, self._allow_colors['err'], *args, **kwargs)

    def report_warning(self, message, only_once=False):
        '''
        Print the message to stderr; it will be prefixed with 'WARNING:'.
        If stderr is a tty file, the 'WARNING:' will be colored.
        '''
        if self.params.get('logger') is not None:
            self.params['logger'].warning(message)
        else:
            if self.params.get('no_warnings'):
                return
            self.to_stderr(f'{self._format_err("WARNING:", self.Styles.WARNING)} {message}', only_once)

    def deprecation_warning(self, message):
        if self.params.get('logger') is not None:
            self.params['logger'].warning(f'DeprecationWarning: {message}')
        else:
            self.to_stderr(f'{self._format_err("DeprecationWarning:", self.Styles.ERROR)} {message}', True)

    def report_error(self, message, *args, **kwargs):
        '''
        Do the same as trouble, but prefix the message with 'ERROR:', colored
        in red if stderr is a tty file.
        '''
        self.trouble(f'{self._format_err("ERROR:", self.Styles.ERROR)} {message}', *args, **kwargs)

    def write_debug(self, message, only_once=False):
        '''Log debug message to the logger, or print it to stderr'''
        if not self.params.get('verbose', False):
            return
        message = '[debug] %s' % message
        if self.params.get('logger'):
            self.params['logger'].debug(message)
        else:
            self.to_stderr(message, only_once)

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen('[download] %s has already been downloaded' % file_name)
        except UnicodeEncodeError:
            self.to_screen('[download] The file has already been downloaded')

    def report_file_delete(self, file_name):
        """Report that existing file will be deleted."""
        try:
            self.to_screen('Deleting existing file %s' % file_name)
        except UnicodeEncodeError:
            self.to_screen('Deleting existing file')

    def raise_no_formats(self, info, forced=False):
        has_drm = info.get('__has_drm')
        msg = 'This video is DRM protected' if has_drm else 'No video formats found!'
        expected = self.params.get('ignore_no_formats_error')
        if forced or not expected:
            raise ExtractorError(msg, video_id=info['id'], ie=info['extractor'],
                                 expected=has_drm or expected)
        else:
            self.report_warning(msg)

    def parse_outtmpl(self):
        outtmpl_dict = self.params.get('outtmpl', {})
        if not isinstance(outtmpl_dict, dict):
            outtmpl_dict = {'default': outtmpl_dict}
        # Remove spaces in the default template
        if self.params.get('restrictfilenames'):
            sanitize = lambda x: x.replace(' - ', ' ').replace(' ', '-')
        else:
            sanitize = lambda x: x
        outtmpl_dict.update({
            k: sanitize(v) for k, v in DEFAULT_OUTTMPL.items()
            if outtmpl_dict.get(k) is None})
        for key, val in outtmpl_dict.items():
            if isinstance(val, bytes):
                self.report_warning(
                    'Parameter outtmpl is bytes, but should be a unicode string. '
                    'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.')
        return outtmpl_dict

    def get_output_path(self, dir_type='', filename=None):
        paths = self.params.get('paths', {})
        assert isinstance(paths, dict)
        path = os.path.join(
            expand_path(paths.get('home', '').strip()),
            expand_path(paths.get(dir_type, '').strip()) if dir_type else '',
            filename or '')

        # Temporary fix for #4787
        # 'Treat' all problem characters by passing filename through preferredencoding
        # to workaround encoding issues with subprocess on python2 @ Windows
        if sys.version_info < (3, 0) and sys.platform == 'win32':
            path = encodeFilename(path, True).decode(preferredencoding())
        return sanitize_path(path, force=self.params.get('windowsfilenames'))

    @staticmethod
    def _outtmpl_expandpath(outtmpl):
        # expand_path translates '%%' into '%' and '$$' into '$'
        # correspondingly that is not what we want since we need to keep
        # '%%' intact for template dict substitution step. Working around
        # with boundary-alike separator hack.
        sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
        outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))

        # outtmpl should be expand_path'ed before template dict substitution
        # because meta fields may contain env variables we don't want to
        # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
        # title "Hello $PATH", we don't want `$PATH` to be expanded.
        return expand_path(outtmpl).replace(sep, '')

    @staticmethod
    def escape_outtmpl(outtmpl):
        ''' Escape any remaining strings like %s, %abc% etc. '''
        return re.sub(
            STR_FORMAT_RE_TMPL.format('', '(?![%(\0])'),
            lambda mobj: ('' if mobj.group('has_key') else '%') + mobj.group(0),
            outtmpl)
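
    # Behaviour sketch (illustrative input): a lone '%' is escaped while
    # template keys are preserved:
    #   escape_outtmpl('100% of %(title)s') == '100%% of %(title)s'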

    @classmethod
    def validate_outtmpl(cls, outtmpl):
        ''' @return None or Exception object '''
        outtmpl = re.sub(
            STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBUDF]'),
            lambda mobj: f'{mobj.group(0)[:-1]}s',
            cls._outtmpl_expandpath(outtmpl))
        try:
            cls.escape_outtmpl(outtmpl) % collections.defaultdict(int)
            return None
        except ValueError as err:
            return err
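
    # Illustrative behaviour (hypothetical inputs): a well-formed template
    # yields None while a malformed one returns the error:
    #   validate_outtmpl('%(title)s.%(ext)s') is None
    #   isinstance(validate_outtmpl('%(title)'), ValueError)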

    @staticmethod
    def _copy_infodict(info_dict):
        info_dict = dict(info_dict)
        for key in ('__original_infodict', '__postprocessors'):
            info_dict.pop(key, None)
        return info_dict

    def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False):
        """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict
        @param sanitize    Whether to sanitize the output as a filename.
                           For backward compatibility, a function can also be passed
        """

        info_dict.setdefault('epoch', int(time.time()))  # keep epoch consistent once set

        info_dict = self._copy_infodict(info_dict)
        info_dict['duration_string'] = (  # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
            formatSeconds(info_dict['duration'], '-' if sanitize else ':')
            if info_dict.get('duration', None) is not None
            else None)
        info_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
        if info_dict.get('resolution') is None:
            info_dict['resolution'] = self.format_resolution(info_dict, default=None)

        # For fields playlist_index, playlist_autonumber and autonumber convert all occurrences
        # of %(field)s to %(field)0Nd for backward compatibility
        field_size_compat_map = {
            'playlist_index': number_of_digits(info_dict.get('_last_playlist_index') or 0),
            'playlist_autonumber': number_of_digits(info_dict.get('n_entries') or 0),
            'autonumber': self.params.get('autonumber_size') or 5,
        }

        TMPL_DICT = {}
        EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBUDF]'))
        MATH_FUNCTIONS = {
            '+': float.__add__,
            '-': float.__sub__,
        }
        # Field is of the form key1.key2...
        # where keys (except first) can be string, int or slice
        FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)')
        MATH_FIELD_RE = r'''(?:{field}|{num})'''.format(field=FIELD_RE, num=r'-?\d+(?:\.\d+)?')
        MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys()))
        INTERNAL_FORMAT_RE = re.compile(r'''(?x)
            (?P<negate>-)?
            (?P<fields>{field})
            (?P<maths>(?:{math_op}{math_field})*)
            (?:>(?P<strf_format>.+?))?
            (?P<alternate>(?<!\\),[^|&)]+)?
            (?:&(?P<replacement>.*?))?
            (?:\|(?P<default>.*?))?
            $'''.format(field=FIELD_RE, math_op=MATH_OPERATORS_RE, math_field=MATH_FIELD_RE))
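
        # Illustrative template fields (hypothetical values):
        # '%(playlist_index+1)03d' applies maths, '%(artist,creator|Unknown)s'
        # tries alternate fields with a default, and '%(is_live&LIVE)s'
        # substitutes a replacement when the field is set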

        def _traverse_infodict(k):
            k = k.split('.')
            if k[0] == '':
                k.pop(0)
            return traverse_obj(info_dict, k, is_user_input=True, traverse_string=True)

        def get_value(mdict):
            # Object traversal
            value = _traverse_infodict(mdict['fields'])
            # Negative
            if mdict['negate']:
                value = float_or_none(value)
                if value is not None:
                    value *= -1
            # Do maths
            offset_key = mdict['maths']
            if offset_key:
                value = float_or_none(value)
                operator = None
                while offset_key:
                    item = re.match(
                        MATH_FIELD_RE if operator else MATH_OPERATORS_RE,
                        offset_key).group(0)
                    offset_key = offset_key[len(item):]
                    if operator is None:
                        operator = MATH_FUNCTIONS[item]
                        continue
                    item, multiplier = (item[1:], -1) if item[0] == '-' else (item, 1)
                    offset = float_or_none(item)
                    if offset is None:
                        offset = float_or_none(_traverse_infodict(item))
                    try:
                        value = operator(value, multiplier * offset)
                    except (TypeError, ZeroDivisionError):
                        return None
                    operator = None
            # Datetime formatting
            if mdict['strf_format']:
                value = strftime_or_none(value, mdict['strf_format'].replace('\\,', ','))

            return value

        na = self.params.get('outtmpl_na_placeholder', 'NA')

        def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')):
            return sanitize_filename(str(value), restricted=restricted,
                                     is_id=re.search(r'(^|[_.])id(\.|$)', key))

        sanitizer = sanitize if callable(sanitize) else filename_sanitizer
        sanitize = bool(sanitize)

        def _dumpjson_default(obj):
            if isinstance(obj, (set, LazyList)):
                return list(obj)
            return repr(obj)

        def create_key(outer_mobj):
            if not outer_mobj.group('has_key'):
                return outer_mobj.group(0)
            key = outer_mobj.group('key')
            mobj = re.match(INTERNAL_FORMAT_RE, key)
            initial_field = mobj.group('fields') if mobj else ''
            value, replacement, default = None, None, na
            while mobj:
                mobj = mobj.groupdict()
                default = mobj['default'] if mobj['default'] is not None else default
                value = get_value(mobj)
                replacement = mobj['replacement']
                if value is None and mobj['alternate']:
                    mobj = re.match(INTERNAL_FORMAT_RE, mobj['alternate'][1:])
                else:
                    break

            fmt = outer_mobj.group('format')
            if fmt == 's' and value is not None and key in field_size_compat_map.keys():
                fmt = '0{:d}d'.format(field_size_compat_map[key])

            value = default if value is None else value if replacement is None else replacement

            flags = outer_mobj.group('conversion') or ''
            str_fmt = f'{fmt[:-1]}s'
            if fmt[-1] == 'l':  # list
                delim = '\n' if '#' in flags else ', '
                value, fmt = delim.join(variadic(value)), str_fmt
            elif fmt[-1] == 'j':  # json
                value, fmt = json.dumps(value, default=_dumpjson_default, indent=4 if '#' in flags else None), str_fmt
            elif fmt[-1] == 'q':  # quoted
                value = map(str, variadic(value) if '#' in flags else [value])
                value, fmt = ' '.join(map(compat_shlex_quote, value)), str_fmt
            elif fmt[-1] == 'B':  # bytes
                value = f'%{str_fmt}'.encode('utf-8') % str(value).encode('utf-8')
                value, fmt = value.decode('utf-8', 'ignore'), 's'
            elif fmt[-1] == 'U':  # unicode normalized
                value, fmt = unicodedata.normalize(
                    # "+" = compatibility equivalence, "#" = NFD
                    'NF%s%s' % ('K' if '+' in flags else '', 'D' if '#' in flags else 'C'),
                    value), str_fmt
            elif fmt[-1] == 'D':  # decimal suffix
                value, fmt = format_decimal_suffix(value, f'%{fmt[:-1]}f%s' if fmt[:-1] else '%d%s'), 's'
            elif fmt[-1] == 'F':  # filename sanitization
                value, fmt = filename_sanitizer(initial_field, value, restricted='#' in flags), str_fmt
            elif fmt[-1] == 'c':
                if value:
                    value = str(value)[0]
                else:
                    fmt = str_fmt
            elif fmt[-1] not in 'rs':  # numeric
                value = float_or_none(value)
                if value is None:
                    value, fmt = default, 's'

            if sanitize:
                if fmt[-1] == 'r':
                    # If value is an object, sanitize might convert it to a string
                    # So we convert it to repr first
                    value, fmt = repr(value), str_fmt
                if fmt[-1] in 'csr':
                    value = sanitizer(initial_field, value)

            key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format'))
            TMPL_DICT[key] = value
            return '{prefix}%({key}){fmt}'.format(key=key, fmt=fmt, prefix=outer_mobj.group('prefix'))

        return EXTERNAL_FORMAT_RE.sub(create_key, outtmpl), TMPL_DICT

    def evaluate_outtmpl(self, outtmpl, info_dict, *args, **kwargs):
        outtmpl, info_dict = self.prepare_outtmpl(outtmpl, info_dict, *args, **kwargs)
        return self.escape_outtmpl(outtmpl) % info_dict
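
    # Illustrative example (hypothetical values):
    #   ydl.evaluate_outtmpl('%(title)s-%(id)s.%(ext)s',
    #                        {'title': 'video', 'id': 'abc', 'ext': 'mp4'})
    #   == 'video-abc.mp4'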

    def _prepare_filename(self, info_dict, tmpl_type='default'):
        try:
            outtmpl = self._outtmpl_expandpath(self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default']))
            filename = self.evaluate_outtmpl(outtmpl, info_dict, True)

            force_ext = OUTTMPL_TYPES.get(tmpl_type)
            if filename and force_ext is not None:
                filename = replace_extension(filename, force_ext, info_dict.get('ext'))

            # https://github.com/blackjack4494/youtube-dlc/issues/85
            trim_file_name = self.params.get('trim_file_name', False)
            if trim_file_name:
                no_ext, *ext = filename.rsplit('.', 2)
                filename = join_nonempty(no_ext[:trim_file_name], *ext, delim='.')

            return filename
        except ValueError as err:
            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
            return None

    def prepare_filename(self, info_dict, dir_type='', warn=False):
        """Generate the output filename."""

        filename = self._prepare_filename(info_dict, dir_type or 'default')
        if not filename and dir_type not in ('', 'temp'):
            return ''

        if warn:
            if not self.params.get('paths'):
                pass
            elif filename == '-':
                self.report_warning('--paths is ignored when outputting to stdout', only_once=True)
            elif os.path.isabs(filename):
                self.report_warning('--paths is ignored since an absolute path is given in output template', only_once=True)
        if filename == '-' or not filename:
            return filename

        return self.get_output_path(dir_type, filename)

    def _match_entry(self, info_dict, incomplete=False, silent=False):
        """ Returns None if the file should be downloaded """

        video_title = info_dict.get('title', info_dict.get('id', 'video'))

        def check_filter():
            if 'title' in info_dict:
                # This can happen when we're just evaluating the playlist
                title = info_dict['title']
                matchtitle = self.params.get('matchtitle', False)
                if matchtitle:
                    if not re.search(matchtitle, title, re.IGNORECASE):
                        return '"' + title + '" title did not match pattern "' + matchtitle + '"'
                rejecttitle = self.params.get('rejecttitle', False)
                if rejecttitle:
                    if re.search(rejecttitle, title, re.IGNORECASE):
                        return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
            date = info_dict.get('upload_date')
            if date is not None:
                dateRange = self.params.get('daterange', DateRange())
                if date not in dateRange:
                    return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
            view_count = info_dict.get('view_count')
            if view_count is not None:
                min_views = self.params.get('min_views')
                if min_views is not None and view_count < min_views:
                    return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
                max_views = self.params.get('max_views')
                if max_views is not None and view_count > max_views:
                    return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
            if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
                return 'Skipping "%s" because it is age restricted' % video_title

            match_filter = self.params.get('match_filter')
            if match_filter is not None:
                try:
                    ret = match_filter(info_dict, incomplete=incomplete)
                except TypeError:
                    # For backward compatibility
                    ret = None if incomplete else match_filter(info_dict)
                if ret is not None:
                    return ret
            return None

        if self.in_download_archive(info_dict):
            reason = '%s has already been recorded in the archive' % video_title
            break_opt, break_err = 'break_on_existing', ExistingVideoReached
        else:
            reason = check_filter()
            break_opt, break_err = 'break_on_reject', RejectedVideoReached
        if reason is not None:
            if not silent:
                self.to_screen('[download] ' + reason)
            if self.params.get(break_opt, False):
                raise break_err()
        return reason

    @staticmethod
    def add_extra_info(info_dict, extra_info):
        '''Set the keys from extra_info in info dict if they are missing'''
        for key, value in extra_info.items():
            info_dict.setdefault(key, value)

    def extract_info(self, url, download=True, ie_key=None, extra_info=None,
                     process=True, force_generic_extractor=False):
        """
        Return a list with a dictionary for each video extracted.

        Arguments:
        url -- URL to extract

        Keyword arguments:
        download -- whether to download videos during extraction
        ie_key -- extractor key hint
        extra_info -- dictionary containing the extra values to add to each result
        process -- whether to resolve all unresolved references (URLs, playlist items),
                   must be True for download to work.
        force_generic_extractor -- force using the generic extractor
        """

        if extra_info is None:
            extra_info = {}

        if not ie_key and force_generic_extractor:
            ie_key = 'Generic'

        if ie_key:
            ies = {ie_key: self._get_info_extractor_class(ie_key)}
        else:
            ies = self._ies

        for ie_key, ie in ies.items():
            if not ie.suitable(url):
                continue

            if not ie.working():
                self.report_warning('The program functionality for this site has been marked as broken, '
                                    'and will probably not work.')

            temp_id = ie.get_temp_id(url)
            if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
                self.to_screen(f'[{ie_key}] {temp_id}: has already been recorded in the archive')
                if self.params.get('break_on_existing', False):
                    raise ExistingVideoReached()
                break
            return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process)
        else:
            self.report_error('no suitable InfoExtractor for URL %s' % url)

    def __handle_extraction_exceptions(func):
        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            try:
                return func(self, *args, **kwargs)
            except GeoRestrictedError as e:
                msg = e.msg
                if e.countries:
                    msg += '\nThis video is available in %s.' % ', '.join(
                        map(ISO3166Utils.short2full, e.countries))
                msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
                self.report_error(msg)
            except ExtractorError as e:  # An error we somewhat expected
                self.report_error(compat_str(e), e.format_traceback())
            except ReExtractInfo as e:
                if e.expected:
                    self.to_screen(f'{e}; Re-extracting data')
                else:
                    self.to_stderr('\r')
                    self.report_warning(f'{e}; Re-extracting data')
                return wrapper(self, *args, **kwargs)
            except (DownloadCancelled, LazyList.IndexError, PagedList.IndexError):
                raise
            except Exception as e:
                if self.params.get('ignoreerrors'):
                    self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
                else:
                    raise
        return wrapper

    def _wait_for_video(self, ie_result):
        if (not self.params.get('wait_for_video')
                or ie_result.get('_type', 'video') != 'video'
                or ie_result.get('formats') or ie_result.get('url')):
            return

        format_dur = lambda dur: '%02d:%02d:%02d' % timetuple_from_msec(dur * 1000)[:-1]
        last_msg = ''

        def progress(msg):
            nonlocal last_msg
            self.to_screen(msg + ' ' * (len(last_msg) - len(msg)) + '\r', skip_eol=True)
            last_msg = msg

        min_wait, max_wait = self.params.get('wait_for_video')
        diff = try_get(ie_result, lambda x: x['release_timestamp'] - time.time())
        if diff is None and ie_result.get('live_status') == 'is_upcoming':
            diff = random.randrange(min_wait, max_wait) if (max_wait and min_wait) else (max_wait or min_wait)
            self.report_warning('Release time of video is not known')
        elif (diff or 0) <= 0:
            self.report_warning('Video should already be available according to extracted info')
        diff = min(max(diff or 0, min_wait or 0), max_wait or float('inf'))
        self.to_screen(f'[wait] Waiting for {format_dur(diff)} - Press Ctrl+C to try now')

        wait_till = time.time() + diff
        try:
            while True:
                diff = wait_till - time.time()
                if diff <= 0:
                    progress('')
                    raise ReExtractInfo('[wait] Wait period ended', expected=True)
                progress(f'[wait] Remaining time until next attempt: {self._format_screen(format_dur(diff), self.Styles.EMPHASIS)}')
                time.sleep(1)
        except KeyboardInterrupt:
            progress('')
            raise ReExtractInfo('[wait] Interrupted by user', expected=True)
        except BaseException as e:
            if not isinstance(e, ReExtractInfo):
                self.to_screen('')
            raise
1419 @__handle_extraction_exceptions
1420 def __extract_info(self, url, ie, download, extra_info, process):
1421 ie_result = ie.extract(url)
1422 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
1423 return
1424 if isinstance(ie_result, list):
1425 # Backwards compatibility: old IE result format
1426 ie_result = {
1427 '_type': 'compat_list',
1428 'entries': ie_result,
1429 }
1430 if extra_info.get('original_url'):
1431 ie_result.setdefault('original_url', extra_info['original_url'])
1432 self.add_default_extra_info(ie_result, ie, url)
1433 if process:
1434 self._wait_for_video(ie_result)
1435 return self.process_ie_result(ie_result, download, extra_info)
1436 else:
1437 return ie_result
1438
1439 def add_default_extra_info(self, ie_result, ie, url):
1440 if url is not None:
1441 self.add_extra_info(ie_result, {
1442 'webpage_url': url,
1443 'original_url': url,
1444 'webpage_url_basename': url_basename(url),
1445 'webpage_url_domain': get_domain(url),
1446 })
1447 if ie is not None:
1448 self.add_extra_info(ie_result, {
1449 'extractor': ie.IE_NAME,
1450 'extractor_key': ie.ie_key(),
1451 })
1452
1453 def process_ie_result(self, ie_result, download=True, extra_info=None):
1454 """
1455 Take the result of the ie (may be modified) and resolve all unresolved
1456 references (URLs, playlist items).
1457
1458 It will also download the videos if 'download' is True.
1459 Returns the resolved ie_result.
1460 """
1461 if extra_info is None:
1462 extra_info = {}
1463 result_type = ie_result.get('_type', 'video')
1464
1465 if result_type in ('url', 'url_transparent'):
1466 ie_result['url'] = sanitize_url(ie_result['url'])
1467 if ie_result.get('original_url'):
1468 extra_info.setdefault('original_url', ie_result['original_url'])
1469
1470 extract_flat = self.params.get('extract_flat', False)
1471 if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
1472 or extract_flat is True):
1473 info_copy = ie_result.copy()
1474 ie = try_get(ie_result.get('ie_key'), self.get_info_extractor)
1475 if ie and not ie_result.get('id'):
1476 info_copy['id'] = ie.get_temp_id(ie_result['url'])
1477 self.add_default_extra_info(info_copy, ie, ie_result['url'])
1478 self.add_extra_info(info_copy, extra_info)
1479 info_copy, _ = self.pre_process(info_copy)
1480 self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True)
1481 if self.params.get('force_write_download_archive', False):
1482 self.record_download_archive(info_copy)
1483 return ie_result
1484
1485 if result_type == 'video':
1486 self.add_extra_info(ie_result, extra_info)
1487 ie_result = self.process_video_result(ie_result, download=download)
1488 additional_urls = (ie_result or {}).get('additional_urls')
1489 if additional_urls:
1490 # TODO: Improve MetadataParserPP to allow setting a list
1491 if isinstance(additional_urls, compat_str):
1492 additional_urls = [additional_urls]
1493 self.to_screen(
1494 '[info] %s: %d additional URL(s) requested' % (ie_result['id'], len(additional_urls)))
1495 self.write_debug('Additional URLs: "%s"' % '", "'.join(additional_urls))
1496 ie_result['additional_entries'] = [
1497 self.extract_info(
1498 url, download, extra_info=extra_info,
1499 force_generic_extractor=self.params.get('force_generic_extractor'))
1500 for url in additional_urls
1501 ]
1502 return ie_result
1503 elif result_type == 'url':
1504 # We have to add extra_info to the results because it may be
1505 # contained in a playlist
1506 return self.extract_info(
1507 ie_result['url'], download,
1508 ie_key=ie_result.get('ie_key'),
1509 extra_info=extra_info)
1510 elif result_type == 'url_transparent':
1511 # Use the information from the embedding page
1512 info = self.extract_info(
1513 ie_result['url'], ie_key=ie_result.get('ie_key'),
1514 extra_info=extra_info, download=False, process=False)
1515
1516 # extract_info may return None when ignoreerrors is enabled and
1517 # extraction failed with an error; don't crash and return early
1518 # in this case
1519 if not info:
1520 return info
1521
1522 force_properties = dict(
1523 (k, v) for k, v in ie_result.items() if v is not None)
1524 for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
1525 if f in force_properties:
1526 del force_properties[f]
1527 new_result = info.copy()
1528 new_result.update(force_properties)
1529
1530 # Extracted info may not be a video result (i.e.
1531 # info.get('_type', 'video') != 'video') but rather a url or
1532 # url_transparent. In such cases outer metadata (from ie_result)
1533 # should be propagated to the inner one (info). For this to happen,
1534 # the _type of info should be overridden with url_transparent. This
1535 # fixes the issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
1536 if new_result.get('_type') == 'url':
1537 new_result['_type'] = 'url_transparent'
1538
1539 return self.process_ie_result(
1540 new_result, download=download, extra_info=extra_info)
1541 elif result_type in ('playlist', 'multi_video'):
1542 # Protect from infinite recursion due to recursively nested playlists
1543 # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
1544 webpage_url = ie_result['webpage_url']
1545 if webpage_url in self._playlist_urls:
1546 self.to_screen(
1547 '[download] Skipping already downloaded playlist: %s'
1548 % (ie_result.get('title') or ie_result.get('id')))
1549 return
1550
1551 self._playlist_level += 1
1552 self._playlist_urls.add(webpage_url)
1553 self._sanitize_thumbnails(ie_result)
1554 try:
1555 return self.__process_playlist(ie_result, download)
1556 finally:
1557 self._playlist_level -= 1
1558 if not self._playlist_level:
1559 self._playlist_urls.clear()
1560 elif result_type == 'compat_list':
1561 self.report_warning(
1562 'Extractor %s returned a compat_list result. '
1563 'It needs to be updated.' % ie_result.get('extractor'))
1564
1565 def _fixup(r):
1566 self.add_extra_info(r, {
1567 'extractor': ie_result['extractor'],
1568 'webpage_url': ie_result['webpage_url'],
1569 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1570 'webpage_url_domain': get_domain(ie_result['webpage_url']),
1571 'extractor_key': ie_result['extractor_key'],
1572 })
1573 return r
1574 ie_result['entries'] = [
1575 self.process_ie_result(_fixup(r), download, extra_info)
1576 for r in ie_result['entries']
1577 ]
1578 return ie_result
1579 else:
1580 raise Exception('Invalid result type: %s' % result_type)
1581
1582 def _ensure_dir_exists(self, path):
1583 return make_dir(path, self.report_error)
1584
1585 def __process_playlist(self, ie_result, download):
1586 # We process each entry in the playlist
1587 playlist = ie_result.get('title') or ie_result.get('id')
1588 self.to_screen('[download] Downloading playlist: %s' % playlist)
1589
1590 if 'entries' not in ie_result:
1591 raise EntryNotInPlaylist('There are no entries')
1592
1593 MissingEntry = object()
1594 incomplete_entries = bool(ie_result.get('requested_entries'))
1595 if incomplete_entries:
1596 def fill_missing_entries(entries, indices):
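# e.g. entries=[e1, e3], indices=[1, 3] -> [e1, MissingEntry, e3]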
1597 ret = [MissingEntry] * max(indices)
1598 for i, entry in zip(indices, entries):
1599 ret[i - 1] = entry
1600 return ret
1601 ie_result['entries'] = fill_missing_entries(ie_result['entries'], ie_result['requested_entries'])
1602
1603 playlist_results = []
1604
1605 playliststart = self.params.get('playliststart', 1)
1606 playlistend = self.params.get('playlistend')
1607 # For backwards compatibility, interpret -1 as whole list
1608 if playlistend == -1:
1609 playlistend = None
1610
1611 playlistitems_str = self.params.get('playlist_items')
1612 playlistitems = None
1613 if playlistitems_str is not None:
1614 def iter_playlistitems(spec):
1615 for string_segment in spec.split(','):
1616 if '-' in string_segment:
1617 start, end = string_segment.split('-')
1618 for item in range(int(start), int(end) + 1):
1619 yield int(item)
1620 else:
1621 yield int(string_segment)
1622 playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
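# e.g. --playlist-items '1-3,7' -> [1, 2, 3, 7]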
1623
1624 ie_entries = ie_result['entries']
1625 msg = (
1626 'Downloading %d videos' if not isinstance(ie_entries, list)
1627 else 'Collected %d videos; downloading %%d of them' % len(ie_entries))
1628
1629 if isinstance(ie_entries, list):
1630 def get_entry(i):
1631 return ie_entries[i - 1]
1632 else:
1633 if not isinstance(ie_entries, (PagedList, LazyList)):
1634 ie_entries = LazyList(ie_entries)
1635
1636 def get_entry(i):
1637 return YoutubeDL.__handle_extraction_exceptions(
1638 lambda self, i: ie_entries[i - 1]
1639 )(self, i)
1640
1641 entries = []
1642 items = playlistitems if playlistitems is not None else itertools.count(playliststart)
1643 for i in items:
1644 if i == 0:
1645 continue
1646 if playlistitems is None and playlistend is not None and playlistend < i:
1647 break
1648 entry = None
1649 try:
1650 entry = get_entry(i)
1651 if entry is MissingEntry:
1652 raise EntryNotInPlaylist()
1653 except (IndexError, EntryNotInPlaylist):
1654 if incomplete_entries:
1655 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
1656 elif not playlistitems:
1657 break
1658 entries.append(entry)
1659 try:
1660 if entry is not None:
1661 self._match_entry(entry, incomplete=True, silent=True)
1662 except (ExistingVideoReached, RejectedVideoReached):
1663 break
1664 ie_result['entries'] = entries
1665
1666 # Save playlist_index before re-ordering
1667 entries = [
1668 ((playlistitems[i - 1] if playlistitems else i + playliststart - 1), entry)
1669 for i, entry in enumerate(entries, 1)
1670 if entry is not None]
1671 n_entries = len(entries)
1672
1673 if not playlistitems and (playliststart != 1 or playlistend):
1674 playlistitems = list(range(playliststart, playliststart + n_entries))
1675 ie_result['requested_entries'] = playlistitems
1676
1677 _infojson_written = False
1678 if not self.params.get('simulate') and self.params.get('allow_playlist_files', True):
1679 ie_copy = {
1680 'playlist': playlist,
1681 'playlist_id': ie_result.get('id'),
1682 'playlist_title': ie_result.get('title'),
1683 'playlist_uploader': ie_result.get('uploader'),
1684 'playlist_uploader_id': ie_result.get('uploader_id'),
1685 'playlist_index': 0,
1686 'n_entries': n_entries,
1687 }
1688 ie_copy.update(dict(ie_result))
1689
1690 _infojson_written = self._write_info_json(
1691 'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson'))
1692 if _infojson_written is None:
1693 return
1694 if self._write_description('playlist', ie_result,
1695 self.prepare_filename(ie_copy, 'pl_description')) is None:
1696 return
1697 # TODO: This should be passed to ThumbnailsConvertor if necessary
1698 self._write_thumbnails('playlist', ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail'))
1699
1700 if self.params.get('playlistreverse', False):
1701 entries = entries[::-1]
1702 if self.params.get('playlistrandom', False):
1703 random.shuffle(entries)
1704
1705 x_forwarded_for = ie_result.get('__x_forwarded_for_ip')
1706
1707 self.to_screen('[%s] playlist %s: %s' % (ie_result['extractor'], playlist, msg % n_entries))
1708 failures = 0
1709 max_failures = self.params.get('skip_playlist_after_errors') or float('inf')
1710 for i, entry_tuple in enumerate(entries, 1):
1711 playlist_index, entry = entry_tuple
1712 if 'playlist-index' in self.params.get('compat_opts', []):
1713 playlist_index = playlistitems[i - 1] if playlistitems else i + playliststart - 1
1714 self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
1715 # This __x_forwarded_for_ip thing is a bit ugly but requires
1716 # minimal changes
1717 if x_forwarded_for:
1718 entry['__x_forwarded_for_ip'] = x_forwarded_for
1719 extra = {
1720 'n_entries': n_entries,
1721 '_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries),
1722 'playlist_index': playlist_index,
1723 'playlist_autonumber': i,
1724 'playlist': playlist,
1725 'playlist_id': ie_result.get('id'),
1726 'playlist_title': ie_result.get('title'),
1727 'playlist_uploader': ie_result.get('uploader'),
1728 'playlist_uploader_id': ie_result.get('uploader_id'),
1729 'extractor': ie_result['extractor'],
1730 'webpage_url': ie_result['webpage_url'],
1731 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1732 'webpage_url_domain': get_domain(ie_result['webpage_url']),
1733 'extractor_key': ie_result['extractor_key'],
1734 }
1735
1736 if self._match_entry(entry, incomplete=True) is not None:
1737 continue
1738
1739 entry_result = self.__process_iterable_entry(entry, download, extra)
1740 if not entry_result:
1741 failures += 1
1742 if failures >= max_failures:
1743 self.report_error(
1744 'Skipping the remaining entries in playlist "%s" since %d items failed extraction' % (playlist, failures))
1745 break
1746 playlist_results.append(entry_result)
1747 ie_result['entries'] = playlist_results
1748
1749 # Write the updated info to json
1750 if _infojson_written and self._write_info_json(
1751 'updated playlist', ie_result,
1752 self.prepare_filename(ie_copy, 'pl_infojson'), overwrite=True) is None:
1753 return
1754 self.to_screen('[download] Finished downloading playlist: %s' % playlist)
1755 return ie_result
1756
1757 @__handle_extraction_exceptions
1758 def __process_iterable_entry(self, entry, download, extra_info):
1759 return self.process_ie_result(
1760 entry, download=download, extra_info=extra_info)
1761
1762 def _build_format_filter(self, filter_spec):
1763 " Returns a function to filter the formats according to the filter_spec "
1764
1765 OPERATORS = {
1766 '<': operator.lt,
1767 '<=': operator.le,
1768 '>': operator.gt,
1769 '>=': operator.ge,
1770 '=': operator.eq,
1771 '!=': operator.ne,
1772 }
1773 operator_rex = re.compile(r'''(?x)\s*
1774 (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)\s*
1775 (?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1776 (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s*
1777 ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1778 m = operator_rex.fullmatch(filter_spec)
1779 if m:
1780 try:
1781 comparison_value = int(m.group('value'))
1782 except ValueError:
1783 comparison_value = parse_filesize(m.group('value'))
1784 if comparison_value is None:
1785 comparison_value = parse_filesize(m.group('value') + 'B')
1786 if comparison_value is None:
1787 raise ValueError(
1788 'Invalid value %r in format specification %r' % (
1789 m.group('value'), filter_spec))
1790 op = OPERATORS[m.group('op')]
1791
1792 if not m:
1793 STR_OPERATORS = {
1794 '=': operator.eq,
1795 '^=': lambda attr, value: attr.startswith(value),
1796 '$=': lambda attr, value: attr.endswith(value),
1797 '*=': lambda attr, value: value in attr,
1798 }
1799 str_operator_rex = re.compile(r'''(?x)\s*
1800 (?P<key>[a-zA-Z0-9._-]+)\s*
1801 (?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1802 (?P<value>[a-zA-Z0-9._-]+)\s*
1803 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1804 m = str_operator_rex.fullmatch(filter_spec)
1805 if m:
1806 comparison_value = m.group('value')
1807 str_op = STR_OPERATORS[m.group('op')]
1808 if m.group('negation'):
1809 op = lambda attr, value: not str_op(attr, value)
1810 else:
1811 op = str_op
1812
1813 if not m:
1814 raise SyntaxError('Invalid filter specification %r' % filter_spec)
1815
1816 def _filter(f):
1817 actual_value = f.get(m.group('key'))
1818 if actual_value is None:
1819 return m.group('none_inclusive')
1820 return op(actual_value, comparison_value)
1821 return _filter
1822
1823 def _check_formats(self, formats):
1824 for f in formats:
1825 self.to_screen('[info] Testing format %s' % f['format_id'])
1826 path = self.get_output_path('temp')
1827 if not self._ensure_dir_exists(f'{path}/'):
1828 continue
1829 temp_file = tempfile.NamedTemporaryFile(suffix='.tmp', delete=False, dir=path or None)
1830 temp_file.close()
1831 try:
1832 success, _ = self.dl(temp_file.name, f, test=True)
1833 except (DownloadError, IOError, OSError, ValueError) + network_exceptions:
1834 success = False
1835 finally:
1836 if os.path.exists(temp_file.name):
1837 try:
1838 os.remove(temp_file.name)
1839 except OSError:
1840 self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
1841 if success:
1842 yield f
1843 else:
1844 self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])
1845
1846 def _default_format_spec(self, info_dict, download=True):
1847
1848 def can_merge():
1849 merger = FFmpegMergerPP(self)
1850 return merger.available and merger.can_merge()
1851
1852 prefer_best = (
1853 not self.params.get('simulate')
1854 and download
1855 and (
1856 not can_merge()
1857 or info_dict.get('is_live', False)
1858 or self.outtmpl_dict['default'] == '-'))
1859 compat = (
1860 prefer_best
1861 or self.params.get('allow_multiple_audio_streams', False)
1862 or 'format-spec' in self.params.get('compat_opts', []))
1863
1864 return (
1865 'best/bestvideo+bestaudio' if prefer_best
1866 else 'bestvideo*+bestaudio/best' if not compat
1867 else 'bestvideo+bestaudio/best')
1868
1869 def build_format_selector(self, format_spec):
1870 def syntax_error(note, start):
1871 message = (
1872 'Invalid format specification: '
1873 '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
1874 return SyntaxError(message)
1875
1876 PICKFIRST = 'PICKFIRST'
1877 MERGE = 'MERGE'
1878 SINGLE = 'SINGLE'
1879 GROUP = 'GROUP'
1880 FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
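# Selector grammar, e.g. 'bestvideo[height<=720]+bestaudio/best':
# ',' requests multiple selectors, '/' picks the first that yields formats,
# '+' merges video+audio, '(...)' groups and '[...]' attaches filters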
1881
1882 allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False),
1883 'video': self.params.get('allow_multiple_video_streams', False)}
1884
1885 check_formats = self.params.get('check_formats') == 'selected'
1886
1887 def _parse_filter(tokens):
1888 filter_parts = []
1889 for type, string, start, _, _ in tokens:
1890 if type == tokenize.OP and string == ']':
1891 return ''.join(filter_parts)
1892 else:
1893 filter_parts.append(string)
1894
1895 def _remove_unused_ops(tokens):
1896 # Remove operators that we don't use and join them with the surrounding strings
1897 # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1898 ALLOWED_OPS = ('/', '+', ',', '(', ')')
1899 last_string, last_start, last_end, last_line = None, None, None, None
1900 for type, string, start, end, line in tokens:
1901 if type == tokenize.OP and string == '[':
1902 if last_string:
1903 yield tokenize.NAME, last_string, last_start, last_end, last_line
1904 last_string = None
1905 yield type, string, start, end, line
1906 # everything inside brackets will be handled by _parse_filter
1907 for type, string, start, end, line in tokens:
1908 yield type, string, start, end, line
1909 if type == tokenize.OP and string == ']':
1910 break
1911 elif type == tokenize.OP and string in ALLOWED_OPS:
1912 if last_string:
1913 yield tokenize.NAME, last_string, last_start, last_end, last_line
1914 last_string = None
1915 yield type, string, start, end, line
1916 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1917 if not last_string:
1918 last_string = string
1919 last_start = start
1920 last_end = end
1921 else:
1922 last_string += string
1923 if last_string:
1924 yield tokenize.NAME, last_string, last_start, last_end, last_line
1925
1926 def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1927 selectors = []
1928 current_selector = None
1929 for type, string, start, _, _ in tokens:
1930 # ENCODING is only defined in Python 3.x
1931 if type == getattr(tokenize, 'ENCODING', None):
1932 continue
1933 elif type in [tokenize.NAME, tokenize.NUMBER]:
1934 current_selector = FormatSelector(SINGLE, string, [])
1935 elif type == tokenize.OP:
1936 if string == ')':
1937 if not inside_group:
1938 # ')' will be handled by the parentheses group
1939 tokens.restore_last_token()
1940 break
1941 elif inside_merge and string in ['/', ',']:
1942 tokens.restore_last_token()
1943 break
1944 elif inside_choice and string == ',':
1945 tokens.restore_last_token()
1946 break
1947 elif string == ',':
1948 if not current_selector:
1949 raise syntax_error('"," must follow a format selector', start)
1950 selectors.append(current_selector)
1951 current_selector = None
1952 elif string == '/':
1953 if not current_selector:
1954 raise syntax_error('"/" must follow a format selector', start)
1955 first_choice = current_selector
1956 second_choice = _parse_format_selection(tokens, inside_choice=True)
1957 current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1958 elif string == '[':
1959 if not current_selector:
1960 current_selector = FormatSelector(SINGLE, 'best', [])
1961 format_filter = _parse_filter(tokens)
1962 current_selector.filters.append(format_filter)
1963 elif string == '(':
1964 if current_selector:
1965 raise syntax_error('Unexpected "("', start)
1966 group = _parse_format_selection(tokens, inside_group=True)
1967 current_selector = FormatSelector(GROUP, group, [])
1968 elif string == '+':
1969 if not current_selector:
1970 raise syntax_error('Unexpected "+"', start)
1971 selector_1 = current_selector
1972 selector_2 = _parse_format_selection(tokens, inside_merge=True)
1973 if not selector_2:
1974 raise syntax_error('Expected a selector', start)
1975 current_selector = FormatSelector(MERGE, (selector_1, selector_2), [])
1976 else:
1977 raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
1978 elif type == tokenize.ENDMARKER:
1979 break
1980 if current_selector:
1981 selectors.append(current_selector)
1982 return selectors
1983
1984 def _merge(formats_pair):
1985 format_1, format_2 = formats_pair
1986
1987 formats_info = []
1988 formats_info.extend(format_1.get('requested_formats', (format_1,)))
1989 formats_info.extend(format_2.get('requested_formats', (format_2,)))
1990
1991 if not allow_multiple_streams['video'] or not allow_multiple_streams['audio']:
1992 get_no_more = {'video': False, 'audio': False}
1993 for (i, fmt_info) in enumerate(formats_info):
1994 if fmt_info.get('acodec') == fmt_info.get('vcodec') == 'none':
1995 formats_info.pop(i)
1996 continue
1997 for aud_vid in ['audio', 'video']:
1998 if not allow_multiple_streams[aud_vid] and fmt_info.get(aud_vid[0] + 'codec') != 'none':
1999 if get_no_more[aud_vid]:
2000 formats_info.pop(i)
2001 break
2002 get_no_more[aud_vid] = True
2003
2004 if len(formats_info) == 1:
2005 return formats_info[0]
2006
2007 video_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('vcodec') != 'none']
2008 audio_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('acodec') != 'none']
2009
2010 the_only_video = video_fmts[0] if len(video_fmts) == 1 else None
2011 the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None
2012
2013 output_ext = self.params.get('merge_output_format')
2014 if not output_ext:
2015 if the_only_video:
2016 output_ext = the_only_video['ext']
2017 elif the_only_audio and not video_fmts:
2018 output_ext = the_only_audio['ext']
2019 else:
2020 output_ext = 'mkv'
2021
2022 filtered = lambda *keys: filter(None, (traverse_obj(fmt, *keys) for fmt in formats_info))
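# filtered(*keys): for each merged format, the first of the given fields
# that traverse_obj resolves, with empty results dropped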
2023
2024 new_dict = {
2025 'requested_formats': formats_info,
2026 'format': '+'.join(filtered('format')),
2027 'format_id': '+'.join(filtered('format_id')),
2028 'ext': output_ext,
2029 'protocol': '+'.join(map(determine_protocol, formats_info)),
2030 'language': '+'.join(orderedSet(filtered('language'))) or None,
2031 'format_note': '+'.join(orderedSet(filtered('format_note'))) or None,
2032 'filesize_approx': sum(filtered('filesize', 'filesize_approx')) or None,
2033 'tbr': sum(filtered('tbr', 'vbr', 'abr')),
2034 }
2035
2036 if the_only_video:
2037 new_dict.update({
2038 'width': the_only_video.get('width'),
2039 'height': the_only_video.get('height'),
2040 'resolution': the_only_video.get('resolution') or self.format_resolution(the_only_video),
2041 'fps': the_only_video.get('fps'),
2042 'dynamic_range': the_only_video.get('dynamic_range'),
2043 'vcodec': the_only_video.get('vcodec'),
2044 'vbr': the_only_video.get('vbr'),
2045 'stretched_ratio': the_only_video.get('stretched_ratio'),
2046 })
2047
2048 if the_only_audio:
2049 new_dict.update({
2050 'acodec': the_only_audio.get('acodec'),
2051 'abr': the_only_audio.get('abr'),
2052 'asr': the_only_audio.get('asr'),
2053 })
2054
2055 return new_dict
2056
2057 def _check_formats(formats):
2058 if not check_formats:
2059 yield from formats
2060 return
2061 yield from self._check_formats(formats)
2062
2063 def _build_selector_function(selector):
2064 if isinstance(selector, list): # ,
2065 fs = [_build_selector_function(s) for s in selector]
2066
2067 def selector_function(ctx):
2068 for f in fs:
2069 yield from f(ctx)
2070 return selector_function
2071
2072 elif selector.type == GROUP: # ()
2073 selector_function = _build_selector_function(selector.selector)
2074
2075 elif selector.type == PICKFIRST: # /
2076 fs = [_build_selector_function(s) for s in selector.selector]
2077
2078 def selector_function(ctx):
2079 for f in fs:
2080 picked_formats = list(f(ctx))
2081 if picked_formats:
2082 return picked_formats
2083 return []
2084
2085 elif selector.type == MERGE: # +
2086 selector_1, selector_2 = map(_build_selector_function, selector.selector)
2087
2088 def selector_function(ctx):
2089 for pair in itertools.product(selector_1(ctx), selector_2(ctx)):
2090 yield _merge(pair)
2091
2092 elif selector.type == SINGLE: # atom
2093 format_spec = selector.selector or 'best'
2094
2095 # TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector
2096 if format_spec == 'all':
2097 def selector_function(ctx):
2098 yield from _check_formats(ctx['formats'][::-1])
2099 elif format_spec == 'mergeall':
2100 def selector_function(ctx):
2101 formats = list(_check_formats(ctx['formats']))
2102 if not formats:
2103 return
2104 merged_format = formats[-1]
2105 for f in formats[-2::-1]:
2106 merged_format = _merge((merged_format, f))
2107 yield merged_format
2108
2109 else:
2110 format_fallback, format_reverse, format_idx = False, True, 1
2111 mobj = re.match(
2112 r'(?P<bw>best|worst|b|w)(?P<type>video|audio|v|a)?(?P<mod>\*)?(?:\.(?P<n>[1-9]\d*))?$',
2113 format_spec)
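# e.g. 'best', 'wv' (worst video-only), 'bv*' (best with a video stream),
# 'ba.2' (2nd best audio-only)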
2114 if mobj is not None:
2115 format_idx = int_or_none(mobj.group('n'), default=1)
2116 format_reverse = mobj.group('bw')[0] == 'b'
2117 format_type = (mobj.group('type') or [None])[0]
2118 not_format_type = {'v': 'a', 'a': 'v'}.get(format_type)
2119 format_modified = mobj.group('mod') is not None
2120
2121 format_fallback = not format_type and not format_modified # for b, w
2122 _filter_f = (
2123 (lambda f: f.get('%scodec' % format_type) != 'none')
2124 if format_type and format_modified # bv*, ba*, wv*, wa*
2125 else (lambda f: f.get('%scodec' % not_format_type) == 'none')
2126 if format_type # bv, ba, wv, wa
2127 else (lambda f: f.get('vcodec') != 'none' and f.get('acodec') != 'none')
2128 if not format_modified # b, w
2129 else lambda f: True) # b*, w*
2130 filter_f = lambda f: _filter_f(f) and (
2131 f.get('vcodec') != 'none' or f.get('acodec') != 'none')
2132 else:
2133 if format_spec in self._format_selection_exts['audio']:
2134 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none'
2135 elif format_spec in self._format_selection_exts['video']:
2136 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none' and f.get('vcodec') != 'none'
2137 elif format_spec in self._format_selection_exts['storyboards']:
2138 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') == 'none' and f.get('vcodec') == 'none'
2139 else:
2140 filter_f = lambda f: f.get('format_id') == format_spec # id
2141
2142 def selector_function(ctx):
2143 formats = list(ctx['formats'])
2144 matches = list(filter(filter_f, formats)) if filter_f is not None else formats
2145 if format_fallback and ctx['incomplete_formats'] and not matches:
2146 # for extractors with incomplete formats (audio only (soundcloud)
2147 # or video only (imgur)) best/worst will fall back to
2148 # best/worst {video,audio}-only format
2149 matches = formats
2150 matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1]))
2151 try:
2152 yield matches[format_idx - 1]
2153 except IndexError:
2154 return
2155
2156 filters = [self._build_format_filter(f) for f in selector.filters]
2157
2158 def final_selector(ctx):
2159 ctx_copy = dict(ctx)
2160 for _filter in filters:
2161 ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
2162 return selector_function(ctx_copy)
2163 return final_selector
2164
2165 stream = io.BytesIO(format_spec.encode('utf-8'))
2166 try:
2167 tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
2168 except tokenize.TokenError:
2169 raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
2170
2171 class TokenIterator(object):
2172 def __init__(self, tokens):
2173 self.tokens = tokens
2174 self.counter = 0
2175
2176 def __iter__(self):
2177 return self
2178
2179 def __next__(self):
2180 if self.counter >= len(self.tokens):
2181 raise StopIteration()
2182 value = self.tokens[self.counter]
2183 self.counter += 1
2184 return value
2185
2186 next = __next__
2187
2188 def restore_last_token(self):
2189 self.counter -= 1
2190
2191 parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
2192 return _build_selector_function(parsed_selector)
2193
2194 def _calc_headers(self, info_dict):
2195 res = std_headers.copy()
2196
2197 add_headers = info_dict.get('http_headers')
2198 if add_headers:
2199 res.update(add_headers)
2200
2201 cookies = self._calc_cookies(info_dict)
2202 if cookies:
2203 res['Cookie'] = cookies
2204
2205 if 'X-Forwarded-For' not in res:
2206 x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
2207 if x_forwarded_for_ip:
2208 res['X-Forwarded-For'] = x_forwarded_for_ip
2209
2210 return res
2211
2212 def _calc_cookies(self, info_dict):
2213 pr = sanitized_Request(info_dict['url'])
2214 self.cookiejar.add_cookie_header(pr)
2215 return pr.get_header('Cookie')
2216
2217 def _sort_thumbnails(self, thumbnails):
2218 thumbnails.sort(key=lambda t: (
2219 t.get('preference') if t.get('preference') is not None else -1,
2220 t.get('width') if t.get('width') is not None else -1,
2221 t.get('height') if t.get('height') is not None else -1,
2222 t.get('id') if t.get('id') is not None else '',
2223 t.get('url')))
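# ascending sort: the last thumbnail in the list is the preferred one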
2224
2225 def _sanitize_thumbnails(self, info_dict):
2226 thumbnails = info_dict.get('thumbnails')
2227 if thumbnails is None:
2228 thumbnail = info_dict.get('thumbnail')
2229 if thumbnail:
2230 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
2231 if not thumbnails:
2232 return
2233
2234 def check_thumbnails(thumbnails):
2235 for t in thumbnails:
2236 self.to_screen(f'[info] Testing thumbnail {t["id"]}')
2237 try:
2238 self.urlopen(HEADRequest(t['url']))
2239 except network_exceptions as err:
2240 self.to_screen(f'[info] Unable to connect to thumbnail {t["id"]} URL {t["url"]!r} - {err}. Skipping...')
2241 continue
2242 yield t
2243
2244 self._sort_thumbnails(thumbnails)
2245 for i, t in enumerate(thumbnails):
2246 if t.get('id') is None:
2247 t['id'] = '%d' % i
2248 if t.get('width') and t.get('height'):
2249 t['resolution'] = '%dx%d' % (t['width'], t['height'])
2250 t['url'] = sanitize_url(t['url'])
2251
2252 if self.params.get('check_formats') is True:
2253 info_dict['thumbnails'] = LazyList(check_thumbnails(thumbnails[::-1]), reverse=True)
2254 else:
2255 info_dict['thumbnails'] = thumbnails
2256
2257 def process_video_result(self, info_dict, download=True):
2258 assert info_dict.get('_type', 'video') == 'video'
2259
2260 if 'id' not in info_dict:
2261 raise ExtractorError('Missing "id" field in extractor result')
2262 if 'title' not in info_dict:
2263 raise ExtractorError('Missing "title" field in extractor result',
2264 video_id=info_dict['id'], ie=info_dict['extractor'])
2265
2266 def report_force_conversion(field, field_not, conversion):
2267 self.report_warning(
2268 '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
2269 % (field, field_not, conversion))
2270
2271 def sanitize_string_field(info, string_field):
2272 field = info.get(string_field)
2273 if field is None or isinstance(field, compat_str):
2274 return
2275 report_force_conversion(string_field, 'a string', 'string')
2276 info[string_field] = compat_str(field)
2277
2278 def sanitize_numeric_fields(info):
2279 for numeric_field in self._NUMERIC_FIELDS:
2280 field = info.get(numeric_field)
2281 if field is None or isinstance(field, compat_numeric_types):
2282 continue
2283 report_force_conversion(numeric_field, 'numeric', 'int')
2284 info[numeric_field] = int_or_none(field)
2285
2286 sanitize_string_field(info_dict, 'id')
2287 sanitize_numeric_fields(info_dict)
2288
2289 if 'playlist' not in info_dict:
2290 # It isn't part of a playlist
2291 info_dict['playlist'] = None
2292 info_dict['playlist_index'] = None
2293
2294 self._sanitize_thumbnails(info_dict)
2295
2296 thumbnail = info_dict.get('thumbnail')
2297 thumbnails = info_dict.get('thumbnails')
2298 if thumbnail:
2299 info_dict['thumbnail'] = sanitize_url(thumbnail)
2300 elif thumbnails:
2301 info_dict['thumbnail'] = thumbnails[-1]['url']
2302
2303 if info_dict.get('display_id') is None and 'id' in info_dict:
2304 info_dict['display_id'] = info_dict['id']
2305
2306 if info_dict.get('duration') is not None:
2307 info_dict['duration_string'] = formatSeconds(info_dict['duration'])
2308
2309 for ts_key, date_key in (
2310 ('timestamp', 'upload_date'),
2311 ('release_timestamp', 'release_date'),
2312 ):
2313 if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
2314 # Working around out-of-range timestamp values (e.g. negative ones on Windows,
2315 # see http://bugs.python.org/issue1646728)
2316 try:
2317 upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
2318 info_dict[date_key] = upload_date.strftime('%Y%m%d')
2319 except (ValueError, OverflowError, OSError):
2320 pass
2321
2322 live_keys = ('is_live', 'was_live')
2323 live_status = info_dict.get('live_status')
2324 if live_status is None:
2325 for key in live_keys:
2326 if info_dict.get(key) is False:
2327 continue
2328 if info_dict.get(key):
2329 live_status = key
2330 break
2331 if all(info_dict.get(key) is False for key in live_keys):
2332 live_status = 'not_live'
2333 if live_status:
2334 info_dict['live_status'] = live_status
2335 for key in live_keys:
2336 if info_dict.get(key) is None:
2337 info_dict[key] = (live_status == key)
2338
2339 # Auto generate title fields corresponding to the *_number fields when missing
2340 # in order to always have clean titles. This is very common for TV series.
2341 for field in ('chapter', 'season', 'episode'):
2342 if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
2343 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
2344
2345 for cc_kind in ('subtitles', 'automatic_captions'):
2346 cc = info_dict.get(cc_kind)
2347 if cc:
2348 for _, subtitle in cc.items():
2349 for subtitle_format in subtitle:
2350 if subtitle_format.get('url'):
2351 subtitle_format['url'] = sanitize_url(subtitle_format['url'])
2352 if subtitle_format.get('ext') is None:
2353 subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
2354
2355 automatic_captions = info_dict.get('automatic_captions')
2356 subtitles = info_dict.get('subtitles')
2357
2358 info_dict['requested_subtitles'] = self.process_subtitles(
2359 info_dict['id'], subtitles, automatic_captions)
2360
2361 if info_dict.get('formats') is None:
2362 # There's only one format available
2363 formats = [info_dict]
2364 else:
2365 formats = info_dict['formats']
2366
2367 info_dict['__has_drm'] = any(f.get('has_drm') for f in formats)
2368 if not self.params.get('allow_unplayable_formats'):
2369 formats = [f for f in formats if not f.get('has_drm')]
2370
2371 if info_dict.get('is_live'):
2372 get_from_start = bool(self.params.get('live_from_start'))
2373 formats = [f for f in formats if bool(f.get('is_from_start')) == get_from_start]
2374
2375 if not formats:
2376 self.raise_no_formats(info_dict)
2377
2378 def is_wellformed(f):
2379 url = f.get('url')
2380 if not url:
2381 self.report_warning(
2382 '"url" field is missing or empty - skipping format, '
2383 'there is an error in the extractor')
2384 return False
2385 if isinstance(url, bytes):
2386 sanitize_string_field(f, 'url')
2387 return True
2388
2389 # Filter out malformed formats for better extraction robustness
2390 formats = list(filter(is_wellformed, formats))
2391
2392 formats_dict = {}
2393
2394 # We check that all the formats have the format and format_id fields
2395 for i, format in enumerate(formats):
2396 sanitize_string_field(format, 'format_id')
2397 sanitize_numeric_fields(format)
2398 format['url'] = sanitize_url(format['url'])
2399 if not format.get('format_id'):
2400 format['format_id'] = compat_str(i)
2401 else:
2402 # Sanitize format_id from characters used in format selector expression
2403 format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
2404 format_id = format['format_id']
2405 if format_id not in formats_dict:
2406 formats_dict[format_id] = []
2407 formats_dict[format_id].append(format)
2408
2409 # Make sure all formats have unique format_id
2410 common_exts = set(itertools.chain(*self._format_selection_exts.values()))
2411 for format_id, ambiguous_formats in formats_dict.items():
2412 ambiguous_id = len(ambiguous_formats) > 1
2413 for i, format in enumerate(ambiguous_formats):
2414 if ambiguous_id:
2415 format['format_id'] = '%s-%d' % (format_id, i)
2416 if format.get('ext') is None:
2417 format['ext'] = determine_ext(format['url']).lower()
2418 # Ensure there is no conflict between id and ext in format selection
2419 # See https://github.com/yt-dlp/yt-dlp/issues/1282
2420 if format['format_id'] != format['ext'] and format['format_id'] in common_exts:
2421 format['format_id'] = 'f%s' % format['format_id']
2422
2423 for i, format in enumerate(formats):
2424 if format.get('format') is None:
2425 format['format'] = '{id} - {res}{note}'.format(
2426 id=format['format_id'],
2427 res=self.format_resolution(format),
2428 note=format_field(format, 'format_note', ' (%s)'),
2429 )
2430 if format.get('protocol') is None:
2431 format['protocol'] = determine_protocol(format)
2432 if format.get('resolution') is None:
2433 format['resolution'] = self.format_resolution(format, default=None)
2434 if format.get('dynamic_range') is None and format.get('vcodec') != 'none':
2435 format['dynamic_range'] = 'SDR'
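# tbr is the average bitrate in KBit/s, so approximate size in bytes
# = duration [s] * tbr * 1024 / 8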
2436 if (info_dict.get('duration') and format.get('tbr')
2437 and not format.get('filesize') and not format.get('filesize_approx')):
2438 format['filesize_approx'] = info_dict['duration'] * format['tbr'] * (1024 / 8)
2439
2440 # Add HTTP headers, so that external programs can use them from the
2441 # json output
2442 full_format_info = info_dict.copy()
2443 full_format_info.update(format)
2444 format['http_headers'] = self._calc_headers(full_format_info)
2445 # Remove private housekeeping stuff
2446 if '__x_forwarded_for_ip' in info_dict:
2447 del info_dict['__x_forwarded_for_ip']
2448
2449 # TODO Central sorting goes here
2450
2451 if self.params.get('check_formats') is True:
2452 formats = LazyList(self._check_formats(formats[::-1]), reverse=True)
2453
2454 if not formats or formats[0] is not info_dict:
2455 # Only set the 'formats' field if the original info_dict lists them;
2456 # otherwise we end up with a circular reference: the first (and only)
2457 # element in the 'formats' field of info_dict would be info_dict itself,
2458 # which can't be exported to JSON
2459 info_dict['formats'] = formats
2460
2461 info_dict, _ = self.pre_process(info_dict)
2462
2463 # The pre-processors may have modified the formats
2464 formats = info_dict.get('formats', [info_dict])
2465
2466 list_only = self.params.get('simulate') is None and (
2467 self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles'))
2468 interactive_format_selection = not list_only and self.format_selector == '-'
2469 if self.params.get('list_thumbnails'):
2470 self.list_thumbnails(info_dict)
2471 if self.params.get('listsubtitles'):
2472 if 'automatic_captions' in info_dict:
2473 self.list_subtitles(
2474 info_dict['id'], automatic_captions, 'automatic captions')
2475 self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
2476 if self.params.get('listformats') or interactive_format_selection:
2477 self.list_formats(info_dict)
2478 if list_only:
2479 # Without this printing, -F --print-json will not work
2480 self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True)
2481 return
2482
2483 format_selector = self.format_selector
2484 if format_selector is None:
2485 req_format = self._default_format_spec(info_dict, download=download)
2486 self.write_debug('Default format spec: %s' % req_format)
2487 format_selector = self.build_format_selector(req_format)
2488
2489 while True:
2490 if interactive_format_selection:
2491 req_format = input(
2492 self._format_screen('\nEnter format selector: ', self.Styles.EMPHASIS))
2493 try:
2494 format_selector = self.build_format_selector(req_format)
2495 except SyntaxError as err:
2496 self.report_error(err, tb=False, is_error=False)
2497 continue
2498
2499 # While in format selection we may need access to the original
2500 # format set in order to calculate some metrics or do some processing.
2501 # For now we need to be able to guess whether original formats provided
2502 # by extractor are incomplete or not (i.e. whether extractor provides only
2503 # video-only or audio-only formats) for proper formats selection for
2504 # extractors with such incomplete formats (see
2505 # https://github.com/ytdl-org/youtube-dl/pull/5556).
2506 # Since formats may be filtered during format selection and may not match
2507 # the original formats the results may be incorrect. Thus original formats
2508 # or pre-calculated metrics should be passed to format selection routines
2509 # as well.
2510 # We will pass a context object containing all necessary additional data
2511 # instead of just formats.
2512 # This fixes incorrect format selection issue (see
2513 # https://github.com/ytdl-org/youtube-dl/issues/10083).
2514 incomplete_formats = (
2515 # All formats are video-only or
2516 all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
2517 # all formats are audio-only
2518 or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))
2519
2520 ctx = {
2521 'formats': formats,
2522 'incomplete_formats': incomplete_formats,
2523 }
2524
2525 formats_to_download = list(format_selector(ctx))
2526 if interactive_format_selection and not formats_to_download:
2527 self.report_error('Requested format is not available', tb=False, is_error=False)
2528 continue
2529 break
2530
2531 if not formats_to_download:
2532 if not self.params.get('ignore_no_formats_error'):
2533 raise ExtractorError('Requested format is not available', expected=True,
2534 video_id=info_dict['id'], ie=info_dict['extractor'])
2535 else:
2536 self.report_warning('Requested format is not available')
2537 # Process what we can, even without any available formats.
2538 self.process_info(dict(info_dict))
2539 elif download:
2540 self.to_screen(
2541 '[info] %s: Downloading %d format(s): %s' % (
2542 info_dict['id'], len(formats_to_download),
2543 ", ".join([f['format_id'] for f in formats_to_download])))
2544 for fmt in formats_to_download:
2545 new_info = dict(info_dict)
2546 # Save a reference to the original info_dict so that it can be modified in process_info if needed
2547 new_info['__original_infodict'] = info_dict
2548 new_info.update(fmt)
2549 self.process_info(new_info)
2550 # We update the info dict with the selected best quality format (backwards compatibility)
2551 if formats_to_download:
2552 info_dict.update(formats_to_download[-1])
2553 return info_dict
2554
2555 def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
2556 """Select the requested subtitles and their format"""
2557 available_subs = {}
2558 if normal_subtitles and self.params.get('writesubtitles'):
2559 available_subs.update(normal_subtitles)
2560 if automatic_captions and self.params.get('writeautomaticsub'):
2561 for lang, cap_info in automatic_captions.items():
2562 if lang not in available_subs:
2563 available_subs[lang] = cap_info
2564
2565 if ((not self.params.get('writesubtitles')
2566 and not self.params.get('writeautomaticsub'))
2567 or not available_subs):
2568 return None
2569
2570 all_sub_langs = available_subs.keys()
2571 if self.params.get('allsubtitles', False):
2572 requested_langs = all_sub_langs
2573 elif self.params.get('subtitleslangs', False):
2574 # A list is used so that the order of languages will be the same as
2575 # given in subtitleslangs. See https://github.com/yt-dlp/yt-dlp/issues/1041
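# Each entry is a full-match regex; a '-' prefix discards matches,
# e.g. ['all', '-live_chat'] selects every language except live_chat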
2576 requested_langs = []
2577 for lang_re in self.params.get('subtitleslangs'):
2578 if lang_re == 'all':
2579 requested_langs.extend(all_sub_langs)
2580 continue
2581 discard = lang_re[0] == '-'
2582 if discard:
2583 lang_re = lang_re[1:]
2584 current_langs = filter(re.compile(lang_re + '$').match, all_sub_langs)
2585 if discard:
2586 for lang in current_langs:
2587 while lang in requested_langs:
2588 requested_langs.remove(lang)
2589 else:
2590 requested_langs.extend(current_langs)
2591 requested_langs = orderedSet(requested_langs)
2592 elif 'en' in available_subs:
2593 requested_langs = ['en']
2594 else:
2595 requested_langs = [list(all_sub_langs)[0]]
2596 if requested_langs:
2597 self.write_debug('Downloading subtitles: %s' % ', '.join(requested_langs))
2598
2599 formats_query = self.params.get('subtitlesformat', 'best')
2600 formats_preference = formats_query.split('/') if formats_query else []
2601 subs = {}
2602 for lang in requested_langs:
2603 formats = available_subs.get(lang)
2604 if formats is None:
2605 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
2606 continue
2607 for ext in formats_preference:
2608 if ext == 'best':
2609 f = formats[-1]
2610 break
2611 matches = list(filter(lambda f: f['ext'] == ext, formats))
2612 if matches:
2613 f = matches[-1]
2614 break
2615 else:
2616 f = formats[-1]
2617 self.report_warning(
2618 'No subtitle format found matching "%s" for language %s, '
2619 'using %s' % (formats_query, lang, f['ext']))
2620 subs[lang] = f
2621 return subs
2622
2623 def __forced_printings(self, info_dict, filename, incomplete):
2624 def print_mandatory(field, actual_field=None):
2625 if actual_field is None:
2626 actual_field = field
2627 if (self.params.get('force%s' % field, False)
2628 and (not incomplete or info_dict.get(actual_field) is not None)):
2629 self.to_stdout(info_dict[actual_field])
2630
2631 def print_optional(field):
2632 if (self.params.get('force%s' % field, False)
2633 and info_dict.get(field) is not None):
2634 self.to_stdout(info_dict[field])
2635
2636 info_dict = info_dict.copy()
2637 if filename is not None:
2638 info_dict['filename'] = filename
2639 if info_dict.get('requested_formats') is not None:
2640 # For RTMP URLs, also include the playpath
2641 info_dict['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats'])
2642 elif 'url' in info_dict:
2643 info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '')
2644
2645 if self.params.get('forceprint') or self.params.get('forcejson'):
2646 self.post_extract(info_dict)
2647 for tmpl in self.params.get('forceprint', []):
2648 mobj = re.match(r'\w+(=?)$', tmpl)
2649 if mobj and mobj.group(1):
2650 tmpl = f'{tmpl[:-1]} = %({tmpl[:-1]})s'
2651 elif mobj:
2652 tmpl = '%({})s'.format(tmpl)
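# e.g. 'title' becomes '%(title)s' and 'title=' becomes 'title = %(title)s'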
2653 self.to_stdout(self.evaluate_outtmpl(tmpl, info_dict))
2654
2655 print_mandatory('title')
2656 print_mandatory('id')
2657 print_mandatory('url', 'urls')
2658 print_optional('thumbnail')
2659 print_optional('description')
2660 print_optional('filename')
2661 if self.params.get('forceduration') and info_dict.get('duration') is not None:
2662 self.to_stdout(formatSeconds(info_dict['duration']))
2663 print_mandatory('format')
2664
2665 if self.params.get('forcejson'):
2666 self.to_stdout(json.dumps(self.sanitize_info(info_dict)))
2667
2668 def dl(self, name, info, subtitle=False, test=False):
2669 if not info.get('url'):
2670 self.raise_no_formats(info, True)
2671
2672 if test:
2673 verbose = self.params.get('verbose')
2674 params = {
2675 'test': True,
2676 'quiet': self.params.get('quiet') or not verbose,
2677 'verbose': verbose,
2678 'noprogress': not verbose,
2679 'nopart': True,
2680 'skip_unavailable_fragments': False,
2681 'keep_fragments': False,
2682 'overwrites': True,
2683 '_no_ytdl_file': True,
2684 }
2685 else:
2686 params = self.params
2687 fd = get_suitable_downloader(info, params, to_stdout=(name == '-'))(self, params)
2688 if not test:
2689 for ph in self._progress_hooks:
2690 fd.add_progress_hook(ph)
2691 urls = '", "'.join([f['url'] for f in info.get('requested_formats', [])] or [info['url']])
2692 self.write_debug('Invoking downloader on "%s"' % urls)
2693
2694 # Note: Ideally info should be deep-copied so that hooks cannot modify it,
2695 # but it may contain objects that are not deep-copyable
2696 new_info = self._copy_infodict(info)
2697 if new_info.get('http_headers') is None:
2698 new_info['http_headers'] = self._calc_headers(new_info)
2699 return fd.download(name, new_info, subtitle)
2700
2701 def process_info(self, info_dict):
2702 """Process a single resolved IE result."""
2703
2704 assert info_dict.get('_type', 'video') == 'video'
2705
2706 max_downloads = self.params.get('max_downloads')
2707 if max_downloads is not None:
2708 if self._num_downloads >= int(max_downloads):
2709 raise MaxDownloadsReached()
2710
2711 if info_dict.get('is_live') and not self.params.get('live_from_start'):
2712 info_dict['title'] += ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
2713
2714 # TODO: backward compatibility, to be removed
2715 info_dict['fulltitle'] = info_dict['title']
2716
2717 if 'format' not in info_dict and 'ext' in info_dict:
2718 info_dict['format'] = info_dict['ext']
2719
2720 if self._match_entry(info_dict) is not None:
2721 return
2722
2723 self.post_extract(info_dict)
2724 self._num_downloads += 1
2725
2726 # info_dict['_filename'] needs to be set for backward compatibility
2727 info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
2728 temp_filename = self.prepare_filename(info_dict, 'temp')
2729 files_to_move = {}
2730
2731 # Forced printings
2732 self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))
2733
2734 if self.params.get('simulate'):
2735 if self.params.get('force_write_download_archive', False):
2736 self.record_download_archive(info_dict)
2737 # Do nothing else if in simulate mode
2738 return
2739
2740 if full_filename is None:
2741 return
2742 if not self._ensure_dir_exists(encodeFilename(full_filename)):
2743 return
2744 if not self._ensure_dir_exists(encodeFilename(temp_filename)):
2745 return
2746
2747 if self._write_description('video', info_dict,
2748 self.prepare_filename(info_dict, 'description')) is None:
2749 return
2750
2751 sub_files = self._write_subtitles(info_dict, temp_filename)
2752 if sub_files is None:
2753 return
2754 files_to_move.update(dict(sub_files))
2755
2756 thumb_files = self._write_thumbnails(
2757 'video', info_dict, temp_filename, self.prepare_filename(info_dict, 'thumbnail'))
2758 if thumb_files is None:
2759 return
2760 files_to_move.update(dict(thumb_files))
2761
2762 infofn = self.prepare_filename(info_dict, 'infojson')
2763 _infojson_written = self._write_info_json('video', info_dict, infofn)
2764 if _infojson_written:
2765 info_dict['infojson_filename'] = infofn
2766 # For backward compatibility, even though it was a private field
2767 info_dict['__infojson_filename'] = infofn
2768 elif _infojson_written is None:
2769 return
2770
2771 # Note: Annotations are deprecated
2772 annofn = None
2773 if self.params.get('writeannotations', False):
2774 annofn = self.prepare_filename(info_dict, 'annotation')
2775 if annofn:
2776 if not self._ensure_dir_exists(encodeFilename(annofn)):
2777 return
2778 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(annofn)):
2779 self.to_screen('[info] Video annotations are already present')
2780 elif not info_dict.get('annotations'):
2781 self.report_warning('There are no annotations to write.')
2782 else:
2783 try:
2784 self.to_screen('[info] Writing video annotations to: ' + annofn)
2785 with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
2786 annofile.write(info_dict['annotations'])
2787 except (KeyError, TypeError):
2788 self.report_warning('There are no annotations to write.')
2789 except (OSError, IOError):
2790 self.report_error('Cannot write annotations file: ' + annofn)
2791 return
2792
2793 # Write internet shortcut files
2794 def _write_link_file(link_type):
2795 if 'webpage_url' not in info_dict:
2796 self.report_error('Cannot write internet shortcut file because the "webpage_url" field is missing in the media information')
2797 return False
2798 linkfn = replace_extension(self.prepare_filename(info_dict, 'link'), link_type, info_dict.get('ext'))
2799 if not self._ensure_dir_exists(encodeFilename(linkfn)):
2800 return False
2801 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)):
2802 self.to_screen(f'[info] Internet shortcut (.{link_type}) is already present')
2803 return True
2804 try:
2805 self.to_screen(f'[info] Writing internet shortcut (.{link_type}) to: {linkfn}')
2806 with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8',
2807 newline='\r\n' if link_type == 'url' else '\n') as linkfile:
2808 template_vars = {'url': iri_to_uri(info_dict['webpage_url'])}
2809 if link_type == 'desktop':
2810 template_vars['filename'] = linkfn[:-(len(link_type) + 1)]
2811 linkfile.write(LINK_TEMPLATES[link_type] % template_vars)
2812 except (OSError, IOError):
2813 self.report_error(f'Cannot write internet shortcut {linkfn}')
2814 return False
2815 return True
2816
2817 write_links = {
2818 'url': self.params.get('writeurllink'),
2819 'webloc': self.params.get('writewebloclink'),
2820 'desktop': self.params.get('writedesktoplink'),
2821 }
2822 if self.params.get('writelink'):
2823 link_type = ('webloc' if sys.platform == 'darwin'
2824 else 'desktop' if sys.platform.startswith('linux')
2825 else 'url')
2826 write_links[link_type] = True
2827
2828 if any(should_write and not _write_link_file(link_type)
2829 for link_type, should_write in write_links.items()):
2830 return
2831
2832 try:
2833 info_dict, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
2834 except PostProcessingError as err:
2835 self.report_error('Preprocessing: %s' % str(err))
2836 return
2837
2838 must_record_download_archive = False
2839 if self.params.get('skip_download', False):
2840 info_dict['filepath'] = temp_filename
2841 info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
2842 info_dict['__files_to_move'] = files_to_move
2843 info_dict = self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict)
2844 else:
2845 # Download
2846 info_dict.setdefault('__postprocessors', [])
2847 try:
2848
2849 def existing_file(*filepaths):
2850 ext = info_dict.get('ext')
2851 final_ext = self.params.get('final_ext', ext)
2852 existing_files = []
2853 for file in orderedSet(filepaths):
2854 if final_ext != ext:
2855 converted = replace_extension(file, final_ext, ext)
2856 if os.path.exists(encodeFilename(converted)):
2857 existing_files.append(converted)
2858 if os.path.exists(encodeFilename(file)):
2859 existing_files.append(file)
2860
2861 if not existing_files or self.params.get('overwrites', False):
2862 for file in orderedSet(existing_files):
2863 self.report_file_delete(file)
2864 os.remove(encodeFilename(file))
2865 return None
2866
2867 info_dict['ext'] = os.path.splitext(existing_files[0])[1][1:]
2868 return existing_files[0]
2869
2870 success = True
2871 if info_dict.get('requested_formats') is not None:
2872
2873 def compatible_formats(formats):
2874 # TODO: some formats actually allow this (mkv, webm, ogg, mp4), but not all of them.
2875 video_formats = [format for format in formats if format.get('vcodec') != 'none']
2876 audio_formats = [format for format in formats if format.get('acodec') != 'none']
2877 if len(video_formats) > 2 or len(audio_formats) > 2:
2878 return False
2879
2880 # Check extension
exts = {format.get('ext') for format in formats}
COMPATIBLE_EXTS = (
    {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'},
    {'webm'},
)
2886 for ext_sets in COMPATIBLE_EXTS:
2887 if ext_sets.issuperset(exts):
2888 return True
2889 # TODO: Check acodec/vcodec
2890 return False
2891
2892 requested_formats = info_dict['requested_formats']
2893 old_ext = info_dict['ext']
2894 if self.params.get('merge_output_format') is None:
2895 if not compatible_formats(requested_formats):
2896 info_dict['ext'] = 'mkv'
2897 self.report_warning(
2898 'Requested formats are incompatible for merge and will be merged into mkv')
2899 if (info_dict['ext'] == 'webm'
2900 and info_dict.get('thumbnails')
# Check with type() instead of pp_key, __name__ or isinstance,
# since we don't want any custom PPs (such as subclasses of EmbedThumbnailPP) to trigger this
2903 and any(type(pp) == EmbedThumbnailPP for pp in self._pps['post_process'])):
2904 info_dict['ext'] = 'mkv'
2905 self.report_warning(
2906 'webm doesn\'t support embedding a thumbnail, mkv will be used')
2907 new_ext = info_dict['ext']
2908
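# correct_ext() rewrites the filename to carry the (possibly changed) merge
# container extension, stripping the old extension only when it matches the
# known old/new values; '-' (stdout) is passed through untouched.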
2909 def correct_ext(filename, ext=new_ext):
2910 if filename == '-':
2911 return filename
2912 filename_real_ext = os.path.splitext(filename)[1][1:]
2913 filename_wo_ext = (
2914 os.path.splitext(filename)[0]
2915 if filename_real_ext in (old_ext, new_ext)
2916 else filename)
2917 return '%s.%s' % (filename_wo_ext, ext)
2918
2919 # Ensure filename always has a correct extension for successful merge
2920 full_filename = correct_ext(full_filename)
2921 temp_filename = correct_ext(temp_filename)
2922 dl_filename = existing_file(full_filename, temp_filename)
2923 info_dict['__real_download'] = False
2924
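# Three paths from here: reuse an already-downloaded file, let a single
# downloader (e.g. ffmpeg for HLS) fetch all formats at once, or download
# each format separately and queue FFmpegMergerPP to merge them afterwards.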
2925 downloaded = []
2926 merger = FFmpegMergerPP(self)
2927
2928 fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
2929 if dl_filename is not None:
2930 self.report_file_already_downloaded(dl_filename)
2931 elif fd:
for f in requested_formats if fd != FFmpegFD else []:  # FFmpegFD downloads all requested formats itself, leaving no per-format files
2933 f['filepath'] = fname = prepend_extension(
2934 correct_ext(temp_filename, info_dict['ext']),
2935 'f%s' % f['format_id'], info_dict['ext'])
2936 downloaded.append(fname)
2937 info_dict['url'] = '\n'.join(f['url'] for f in requested_formats)
2938 success, real_download = self.dl(temp_filename, info_dict)
2939 info_dict['__real_download'] = real_download
2940 else:
2941 if self.params.get('allow_unplayable_formats'):
2942 self.report_warning(
2943 'You have requested merging of multiple formats '
2944 'while also allowing unplayable formats to be downloaded. '
2945 'The formats won\'t be merged to prevent data corruption.')
2946 elif not merger.available:
2947 self.report_warning(
2948 'You have requested merging of multiple formats but ffmpeg is not installed. '
2949 'The formats won\'t be merged.')
2950
2951 if temp_filename == '-':
2952 reason = ('using a downloader other than ffmpeg' if FFmpegFD.can_merge_formats(info_dict, self.params)
2953 else 'but the formats are incompatible for simultaneous download' if merger.available
2954 else 'but ffmpeg is not installed')
2955 self.report_warning(
2956 f'You have requested downloading multiple formats to stdout {reason}. '
2957 'The formats will be streamed one after the other')
2958 fname = temp_filename
2959 for f in requested_formats:
2960 new_info = dict(info_dict)
2961 del new_info['requested_formats']
2962 new_info.update(f)
2963 if temp_filename != '-':
2964 fname = prepend_extension(
2965 correct_ext(temp_filename, new_info['ext']),
2966 'f%s' % f['format_id'], new_info['ext'])
2967 if not self._ensure_dir_exists(fname):
2968 return
2969 f['filepath'] = fname
2970 downloaded.append(fname)
2971 partial_success, real_download = self.dl(fname, new_info)
2972 info_dict['__real_download'] = info_dict['__real_download'] or real_download
2973 success = success and partial_success
2974
2975 if downloaded and merger.available and not self.params.get('allow_unplayable_formats'):
2976 info_dict['__postprocessors'].append(merger)
2977 info_dict['__files_to_merge'] = downloaded
# Even if nothing new was downloaded, the merge itself only happens now, so treat it as a real download
2979 info_dict['__real_download'] = True
2980 else:
2981 for file in downloaded:
2982 files_to_move[file] = None
2983 else:
2984 # Just a single file
2985 dl_filename = existing_file(full_filename, temp_filename)
2986 if dl_filename is None or dl_filename == temp_filename:
2987 # dl_filename == temp_filename could mean that the file was partially downloaded with --no-part.
2988 # So we should try to resume the download
2989 success, real_download = self.dl(temp_filename, info_dict)
2990 info_dict['__real_download'] = real_download
2991 else:
2992 self.report_file_already_downloaded(dl_filename)
2993
2994 dl_filename = dl_filename or temp_filename
2995 info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
2996
2997 except network_exceptions as err:
2998 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
2999 return
3000 except (OSError, IOError) as err:
3001 raise UnavailableVideoError(err)
3002 except (ContentTooShortError, ) as err:
3003 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
3004 return
3005
3006 if success and full_filename != '-':
3007
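# fixup() applies post-download corrections according to the 'fixup' policy:
# 'never'/'ignore' skip entirely, 'warn' only reports problems,
# 'detect_or_warn' (the default) fixes fresh downloads and warns otherwise,
# and 'force' always attempts the fix.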
3008 def fixup():
3009 do_fixup = True
3010 fixup_policy = self.params.get('fixup')
3011 vid = info_dict['id']
3012
3013 if fixup_policy in ('ignore', 'never'):
3014 return
3015 elif fixup_policy == 'warn':
3016 do_fixup = False
3017 elif fixup_policy != 'force':
3018 assert fixup_policy in ('detect_or_warn', None)
3019 if not info_dict.get('__real_download'):
3020 do_fixup = False
3021
3022 def ffmpeg_fixup(cndn, msg, cls):
3023 if not cndn:
3024 return
3025 if not do_fixup:
3026 self.report_warning(f'{vid}: {msg}')
3027 return
3028 pp = cls(self)
3029 if pp.available:
3030 info_dict['__postprocessors'].append(pp)
3031 else:
3032 self.report_warning(f'{vid}: {msg}. Install ffmpeg to fix this automatically')
3033
3034 stretched_ratio = info_dict.get('stretched_ratio')
3035 ffmpeg_fixup(
3036 stretched_ratio not in (1, None),
3037 f'Non-uniform pixel ratio {stretched_ratio}',
3038 FFmpegFixupStretchedPP)
3039
3040 ffmpeg_fixup(
3041 (info_dict.get('requested_formats') is None
3042 and info_dict.get('container') == 'm4a_dash'
3043 and info_dict.get('ext') == 'm4a'),
3044 'writing DASH m4a. Only some players support this container',
3045 FFmpegFixupM4aPP)
3046
3047 downloader = get_suitable_downloader(info_dict, self.params) if 'protocol' in info_dict else None
3048 downloader = downloader.__name__ if downloader else None
3049
3050 if info_dict.get('requested_formats') is None: # Not necessary if doing merger
3051 ffmpeg_fixup(downloader == 'HlsFD',
3052 'Possible MPEG-TS in MP4 container or malformed AAC timestamps',
3053 FFmpegFixupM3u8PP)
3054 ffmpeg_fixup(info_dict.get('is_live') and downloader == 'DashSegmentsFD',
3055 'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP)
3056
3057 ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'Malformed timestamps detected', FFmpegFixupTimestampPP)
3058 ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'Malformed duration detected', FFmpegFixupDurationPP)
3059
3060 fixup()
3061 try:
3062 info_dict = self.post_process(dl_filename, info_dict, files_to_move)
3063 except PostProcessingError as err:
3064 self.report_error('Postprocessing: %s' % str(err))
3065 return
3066 try:
3067 for ph in self._post_hooks:
3068 ph(info_dict['filepath'])
3069 except Exception as err:
3070 self.report_error('post hooks: %s' % str(err))
3071 return
3072 must_record_download_archive = True
3073
3074 if must_record_download_archive or self.params.get('force_write_download_archive', False):
3075 self.record_download_archive(info_dict)
3076 max_downloads = self.params.get('max_downloads')
3077 if max_downloads is not None and self._num_downloads >= int(max_downloads):
3078 raise MaxDownloadsReached()
3079
3080 def __download_wrapper(self, func):
3081 @functools.wraps(func)
3082 def wrapper(*args, **kwargs):
3083 try:
3084 res = func(*args, **kwargs)
3085 except UnavailableVideoError as e:
3086 self.report_error(e)
3087 except MaxDownloadsReached as e:
3088 self.to_screen(f'[info] {e}')
3089 raise
3090 except DownloadCancelled as e:
3091 self.to_screen(f'[info] {e}')
3092 if not self.params.get('break_per_url'):
3093 raise
3094 else:
3095 if self.params.get('dump_single_json', False):
3096 self.post_extract(res)
3097 self.to_stdout(json.dumps(self.sanitize_info(res)))
3098 return wrapper
3099
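# A minimal usage sketch of download() (illustrative only - the URL and the
# 'outtmpl' value below are hypothetical, not taken from this module):
#
#     import yt_dlp
#     with yt_dlp.YoutubeDL({'outtmpl': '%(title)s.%(ext)s'}) as ydl:
#         retcode = ydl.download(['https://example.com/watch?v=hypothetical'])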
3100 def download(self, url_list):
3101 """Download a given list of URLs."""
3102 url_list = variadic(url_list) # Passing a single URL is a common mistake
3103 outtmpl = self.outtmpl_dict['default']
3104 if (len(url_list) > 1
3105 and outtmpl != '-'
3106 and '%' not in outtmpl
3107 and self.params.get('max_downloads') != 1):
3108 raise SameFileError(outtmpl)
3109
3110 for url in url_list:
3111 self.__download_wrapper(self.extract_info)(
3112 url, force_generic_extractor=self.params.get('force_generic_extractor', False))
3113
3114 return self._download_retcode
3115
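# Re-runs processing from a previously written info.json (--load-info-json),
# falling back to a fresh extraction of 'webpage_url' if that fails.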
3116 def download_with_info_file(self, info_filename):
3117 with contextlib.closing(fileinput.FileInput(
3118 [info_filename], mode='r',
3119 openhook=fileinput.hook_encoded('utf-8'))) as f:
# FileInput doesn't have a read method, so json.load cannot be used directly
3121 info = self.sanitize_info(json.loads('\n'.join(f)), self.params.get('clean_infojson', True))
3122 try:
3123 self.__download_wrapper(self.process_ie_result)(info, download=True)
3124 except (DownloadError, EntryNotInPlaylist, ReExtractInfo) as e:
3125 if not isinstance(e, EntryNotInPlaylist):
3126 self.to_stderr('\r')
3127 webpage_url = info.get('webpage_url')
3128 if webpage_url is not None:
3129 self.report_warning(f'The info failed to download: {e}; trying with URL {webpage_url}')
3130 return self.download([webpage_url])
3131 else:
3132 raise
3133 return self._download_retcode
3134
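# Illustrative sketch (hypothetical dict): with remove_private_keys=True,
#     {'_type': 'video', 'id': 'x', 'filepath': '/tmp/x.mp4', 'formats': []}
# keeps '_type' and 'id', drops the private 'filepath' key and the empty
# 'formats' list, and gains an 'epoch' timestamp.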
3135 @staticmethod
3136 def sanitize_info(info_dict, remove_private_keys=False):
3137 ''' Sanitize the infodict for converting to json '''
3138 if info_dict is None:
3139 return info_dict
3140 info_dict.setdefault('epoch', int(time.time()))
3141 remove_keys = {'__original_infodict'} # Always remove this since this may contain a copy of the entire dict
3142 keep_keys = ['_type'] # Always keep this to facilitate load-info-json
3143 if remove_private_keys:
3144 remove_keys |= {
3145 'requested_formats', 'requested_subtitles', 'requested_entries', 'entries',
3146 'filepath', 'infojson_filename', 'original_url', 'playlist_autonumber',
3147 }
3148 empty_values = (None, {}, [], set(), tuple())
3149 reject = lambda k, v: k not in keep_keys and (
3150 k.startswith('_') or k in remove_keys or v in empty_values)
3151 else:
3152 reject = lambda k, v: k in remove_keys
3153
3154 def filter_fn(obj):
3155 if isinstance(obj, dict):
3156 return {k: filter_fn(v) for k, v in obj.items() if not reject(k, v)}
3157 elif isinstance(obj, (list, tuple, set, LazyList)):
3158 return list(map(filter_fn, obj))
3159 elif obj is None or isinstance(obj, (str, int, float, bool)):
3160 return obj
3161 else:
3162 return repr(obj)
3163
3164 return filter_fn(info_dict)
3165
3166 @staticmethod
3167 def filter_requested_info(info_dict, actually_filter=True):
3168 ''' Alias of sanitize_info for backward compatibility '''
3169 return YoutubeDL.sanitize_info(info_dict, actually_filter)
3170
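# run_pp() executes a single postprocessor: it passes the infodict through
# pp.run(), then either schedules the files the PP marked for deletion to be
# kept (--keep-video) or removes them from disk and from __files_to_move.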
3171 def run_pp(self, pp, infodict):
3172 files_to_delete = []
3173 if '__files_to_move' not in infodict:
3174 infodict['__files_to_move'] = {}
3175 try:
3176 files_to_delete, infodict = pp.run(infodict)
3177 except PostProcessingError as e:
# Errors are only swallowed when 'ignoreerrors' is exactly True, not 'only_download'
3179 if self.params.get('ignoreerrors') is True:
3180 self.report_error(e)
3181 return infodict
3182 raise
3183
3184 if not files_to_delete:
3185 return infodict
3186 if self.params.get('keepvideo', False):
3187 for f in files_to_delete:
3188 infodict['__files_to_move'].setdefault(f, '')
3189 else:
3190 for old_filename in set(files_to_delete):
3191 self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
3192 try:
3193 os.remove(encodeFilename(old_filename))
3194 except (IOError, OSError):
3195 self.report_warning('Unable to remove downloaded original file')
3196 if old_filename in infodict['__files_to_move']:
3197 del infodict['__files_to_move'][old_filename]
3198 return infodict
3199
3200 @staticmethod
3201 def post_extract(info_dict):
3202 def actual_post_extract(info_dict):
3203 if info_dict.get('_type') in ('playlist', 'multi_video'):
3204 for video_dict in info_dict.get('entries', {}):
3205 actual_post_extract(video_dict or {})
3206 return
3207
3208 post_extractor = info_dict.get('__post_extractor') or (lambda: {})
3209 extra = post_extractor().items()
3210 info_dict.update(extra)
3211 info_dict.pop('__post_extractor', None)
3212
3213 original_infodict = info_dict.get('__original_infodict') or {}
3214 original_infodict.update(extra)
3215 original_infodict.pop('__post_extractor', None)
3216
3217 actual_post_extract(info_dict or {})
3218
3219 def pre_process(self, ie_info, key='pre_process', files_to_move=None):
3220 info = dict(ie_info)
3221 info['__files_to_move'] = files_to_move or {}
3222 for pp in self._pps[key]:
3223 info = self.run_pp(pp, info)
3224 return info, info.pop('__files_to_move', None)
3225
3226 def post_process(self, filename, ie_info, files_to_move=None):
3227 """Run all the postprocessors on the given file."""
3228 info = dict(ie_info)
3229 info['filepath'] = filename
3230 info['__files_to_move'] = files_to_move or {}
3231
3232 for pp in ie_info.get('__postprocessors', []) + self._pps['post_process']:
3233 info = self.run_pp(pp, info)
3234 info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
3235 del info['__files_to_move']
3236 for pp in self._pps['after_move']:
3237 info = self.run_pp(pp, info)
3238 return info
3239
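# Illustrative (hypothetical values): {'extractor_key': 'Youtube', 'id': 'abc'}
# yields the archive entry 'youtube abc'.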
3240 def _make_archive_id(self, info_dict):
3241 video_id = info_dict.get('id')
3242 if not video_id:
3243 return
# Lower-cased for future-proofing against any change in case
# and for backwards compatibility with prior versions
3246 extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist
3247 if extractor is None:
3248 url = str_or_none(info_dict.get('url'))
3249 if not url:
3250 return
3251 # Try to find matching extractor for the URL and take its ie_key
3252 for ie_key, ie in self._ies.items():
3253 if ie.suitable(url):
3254 extractor = ie_key
3255 break
3256 else:
3257 return
3258 return '%s %s' % (extractor.lower(), video_id)
3259
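# The download archive is a plain-text file with one 'extractor video_id'
# entry per line, as produced by _make_archive_id() above.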
3260 def in_download_archive(self, info_dict):
3261 fn = self.params.get('download_archive')
3262 if fn is None:
3263 return False
3264
3265 vid_id = self._make_archive_id(info_dict)
3266 if not vid_id:
3267 return False # Incomplete video information
3268
3269 return vid_id in self.archive
3270
3271 def record_download_archive(self, info_dict):
3272 fn = self.params.get('download_archive')
3273 if fn is None:
3274 return
3275 vid_id = self._make_archive_id(info_dict)
3276 assert vid_id
3277 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
3278 archive_file.write(vid_id + '\n')
3279 self.archive.add(vid_id)
3280
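# Illustrative (hypothetical values): {'width': 1920, 'height': 1080} -> '1920x1080',
# {'height': 720} -> '720p', {'width': 640} -> '640x?', and a format with
# vcodec 'none' but a real acodec -> 'audio only'.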
3281 @staticmethod
3282 def format_resolution(format, default='unknown'):
3283 if format.get('vcodec') == 'none' and format.get('acodec') != 'none':
3284 return 'audio only'
3285 if format.get('resolution') is not None:
3286 return format['resolution']
3287 if format.get('width') and format.get('height'):
3288 return '%dx%d' % (format['width'], format['height'])
3289 elif format.get('height'):
3290 return '%sp' % format['height']
3291 elif format.get('width'):
3292 return '%dx?' % format['width']
3293 return default
3294
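# Builds the free-form 'note' column for the legacy (non-table) format list:
# codecs, bitrates, fps, sample rate and (approximate) filesize.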
3295 def _format_note(self, fdict):
3296 res = ''
3297 if fdict.get('ext') in ['f4f', 'f4m']:
3298 res += '(unsupported)'
3299 if fdict.get('language'):
3300 if res:
3301 res += ' '
3302 res += '[%s]' % fdict['language']
3303 if fdict.get('format_note') is not None:
3304 if res:
3305 res += ' '
3306 res += fdict['format_note']
3307 if fdict.get('tbr') is not None:
3308 if res:
3309 res += ', '
3310 res += '%4dk' % fdict['tbr']
3311 if fdict.get('container') is not None:
3312 if res:
3313 res += ', '
3314 res += '%s container' % fdict['container']
3315 if (fdict.get('vcodec') is not None
3316 and fdict.get('vcodec') != 'none'):
3317 if res:
3318 res += ', '
3319 res += fdict['vcodec']
3320 if fdict.get('vbr') is not None:
3321 res += '@'
3322 elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
3323 res += 'video@'
3324 if fdict.get('vbr') is not None:
3325 res += '%4dk' % fdict['vbr']
3326 if fdict.get('fps') is not None:
3327 if res:
3328 res += ', '
3329 res += '%sfps' % fdict['fps']
3330 if fdict.get('acodec') is not None:
3331 if res:
3332 res += ', '
3333 if fdict['acodec'] == 'none':
3334 res += 'video only'
3335 else:
3336 res += '%-5s' % fdict['acodec']
3337 elif fdict.get('abr') is not None:
3338 if res:
3339 res += ', '
3340 res += 'audio'
3341 if fdict.get('abr') is not None:
3342 res += '@%3dk' % fdict['abr']
3343 if fdict.get('asr') is not None:
3344 res += ' (%5dHz)' % fdict['asr']
3345 if fdict.get('filesize') is not None:
3346 if res:
3347 res += ', '
3348 res += format_bytes(fdict['filesize'])
3349 elif fdict.get('filesize_approx') is not None:
3350 if res:
3351 res += ', '
3352 res += '~' + format_bytes(fdict['filesize_approx'])
3353 return res
3354
3355 def _list_format_headers(self, *headers):
3356 if self.params.get('listformats_table', True) is not False:
3357 return [self._format_screen(header, self.Styles.HEADERS) for header in headers]
3358 return headers
3359
3360 def list_formats(self, info_dict):
3361 if not info_dict.get('formats') and not info_dict.get('url'):
3362 self.to_screen('%s has no formats' % info_dict['id'])
3363 return
3364 self.to_screen('[info] Available formats for %s:' % info_dict['id'])
3365
3366 formats = info_dict.get('formats', [info_dict])
3367 new_format = self.params.get('listformats_table', True) is not False
3368 if new_format:
3369 delim = self._format_screen('\u2502', self.Styles.DELIM, '|', test_encoding=True)
3370 table = [
3371 [
3372 self._format_screen(format_field(f, 'format_id'), self.Styles.ID),
3373 format_field(f, 'ext'),
3374 format_field(f, func=self.format_resolution, ignore=('audio only', 'images')),
3375 format_field(f, 'fps', '\t%d'),
3376 format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''),
3377 delim,
3378 format_field(f, 'filesize', ' \t%s', func=format_bytes) + format_field(f, 'filesize_approx', '~\t%s', func=format_bytes),
3379 format_field(f, 'tbr', '\t%dk'),
3380 shorten_protocol_name(f.get('protocol', '')),
3381 delim,
3382 format_field(f, 'vcodec', default='unknown').replace(
3383 'none',
3384 'images' if f.get('acodec') == 'none'
3385 else self._format_screen('audio only', self.Styles.SUPPRESS)),
3386 format_field(f, 'vbr', '\t%dk'),
3387 format_field(f, 'acodec', default='unknown').replace(
3388 'none',
3389 '' if f.get('vcodec') == 'none'
3390 else self._format_screen('video only', self.Styles.SUPPRESS)),
3391 format_field(f, 'abr', '\t%dk'),
3392 format_field(f, 'asr', '\t%dHz'),
3393 join_nonempty(
3394 self._format_screen('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None,
3395 format_field(f, 'language', '[%s]'),
3396 join_nonempty(
3397 format_field(f, 'format_note'),
3398 format_field(f, 'container', ignore=(None, f.get('ext'))),
3399 delim=', '),
3400 delim=' '),
3401 ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
3402 header_line = self._list_format_headers(
3403 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', delim, '\tFILESIZE', '\tTBR', 'PROTO',
3404 delim, 'VCODEC', '\tVBR', 'ACODEC', '\tABR', '\tASR', 'MORE INFO')
3405 else:
3406 table = [
3407 [
3408 format_field(f, 'format_id'),
3409 format_field(f, 'ext'),
3410 self.format_resolution(f),
3411 self._format_note(f)]
3412 for f in formats
3413 if f.get('preference') is None or f['preference'] >= -1000]
3414 header_line = ['format code', 'extension', 'resolution', 'note']
3415
3416 self.to_stdout(render_table(
3417 header_line, table,
3418 extra_gap=(0 if new_format else 1),
3419 hide_empty=new_format,
3420 delim=new_format and self._format_screen('\u2500', self.Styles.DELIM, '-', test_encoding=True)))
3421
3422 def list_thumbnails(self, info_dict):
thumbnails = list(info_dict.get('thumbnails') or [])  # May be None, which list() would reject
3424 if not thumbnails:
3425 self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
3426 return
3427
3428 self.to_screen(
3429 '[info] Thumbnails for %s:' % info_dict['id'])
3430 self.to_stdout(render_table(
3431 self._list_format_headers('ID', 'Width', 'Height', 'URL'),
3432 [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
3433
3434 def list_subtitles(self, video_id, subtitles, name='subtitles'):
3435 if not subtitles:
3436 self.to_screen('%s has no %s' % (video_id, name))
3437 return
3438 self.to_screen(
3439 'Available %s for %s:' % (name, video_id))
3440
3441 def _row(lang, formats):
3442 exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats)))
3443 if len(set(names)) == 1:
3444 names = [] if names[0] == 'unknown' else names[:1]
3445 return [lang, ', '.join(names), ', '.join(exts)]
3446
3447 self.to_stdout(render_table(
3448 self._list_format_headers('Language', 'Name', 'Formats'),
3449 [_row(lang, formats) for lang, formats in subtitles.items()],
3450 hide_empty=True))
3451
3452 def urlopen(self, req):
3453 """ Start an HTTP download """
3454 if isinstance(req, compat_basestring):
3455 req = sanitized_Request(req)
3456 return self._opener.open(req, timeout=self._socket_timeout)
3457
3458 def print_debug_header(self):
3459 if not self.params.get('verbose'):
3460 return
3461
3462 def get_encoding(stream):
3463 ret = getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__)
3464 if not supports_terminal_sequences(stream):
3465 from .compat import WINDOWS_VT_MODE
3466 ret += ' (No VT)' if WINDOWS_VT_MODE is False else ' (No ANSI)'
3467 return ret
3468
3469 encoding_str = 'Encodings: locale %s, fs %s, out %s, err %s, pref %s' % (
3470 locale.getpreferredencoding(),
3471 sys.getfilesystemencoding(),
3472 get_encoding(self._screen_file), get_encoding(self._err_file),
3473 self.get_encoding())
3474
3475 logger = self.params.get('logger')
3476 if logger:
3477 write_debug = lambda msg: logger.debug(f'[debug] {msg}')
3478 write_debug(encoding_str)
3479 else:
3480 write_string(f'[debug] {encoding_str}\n', encoding=None)
3481 write_debug = lambda msg: self._write_string(f'[debug] {msg}\n')
3482
3483 source = detect_variant()
3484 write_debug(join_nonempty(
3485 'yt-dlp version', __version__,
3486 f'[{RELEASE_GIT_HEAD}]' if RELEASE_GIT_HEAD else '',
3487 '' if source == 'unknown' else f'({source})',
3488 delim=' '))
3489 if not _LAZY_LOADER:
3490 if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'):
3491 write_debug('Lazy loading extractors is forcibly disabled')
3492 else:
3493 write_debug('Lazy loading extractors is disabled')
3494 if plugin_extractors or plugin_postprocessors:
3495 write_debug('Plugins: %s' % [
3496 '%s%s' % (klass.__name__, '' if klass.__name__ == name else f' as {name}')
3497 for name, klass in itertools.chain(plugin_extractors.items(), plugin_postprocessors.items())])
3498 if self.params.get('compat_opts'):
3499 write_debug('Compatibility options: %s' % ', '.join(self.params.get('compat_opts')))
3500
3501 if source == 'source':
3502 try:
3503 sp = Popen(
3504 ['git', 'rev-parse', '--short', 'HEAD'],
3505 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
3506 cwd=os.path.dirname(os.path.abspath(__file__)))
3507 out, err = sp.communicate_or_kill()
3508 out = out.decode().strip()
3509 if re.match('[0-9a-f]+', out):
3510 write_debug('Git HEAD: %s' % out)
3511 except Exception:
try:
    sys.exc_clear()  # Python 2 only; absent on Python 3, hence the blanket except
except Exception:
    pass
3516
3517 def python_implementation():
3518 impl_name = platform.python_implementation()
3519 if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'):
3520 return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3]
3521 return impl_name
3522
3523 write_debug('Python version %s (%s %s) - %s' % (
3524 platform.python_version(),
3525 python_implementation(),
3526 platform.architecture()[0],
3527 platform_name()))
3528
3529 exe_versions, ffmpeg_features = FFmpegPostProcessor.get_versions_and_features(self)
3530 ffmpeg_features = {key for key, val in ffmpeg_features.items() if val}
3531 if ffmpeg_features:
3532 exe_versions['ffmpeg'] += ' (%s)' % ','.join(ffmpeg_features)
3533
3534 exe_versions['rtmpdump'] = rtmpdump_version()
3535 exe_versions['phantomjs'] = PhantomJSwrapper._version()
3536 exe_str = ', '.join(
3537 f'{exe} {v}' for exe, v in sorted(exe_versions.items()) if v
3538 ) or 'none'
3539 write_debug('exe versions: %s' % exe_str)
3540
3541 from .downloader.websocket import has_websockets
3542 from .postprocessor.embedthumbnail import has_mutagen
3543 from .cookies import SQLITE_AVAILABLE, KEYRING_AVAILABLE
3544
3545 lib_str = join_nonempty(
3546 compat_pycrypto_AES and compat_pycrypto_AES.__name__.split('.')[0],
3547 KEYRING_AVAILABLE and 'keyring',
3548 has_mutagen and 'mutagen',
3549 SQLITE_AVAILABLE and 'sqlite',
3550 has_websockets and 'websockets',
3551 delim=', ') or 'none'
3552 write_debug('Optional libraries: %s' % lib_str)
3553
3554 proxy_map = {}
3555 for handler in self._opener.handlers:
3556 if hasattr(handler, 'proxies'):
3557 proxy_map.update(handler.proxies)
3558 write_debug(f'Proxy map: {proxy_map}')
3559
3560 # Not implemented
3561 if False and self.params.get('call_home'):
3562 ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
3563 write_debug('Public IP address: %s' % ipaddr)
3564 latest_version = self.urlopen(
3565 'https://yt-dl.org/latest/version').read().decode('utf-8')
3566 if version_tuple(latest_version) > version_tuple(__version__):
3567 self.report_warning(
3568 'You are using an outdated version (newest version: %s)! '
3569 'See https://yt-dl.org/update if you need help updating.' %
3570 latest_version)
3571
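# Builds the urllib opener: cookie processor, per-request proxies,
# HTTPS/redirect/data handlers and a FileHandler stub that rejects file://.
# Illustrative proxy mapping (hypothetical value): --proxy socks5://127.0.0.1:1080
# results in {'http': 'socks5://127.0.0.1:1080', 'https': 'socks5://127.0.0.1:1080'}.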
3572 def _setup_opener(self):
3573 timeout_val = self.params.get('socket_timeout')
3574 self._socket_timeout = 20 if timeout_val is None else float(timeout_val)
3575
3576 opts_cookiesfrombrowser = self.params.get('cookiesfrombrowser')
3577 opts_cookiefile = self.params.get('cookiefile')
3578 opts_proxy = self.params.get('proxy')
3579
3580 self.cookiejar = load_cookies(opts_cookiefile, opts_cookiesfrombrowser, self)
3581
3582 cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
3583 if opts_proxy is not None:
3584 if opts_proxy == '':
3585 proxies = {}
3586 else:
3587 proxies = {'http': opts_proxy, 'https': opts_proxy}
3588 else:
3589 proxies = compat_urllib_request.getproxies()
3590 # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805)
3591 if 'http' in proxies and 'https' not in proxies:
3592 proxies['https'] = proxies['http']
3593 proxy_handler = PerRequestProxyHandler(proxies)
3594
3595 debuglevel = 1 if self.params.get('debug_printtraffic') else 0
3596 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
3597 ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
3598 redirect_handler = YoutubeDLRedirectHandler()
3599 data_handler = compat_urllib_request_DataHandler()
3600
3601 # When passing our own FileHandler instance, build_opener won't add the
3602 # default FileHandler and allows us to disable the file protocol, which
3603 # can be used for malicious purposes (see
3604 # https://github.com/ytdl-org/youtube-dl/issues/8227)
3605 file_handler = compat_urllib_request.FileHandler()
3606
3607 def file_open(*args, **kwargs):
3608 raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in yt-dlp for security reasons')
3609 file_handler.file_open = file_open
3610
3611 opener = compat_urllib_request.build_opener(
3612 proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)
3613
3614 # Delete the default user-agent header, which would otherwise apply in
3615 # cases where our custom HTTP handler doesn't come into play
3616 # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
3617 opener.addheaders = []
3618 self._opener = opener
3619
3620 def encode(self, s):
3621 if isinstance(s, bytes):
3622 return s # Already encoded
3623
3624 try:
3625 return s.encode(self.get_encoding())
3626 except UnicodeEncodeError as err:
3627 err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
3628 raise
3629
3630 def get_encoding(self):
3631 encoding = self.params.get('encoding')
3632 if encoding is None:
3633 encoding = preferredencoding()
3634 return encoding
3635
3636 def _write_info_json(self, label, ie_result, infofn, overwrite=None):
''' Write infojson and return True = written, False = skipped, None = error '''
3638 if overwrite is None:
3639 overwrite = self.params.get('overwrites', True)
3640 if not self.params.get('writeinfojson'):
3641 return False
3642 elif not infofn:
3643 self.write_debug(f'Skipping writing {label} infojson')
3644 return False
3645 elif not self._ensure_dir_exists(infofn):
3646 return None
3647 elif not overwrite and os.path.exists(infofn):
3648 self.to_screen(f'[info] {label.title()} metadata is already present')
3649 else:
3650 self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}')
3651 try:
3652 write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn)
3653 except (OSError, IOError):
3654 self.report_error(f'Cannot write {label} metadata to JSON file {infofn}')
3655 return None
3656 return True
3657
3658 def _write_description(self, label, ie_result, descfn):
''' Write description and return True = written, False = skipped, None = error '''
3660 if not self.params.get('writedescription'):
3661 return False
3662 elif not descfn:
3663 self.write_debug(f'Skipping writing {label} description')
3664 return False
3665 elif not self._ensure_dir_exists(descfn):
3666 return None
3667 elif not self.params.get('overwrites', True) and os.path.exists(descfn):
3668 self.to_screen(f'[info] {label.title()} description is already present')
3669 elif ie_result.get('description') is None:
3670 self.report_warning(f'There\'s no {label} description to write')
3671 return False
3672 else:
3673 try:
3674 self.to_screen(f'[info] Writing {label} description to: {descfn}')
3675 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
3676 descfile.write(ie_result['description'])
3677 except (OSError, IOError):
3678 self.report_error(f'Cannot write {label} description file {descfn}')
3679 return None
3680 return True
3681
3682 def _write_subtitles(self, info_dict, filename):
''' Write subtitles to file and return a list of (sub_filename, final_sub_filename); or None on error '''
3684 ret = []
3685 subtitles = info_dict.get('requested_subtitles')
3686 if not subtitles or not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')):
# Subtitle download errors are already handled in the relevant IE,
# so this silently continues when used with an IE that lacks subtitle support
3689 return ret
3690
3691 sub_filename_base = self.prepare_filename(info_dict, 'subtitle')
3692 if not sub_filename_base:
3693 self.to_screen('[info] Skipping writing video subtitles')
3694 return ret
3695 for sub_lang, sub_info in subtitles.items():
3696 sub_format = sub_info['ext']
3697 sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
3698 sub_filename_final = subtitles_filename(sub_filename_base, sub_lang, sub_format, info_dict.get('ext'))
3699 if not self.params.get('overwrites', True) and os.path.exists(sub_filename):
3700 self.to_screen(f'[info] Video subtitle {sub_lang}.{sub_format} is already present')
3701 sub_info['filepath'] = sub_filename
3702 ret.append((sub_filename, sub_filename_final))
3703 continue
3704
3705 self.to_screen(f'[info] Writing video subtitles to: {sub_filename}')
3706 if sub_info.get('data') is not None:
3707 try:
3708 # Use newline='' to prevent conversion of newline characters
3709 # See https://github.com/ytdl-org/youtube-dl/issues/10268
3710 with io.open(sub_filename, 'w', encoding='utf-8', newline='') as subfile:
3711 subfile.write(sub_info['data'])
3712 sub_info['filepath'] = sub_filename
3713 ret.append((sub_filename, sub_filename_final))
3714 continue
3715 except (OSError, IOError):
3716 self.report_error(f'Cannot write video subtitles file {sub_filename}')
3717 return None
3718
3719 try:
3720 sub_copy = sub_info.copy()
3721 sub_copy.setdefault('http_headers', info_dict.get('http_headers'))
3722 self.dl(sub_filename, sub_copy, subtitle=True)
3723 sub_info['filepath'] = sub_filename
3724 ret.append((sub_filename, sub_filename_final))
3725 except (ExtractorError, IOError, OSError, ValueError) + network_exceptions as err:
3726 self.report_warning(f'Unable to download video subtitles for {sub_lang!r}: {err}')
3727 continue
3728 return ret
3729
3730 def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None):
3731 ''' Write thumbnails to file and return list of (thumb_filename, final_thumb_filename) '''
3732 write_all = self.params.get('write_all_thumbnails', False)
3733 thumbnails, ret = [], []
3734 if write_all or self.params.get('writethumbnail', False):
3735 thumbnails = info_dict.get('thumbnails') or []
3736 multiple = write_all and len(thumbnails) > 1
3737
3738 if thumb_filename_base is None:
3739 thumb_filename_base = filename
3740 if thumbnails and not thumb_filename_base:
3741 self.write_debug(f'Skipping writing {label} thumbnail')
3742 return ret
3743
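# The thumbnail list is ordered from lowest to highest preference, so iterate
# in reverse: unless all thumbnails are being written, the best one that
# actually downloads wins, and URLs that fail are dropped from the list.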
3744 for idx, t in list(enumerate(thumbnails))[::-1]:
3745 thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg')
3746 thumb_display_id = f'{label} thumbnail {t["id"]}'
3747 thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext'))
3748 thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext'))
3749
3750 if not self.params.get('overwrites', True) and os.path.exists(thumb_filename):
3751 ret.append((thumb_filename, thumb_filename_final))
3752 t['filepath'] = thumb_filename
3753 self.to_screen('[info] %s is already present' % (
3754 thumb_display_id if multiple else f'{label} thumbnail').capitalize())
3755 else:
3756 self.to_screen(f'[info] Downloading {thumb_display_id} ...')
3757 try:
3758 uf = self.urlopen(t['url'])
3759 self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}')
3760 with open(encodeFilename(thumb_filename), 'wb') as thumbf:
3761 shutil.copyfileobj(uf, thumbf)
3762 ret.append((thumb_filename, thumb_filename_final))
3763 t['filepath'] = thumb_filename
3764 except network_exceptions as err:
3765 thumbnails.pop(idx)
3766 self.report_warning(f'Unable to download {thumb_display_id}: {err}')
3767 if ret and not write_all:
3768 break
3769 return ret