]> jfr.im git - yt-dlp.git/blame_incremental - yt_dlp/YoutubeDL.py
Improved progress reporting (See desc) (#1125)
[yt-dlp.git] / yt_dlp / YoutubeDL.py
... / ...
CommitLineData
1#!/usr/bin/env python3
2# coding: utf-8
3
4from __future__ import absolute_import, unicode_literals
5
6import collections
7import contextlib
8import copy
9import datetime
10import errno
11import fileinput
12import io
13import itertools
14import json
15import locale
16import operator
17import os
18import platform
19import re
20import shutil
21import subprocess
22import sys
23import tempfile
24import time
25import tokenize
26import traceback
27import random
28import unicodedata
29
30from string import ascii_letters
31
32from .compat import (
33 compat_basestring,
34 compat_get_terminal_size,
35 compat_kwargs,
36 compat_numeric_types,
37 compat_os_name,
38 compat_pycrypto_AES,
39 compat_shlex_quote,
40 compat_str,
41 compat_tokenize_tokenize,
42 compat_urllib_error,
43 compat_urllib_request,
44 compat_urllib_request_DataHandler,
45 windows_enable_vt_mode,
46)
47from .cookies import load_cookies
48from .utils import (
49 age_restricted,
50 args_to_str,
51 ContentTooShortError,
52 date_from_str,
53 DateRange,
54 DEFAULT_OUTTMPL,
55 determine_ext,
56 determine_protocol,
57 DOT_DESKTOP_LINK_TEMPLATE,
58 DOT_URL_LINK_TEMPLATE,
59 DOT_WEBLOC_LINK_TEMPLATE,
60 DownloadError,
61 encode_compat_str,
62 encodeFilename,
63 EntryNotInPlaylist,
64 error_to_compat_str,
65 ExistingVideoReached,
66 expand_path,
67 ExtractorError,
68 float_or_none,
69 format_bytes,
70 format_field,
71 formatSeconds,
72 GeoRestrictedError,
73 HEADRequest,
74 int_or_none,
75 iri_to_uri,
76 ISO3166Utils,
77 LazyList,
78 locked_file,
79 make_dir,
80 make_HTTPS_handler,
81 MaxDownloadsReached,
82 network_exceptions,
83 orderedSet,
84 OUTTMPL_TYPES,
85 PagedList,
86 parse_filesize,
87 PerRequestProxyHandler,
88 platform_name,
89 PostProcessingError,
90 preferredencoding,
91 prepend_extension,
92 process_communicate_or_kill,
93 register_socks_protocols,
94 RejectedVideoReached,
95 render_table,
96 replace_extension,
97 SameFileError,
98 sanitize_filename,
99 sanitize_path,
100 sanitize_url,
101 sanitized_Request,
102 std_headers,
103 STR_FORMAT_RE_TMPL,
104 STR_FORMAT_TYPES,
105 str_or_none,
106 strftime_or_none,
107 subtitles_filename,
108 supports_terminal_sequences,
109 TERMINAL_SEQUENCES,
110 ThrottledDownload,
111 to_high_limit_path,
112 traverse_obj,
113 try_get,
114 UnavailableVideoError,
115 url_basename,
116 variadic,
117 version_tuple,
118 write_json_file,
119 write_string,
120 YoutubeDLCookieProcessor,
121 YoutubeDLHandler,
122 YoutubeDLRedirectHandler,
123)
124from .cache import Cache
125from .extractor import (
126 gen_extractor_classes,
127 get_info_extractor,
128 _LAZY_LOADER,
129 _PLUGIN_CLASSES as plugin_extractors
130)
131from .extractor.openload import PhantomJSwrapper
132from .downloader import (
133 FFmpegFD,
134 get_suitable_downloader,
135 shorten_protocol_name
136)
137from .downloader.rtmp import rtmpdump_version
138from .postprocessor import (
139 get_postprocessor,
140 FFmpegFixupDurationPP,
141 FFmpegFixupM3u8PP,
142 FFmpegFixupM4aPP,
143 FFmpegFixupStretchedPP,
144 FFmpegFixupTimestampPP,
145 FFmpegMergerPP,
146 FFmpegPostProcessor,
147 MoveFilesAfterDownloadPP,
148 _PLUGIN_CLASSES as plugin_postprocessors
149)
150from .update import detect_variant
151from .version import __version__
152
153if compat_os_name == 'nt':
154 import ctypes
155
156
157class YoutubeDL(object):
158 """YoutubeDL class.
159
160 YoutubeDL objects are the ones responsible of downloading the
161 actual video file and writing it to disk if the user has requested
162 it, among some other tasks. In most cases there should be one per
163 program. As, given a video URL, the downloader doesn't know how to
164 extract all the needed information, task that InfoExtractors do, it
165 has to pass the URL to one of them.
166
167 For this, YoutubeDL objects have a method that allows
168 InfoExtractors to be registered in a given order. When it is passed
169 a URL, the YoutubeDL object handles it to the first InfoExtractor it
170 finds that reports being able to handle it. The InfoExtractor extracts
171 all the information about the video or videos the URL refers to, and
172 YoutubeDL process the extracted information, possibly using a File
173 Downloader to download the video.
174
175 YoutubeDL objects accept a lot of parameters. In order not to saturate
176 the object constructor with arguments, it receives a dictionary of
177 options instead. These options are available through the params
178 attribute for the InfoExtractors to use. The YoutubeDL also
179 registers itself as the downloader in charge for the InfoExtractors
180 that are added to it, so this is a "mutual registration".
181
182 Available options:
183
184 username: Username for authentication purposes.
185 password: Password for authentication purposes.
186 videopassword: Password for accessing a video.
187 ap_mso: Adobe Pass multiple-system operator identifier.
188 ap_username: Multiple-system operator account username.
189 ap_password: Multiple-system operator account password.
190 usenetrc: Use netrc for authentication instead.
191 verbose: Print additional info to stdout.
192 quiet: Do not print messages to stdout.
193 no_warnings: Do not print out anything for warnings.
194 forceprint: A list of templates to force print
195 forceurl: Force printing final URL. (Deprecated)
196 forcetitle: Force printing title. (Deprecated)
197 forceid: Force printing ID. (Deprecated)
198 forcethumbnail: Force printing thumbnail URL. (Deprecated)
199 forcedescription: Force printing description. (Deprecated)
200 forcefilename: Force printing final filename. (Deprecated)
201 forceduration: Force printing duration. (Deprecated)
202 forcejson: Force printing info_dict as JSON.
203 dump_single_json: Force printing the info_dict of the whole playlist
204 (or video) as a single JSON line.
205 force_write_download_archive: Force writing download archive regardless
206 of 'skip_download' or 'simulate'.
207 simulate: Do not download the video files. If unset (or None),
208 simulate only if listsubtitles, listformats or list_thumbnails is used
209 format: Video format code. see "FORMAT SELECTION" for more details.
210 allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded.
    ignore_no_formats_error: Ignore "No video formats" error. Useful for
212 extracting metadata even if the video is not actually
213 available for download (experimental)
214 format_sort: How to sort the video formats. see "Sorting Formats"
215 for more details.
216 format_sort_force: Force the given format_sort. see "Sorting Formats"
217 for more details.
218 allow_multiple_video_streams: Allow multiple video streams to be merged
219 into a single file
220 allow_multiple_audio_streams: Allow multiple audio streams to be merged
221 into a single file
222 check_formats Whether to test if the formats are downloadable.
223 Can be True (check all), False (check none)
224 or None (check only if requested by extractor)
225 paths: Dictionary of output paths. The allowed keys are 'home'
226 'temp' and the keys of OUTTMPL_TYPES (in utils.py)
227 outtmpl: Dictionary of templates for output names. Allowed keys
228 are 'default' and the keys of OUTTMPL_TYPES (in utils.py).
229 For compatibility with youtube-dl, a single string can also be used
230 outtmpl_na_placeholder: Placeholder for unavailable meta fields.
231 restrictfilenames: Do not allow "&" and spaces in file names
232 trim_file_name: Limit length of filename (extension excluded)
233 windowsfilenames: Force the filenames to be windows compatible
234 ignoreerrors: Do not stop on download/postprocessing errors.
235 Can be 'only_download' to ignore only download errors.
236 Default is 'only_download' for CLI, but False for API
237 skip_playlist_after_errors: Number of allowed failures until the rest of
238 the playlist is skipped
239 force_generic_extractor: Force downloader to use the generic extractor
240 overwrites: Overwrite all video and metadata files if True,
241 overwrite only non-video files if None
242 and don't overwrite any file if False
243 For compatibility with youtube-dl,
244 "nooverwrites" may also be used instead
245 playliststart: Playlist item to start at.
246 playlistend: Playlist item to end at.
247 playlist_items: Specific indices of playlist to download.
248 playlistreverse: Download playlist items in reverse order.
249 playlistrandom: Download playlist items in random order.
250 matchtitle: Download only matching titles.
251 rejecttitle: Reject downloads for matching titles.
252 logger: Log messages to a logging.Logger instance.
253 logtostderr: Log messages to stderr instead of stdout.
254 consoletitle: Display progress in console window's titlebar.
255 writedescription: Write the video description to a .description file
256 writeinfojson: Write the video description to a .info.json file
257 clean_infojson: Remove private fields from the infojson
258 getcomments: Extract video comments. This will not be written to disk
259 unless writeinfojson is also given
260 writeannotations: Write the video annotations to a .annotations.xml file
261 writethumbnail: Write the thumbnail image to a file
262 allow_playlist_files: Whether to write playlists' description, infojson etc
263 also to disk when using the 'write*' options
264 write_all_thumbnails: Write all thumbnail formats to files
265 writelink: Write an internet shortcut file, depending on the
266 current platform (.url/.webloc/.desktop)
267 writeurllink: Write a Windows internet shortcut file (.url)
268 writewebloclink: Write a macOS internet shortcut file (.webloc)
269 writedesktoplink: Write a Linux internet shortcut file (.desktop)
270 writesubtitles: Write the video subtitles to a file
271 writeautomaticsub: Write the automatically generated subtitles to a file
272 allsubtitles: Deprecated - Use subtitleslangs = ['all']
273 Downloads all the subtitles of the video
274 (requires writesubtitles or writeautomaticsub)
275 listsubtitles: Lists all available subtitles for the video
276 subtitlesformat: The format code for subtitles
277 subtitleslangs: List of languages of the subtitles to download (can be regex).
278 The list may contain "all" to refer to all the available
279 subtitles. The language can be prefixed with a "-" to
280 exclude it from the requested languages. Eg: ['all', '-live_chat']
281 keepvideo: Keep the video file after post-processing
282 daterange: A DateRange object, download only if the upload_date is in the range.
283 skip_download: Skip the actual download of the video file
284 cachedir: Location of the cache files in the filesystem.
285 False to disable filesystem cache.
286 noplaylist: Download single video instead of a playlist if in doubt.
287 age_limit: An integer representing the user's age in years.
288 Unsuitable videos for the given age are skipped.
289 min_views: An integer representing the minimum view count the video
290 must have in order to not be skipped.
291 Videos without view count information are always
292 downloaded. None for no limit.
293 max_views: An integer representing the maximum view count.
294 Videos that are more popular than that are not
295 downloaded.
296 Videos without view count information are always
297 downloaded. None for no limit.
298 download_archive: File name of a file where all downloads are recorded.
299 Videos already present in the file are not downloaded
300 again.
301 break_on_existing: Stop the download process after attempting to download a
302 file that is in the archive.
303 break_on_reject: Stop the download process when encountering a video that
304 has been filtered out.
305 cookiefile: File name where cookies should be read from and dumped to
306 cookiesfrombrowser: A tuple containing the name of the browser and the profile
307 name/path from where cookies are loaded.
                       Eg: ('chrome', ) or ('vivaldi', 'default')
309 nocheckcertificate:Do not verify SSL certificates
310 prefer_insecure: Use HTTP instead of HTTPS to retrieve information.
311 At the moment, this is only supported by YouTube.
312 proxy: URL of the proxy server to use
313 geo_verification_proxy: URL of the proxy to use for IP address verification
314 on geo-restricted sites.
315 socket_timeout: Time to wait for unresponsive hosts, in seconds
316 bidi_workaround: Work around buggy terminals without bidirectional text
                       support, using fribidi
318 debug_printtraffic:Print out sent and received HTTP traffic
319 include_ads: Download ads as well
320 default_search: Prepend this string if an input url is not valid.
321 'auto' for elaborate guessing
322 encoding: Use this encoding instead of the system-specified.
323 extract_flat: Do not resolve URLs, return the immediate result.
324 Pass in 'in_playlist' to only show this behavior for
325 playlist items.
326 postprocessors: A list of dictionaries, each with an entry
327 * key: The name of the postprocessor. See
328 yt_dlp/postprocessor/__init__.py for a list.
329 * when: When to run the postprocessor. Can be one of
330 pre_process|before_dl|post_process|after_move.
331 Assumed to be 'post_process' if not given
332 post_hooks: A list of functions that get called as the final step
333 for each video file, after all postprocessors have been
334 called. The filename will be passed as the only argument.
335 progress_hooks: A list of functions that get called on download
336 progress, with a dictionary with the entries
337 * status: One of "downloading", "error", or "finished".
338 Check this first and ignore unknown values.
339 * info_dict: The extracted info_dict
340
341 If status is one of "downloading", or "finished", the
342 following properties may also be present:
343 * filename: The final filename (always present)
344 * tmpfilename: The filename we're currently writing to
345 * downloaded_bytes: Bytes on disk
346 * total_bytes: Size of the whole file, None if unknown
347 * total_bytes_estimate: Guess of the eventual file size,
348 None if unavailable.
349 * elapsed: The number of seconds since download started.
350 * eta: The estimated time in seconds, None if unknown
351 * speed: The download speed in bytes/second, None if
352 unknown
353 * fragment_index: The counter of the currently
354 downloaded video fragment.
355 * fragment_count: The number of fragments (= individual
356 files that will be merged)
357
358 Progress hooks are guaranteed to be called at least once
359 (with status "finished") if the download is successful.
360 postprocessor_hooks: A list of functions that get called on postprocessing
361 progress, with a dictionary with the entries
362 * status: One of "started", "processing", or "finished".
363 Check this first and ignore unknown values.
364 * postprocessor: Name of the postprocessor
365 * info_dict: The extracted info_dict
366
367 Progress hooks are guaranteed to be called at least twice
368 (with status "started" and "finished") if the processing is successful.
369 merge_output_format: Extension to use when merging formats.
370 final_ext: Expected final extension; used to detect when the file was
371 already downloaded and converted. "merge_output_format" is
372 replaced by this extension when given
373 fixup: Automatically correct known faults of the file.
374 One of:
375 - "never": do nothing
376 - "warn": only emit a warning
377 - "detect_or_warn": check whether we can do anything
378 about it, warn otherwise (default)
379 source_address: Client-side IP address to bind to.
380 call_home: Boolean, true iff we are allowed to contact the
381 yt-dlp servers for debugging. (BROKEN)
382 sleep_interval_requests: Number of seconds to sleep between requests
383 during extraction
384 sleep_interval: Number of seconds to sleep before each download when
385 used alone or a lower bound of a range for randomized
386 sleep before each download (minimum possible number
387 of seconds to sleep) when used along with
388 max_sleep_interval.
389 max_sleep_interval:Upper bound of a range for randomized sleep before each
390 download (maximum possible number of seconds to sleep).
391 Must only be used along with sleep_interval.
392 Actual sleep time will be a random float from range
393 [sleep_interval; max_sleep_interval].
394 sleep_interval_subtitles: Number of seconds to sleep before each subtitle download
395 listformats: Print an overview of available video formats and exit.
396 list_thumbnails: Print a table of all thumbnails and exit.
397 match_filter: A function that gets called with the info_dict of
398 every video.
399 If it returns a message, the video is ignored.
400 If it returns None, the video is downloaded.
401 match_filter_func in utils.py is one example for this.
402 no_color: Do not emit color codes in output.
403 geo_bypass: Bypass geographic restriction via faking X-Forwarded-For
404 HTTP header
405 geo_bypass_country:
406 Two-letter ISO 3166-2 country code that will be used for
407 explicit geographic restriction bypassing via faking
408 X-Forwarded-For HTTP header
409 geo_bypass_ip_block:
410 IP range in CIDR notation that will be used similarly to
411 geo_bypass_country
412
413 The following options determine which downloader is picked:
414 external_downloader: A dictionary of protocol keys and the executable of the
415 external downloader to use for it. The allowed protocols
416 are default|http|ftp|m3u8|dash|rtsp|rtmp|mms.
417 Set the value to 'native' to use the native downloader
418 hls_prefer_native: Deprecated - Use external_downloader = {'m3u8': 'native'}
419 or {'m3u8': 'ffmpeg'} instead.
420 Use the native HLS downloader instead of ffmpeg/avconv
421 if True, otherwise use ffmpeg/avconv if False, otherwise
422 use downloader suggested by extractor if None.
423 compat_opts: Compatibility options. See "Differences in default behavior".
424 The following options do not work when used through the API:
425 filename, abort-on-error, multistreams, no-live-chat,
426 no-clean-infojson, no-playlist-metafiles, no-keep-subs.
427 Refer __init__.py for their implementation
428 progress_template: Dictionary of templates for progress outputs.
429 Allowed keys are 'download', 'postprocess',
430 'download-title' (console title) and 'postprocess-title'.
431 The template is mapped on a dictionary with keys 'progress' and 'info'
432
433 The following parameters are not used by YoutubeDL itself, they are used by
434 the downloader (see yt_dlp/downloader/common.py):
435 nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize,
436 max_filesize, test, noresizebuffer, retries, continuedl, noprogress,
437 xattr_set_filesize, external_downloader_args, hls_use_mpegts, http_chunk_size.
438
439 The following options are used by the post processors:
440 prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available,
441 otherwise prefer ffmpeg. (avconv support is deprecated)
442 ffmpeg_location: Location of the ffmpeg/avconv binary; either the path
443 to the binary or its containing directory.
444 postprocessor_args: A dictionary of postprocessor/executable keys (in lower case)
445 and a list of additional command-line arguments for the
446 postprocessor/executable. The dict can also have "PP+EXE" keys
447 which are used when the given exe is used by the given PP.
448 Use 'default' as the name for arguments to passed to all PP
449 For compatibility with youtube-dl, a single list of args
450 can also be used
451
452 The following options are used by the extractors:
453 extractor_retries: Number of times to retry for known errors
454 dynamic_mpd: Whether to process dynamic DASH manifests (default: True)
455 hls_split_discontinuity: Split HLS playlists to different formats at
456 discontinuities such as ad breaks (default: False)
457 extractor_args: A dictionary of arguments to be passed to the extractors.
458 See "EXTRACTOR ARGUMENTS" for details.
459 Eg: {'youtube': {'skip': ['dash', 'hls']}}
460 youtube_include_dash_manifest: Deprecated - Use extractor_args instead.
461 If True (default), DASH manifests and related
462 data will be downloaded and processed by extractor.
463 You can reduce network I/O by disabling it if you don't
464 care about DASH. (only for youtube)
465 youtube_include_hls_manifest: Deprecated - Use extractor_args instead.
466 If True (default), HLS manifests and related
467 data will be downloaded and processed by extractor.
468 You can reduce network I/O by disabling it if you don't
469 care about HLS. (only for youtube)
470 """
471
    # info_dict fields whose values must be numeric (int/float); used when
    # sanitizing extracted metadata and when formatting output templates.
    _NUMERIC_FIELDS = set((
        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
        'timestamp', 'release_timestamp',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
    ))

    # Class-level placeholders; real per-instance values are assigned in __init__.
    params = None  # the options dictionary
    _ies = {}  # registered InfoExtractor classes/instances, keyed by ie_key
    _pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}  # postprocessor chains per stage
    _printed_messages = set()  # messages already emitted (for only_once deduplication)
    _first_webpage_request = True
    _download_retcode = None
    _num_downloads = None
    _playlist_level = 0  # current nesting depth of playlist extraction
    _playlist_urls = set()  # guards against recursing into the same playlist
    _screen_file = None
492
    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options.

        @param params     Dictionary of options (see the class docstring).
                          Mutated in place (e.g. 'no_color', 'restrictfilenames').
        @param auto_init  Whether to print the debug header and register the
                          default info extractors immediately.
        """
        if params is None:
            params = {}
        # Per-instance state; shadows the class-level placeholders.
        self._ies = {}
        self._ies_instances = {}
        self._pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
        self._printed_messages = set()
        self._first_webpage_request = True
        self._post_hooks = []
        self._progress_hooks = []
        self._postprocessor_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # Screen output goes to stderr when 'logtostderr' is set (bool indexes the pair).
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = params
        self.cache = Cache(self)

        # Enable ANSI escape processing on Windows consoles, then disable
        # color output if the error stream cannot render escape sequences.
        windows_enable_vt_mode()
        self.params['no_color'] = self.params.get('no_color') or not supports_terminal_sequences(self._err_file)

        if sys.version_info < (3, 6):
            self.report_warning(
                'Python version %d.%d is not supported! Please update to Python 3.6 or above' % sys.version_info[:2])

        if self.params.get('allow_unplayable_formats'):
            self.report_warning(
                f'You have asked for {self._color_text("unplayable formats", "blue")} to be listed/downloaded. '
                'This is a developer option intended for debugging. \n'
                '         If you experience any issues while using this option, '
                f'{self._color_text("DO NOT", "red")} open a bug report')

        def check_deprecated(param, option, suggestion):
            # Warn about a deprecated option; returns True if it was actually set.
            if self.params.get(param) is not None:
                self.report_warning('%s is deprecated. Use %s instead' % (option, suggestion))
                return True
            return False

        # Map the deprecated cn_verification_proxy onto geo_verification_proxy
        # unless the latter was given explicitly.
        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
        check_deprecated('useid', '--id', '-o "%(id)s.%(ext)s"')

        # Warnings queued by the option parser before the logger existed.
        for msg in self.params.get('warnings', []):
            self.report_warning(msg)

        # Keep 'overwrites' and the legacy 'nooverwrites' keys consistent.
        if self.params.get('overwrites') is None:
            self.params.pop('overwrites', None)
        elif self.params.get('nooverwrites') is not None:
            # nooverwrites was unnecessarily changed to overwrites
            # in 0c3d0f51778b153f65c21906031c2e091fcfb641
            # This ensures compatibility with both keys
            self.params['overwrites'] = not self.params['nooverwrites']
        else:
            self.params['nooverwrites'] = not self.params['overwrites']

        # Spawn a bidiv/fribidi subprocess through a pty so that output for
        # right-to-left scripts is reordered before reaching the terminal.
        if params.get('bidi_workaround', False):
            try:
                import pty
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    # Fall back to fribidi when bidiv is not installed.
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.platform != 'win32'
                and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
                and not params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        self.outtmpl_dict = self.parse_outtmpl()

        # Creating format selector here allows us to catch syntax errors before the extraction
        self.format_selector = (
            None if self.params.get('format') is None
            else self.build_format_selector(self.params['format']))

        self._setup_opener()

        def preload_download_archive(fn):
            """Preload the archive, if any is specified"""
            if fn is None:
                return False
            self.write_debug('Loading archive file %r\n' % fn)
            try:
                with locked_file(fn, 'r', encoding='utf-8') as archive_file:
                    for line in archive_file:
                        self.archive.add(line.strip())
            except IOError as ioe:
                # A missing archive file is fine; anything else is a real error.
                if ioe.errno != errno.ENOENT:
                    raise
                return False
            return True

        self.archive = set()
        preload_download_archive(self.params.get('download_archive'))

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        # Instantiate and register the configured postprocessors.
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_def = dict(pp_def_raw)
            when = pp_def.pop('when', 'post_process')
            pp_class = get_postprocessor(pp_def.pop('key'))
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp, when=when)

        for ph in self.params.get('post_hooks', []):
            self.add_post_hook(ph)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)

        register_socks_protocols()
635
636 def warn_if_short_id(self, argv):
637 # short YouTube ID starting with dash?
638 idxs = [
639 i for i, a in enumerate(argv)
640 if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
641 if idxs:
642 correct_argv = (
643 ['yt-dlp']
644 + [a for i, a in enumerate(argv) if i not in idxs]
645 + ['--'] + [argv[i] for i in idxs]
646 )
647 self.report_warning(
648 'Long argument string detected. '
649 'Use -- to separate parameters and URLs, like this:\n%s\n' %
650 args_to_str(correct_argv))
651
652 def add_info_extractor(self, ie):
653 """Add an InfoExtractor object to the end of the list."""
654 ie_key = ie.ie_key()
655 self._ies[ie_key] = ie
656 if not isinstance(ie, type):
657 self._ies_instances[ie_key] = ie
658 ie.set_downloader(self)
659
660 def _get_info_extractor_class(self, ie_key):
661 ie = self._ies.get(ie_key)
662 if ie is None:
663 ie = get_info_extractor(ie_key)
664 self.add_info_extractor(ie)
665 return ie
666
667 def get_info_extractor(self, ie_key):
668 """
669 Get an instance of an IE with name ie_key, it will try to get one from
670 the _ies list, if there's no instance it will create a new one and add
671 it to the extractor list.
672 """
673 ie = self._ies_instances.get(ie_key)
674 if ie is None:
675 ie = get_info_extractor(ie_key)()
676 self.add_info_extractor(ie)
677 return ie
678
679 def add_default_info_extractors(self):
680 """
681 Add the InfoExtractors returned by gen_extractors to the end of the list
682 """
683 for ie in gen_extractor_classes():
684 self.add_info_extractor(ie)
685
686 def add_post_processor(self, pp, when='post_process'):
687 """Add a PostProcessor object to the end of the chain."""
688 self._pps[when].append(pp)
689 pp.set_downloader(self)
690
691 def add_post_hook(self, ph):
692 """Add the post hook"""
693 self._post_hooks.append(ph)
694
695 def add_progress_hook(self, ph):
696 """Add the download progress hook"""
697 self._progress_hooks.append(ph)
698
699 def add_postprocessor_hook(self, ph):
700 """Add the postprocessing progress hook"""
701 self._postprocessor_hooks.append(ph)
702
703 def _bidi_workaround(self, message):
704 if not hasattr(self, '_output_channel'):
705 return message
706
707 assert hasattr(self, '_output_process')
708 assert isinstance(message, compat_str)
709 line_count = message.count('\n') + 1
710 self._output_process.stdin.write((message + '\n').encode('utf-8'))
711 self._output_process.stdin.flush()
712 res = ''.join(self._output_channel.readline().decode('utf-8')
713 for _ in range(line_count))
714 return res[:-len('\n')]
715
716 def _write_string(self, message, out=None, only_once=False):
717 if only_once:
718 if message in self._printed_messages:
719 return
720 self._printed_messages.add(message)
721 write_string(message, out=out, encoding=self.params.get('encoding'))
722
723 def to_stdout(self, message, skip_eol=False, quiet=False):
724 """Print message to stdout"""
725 if self.params.get('logger'):
726 self.params['logger'].debug(message)
727 elif not quiet or self.params.get('verbose'):
728 self._write_string(
729 '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')),
730 self._err_file if quiet else self._screen_file)
731
732 def to_stderr(self, message, only_once=False):
733 """Print message to stderr"""
734 assert isinstance(message, compat_str)
735 if self.params.get('logger'):
736 self.params['logger'].error(message)
737 else:
738 self._write_string('%s\n' % self._bidi_workaround(message), self._err_file, only_once=only_once)
739
    def to_console_title(self, message):
        # Set the console/terminal window title; no-op unless the
        # 'consoletitle' option is enabled.
        if not self.params.get('consoletitle', False):
            return
        if compat_os_name == 'nt':
            # Only act when an actual console window exists (e.g. not when
            # output is redirected).
            if ctypes.windll.kernel32.GetConsoleWindow():
                # c_wchar_p() might not be necessary if `message` is
                # already of type unicode()
                ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # OSC 0 escape sequence: sets the terminal's icon name and title.
            self._write_string('\033]0;%s\007' % message, self._screen_file)
750
751 def save_console_title(self):
752 if not self.params.get('consoletitle', False):
753 return
754 if self.params.get('simulate'):
755 return
756 if compat_os_name != 'nt' and 'TERM' in os.environ:
757 # Save the title on stack
758 self._write_string('\033[22;0t', self._screen_file)
759
760 def restore_console_title(self):
761 if not self.params.get('consoletitle', False):
762 return
763 if self.params.get('simulate'):
764 return
765 if compat_os_name != 'nt' and 'TERM' in os.environ:
766 # Restore the title from stack
767 self._write_string('\033[23;0t', self._screen_file)
768
769 def __enter__(self):
770 self.save_console_title()
771 return self
772
773 def __exit__(self, *args):
774 self.restore_console_title()
775
776 if self.params.get('cookiefile') is not None:
777 self.cookiejar.save(ignore_discard=True, ignore_expires=True)
778
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # Some wrapped exceptions (e.g. DownloadError) carry the
                    # original exc_info of their cause; include it first.
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    # Not inside an except block: show the current call stack instead.
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            if tb:
                self.to_stderr(tb)
        if not self.params.get('ignoreerrors'):
            # Prefer the wrapped exception's original exc_info when available,
            # so DownloadError reports the root cause.
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        # Errors are being ignored: record failure via the process return code.
        self._download_retcode = 1
809
810 def to_screen(self, message, skip_eol=False):
811 """Print message to stdout if not in quiet mode"""
812 self.to_stdout(
813 message, skip_eol, quiet=self.params.get('quiet', False))
814
815 def _color_text(self, text, color):
816 if self.params.get('no_color'):
817 return text
818 return f'{TERMINAL_SEQUENCES[color.upper()]}{text}{TERMINAL_SEQUENCES["RESET_STYLE"]}'
819
820 def report_warning(self, message, only_once=False):
821 '''
822 Print the message to stderr, it will be prefixed with 'WARNING:'
823 If stderr is a tty file the 'WARNING:' will be colored
824 '''
825 if self.params.get('logger') is not None:
826 self.params['logger'].warning(message)
827 else:
828 if self.params.get('no_warnings'):
829 return
830 self.to_stderr(f'{self._color_text("WARNING:", "yellow")} {message}', only_once)
831
832 def report_error(self, message, tb=None):
833 '''
834 Do the same as trouble, but prefixes the message with 'ERROR:', colored
835 in red if stderr is a tty file.
836 '''
837 self.trouble(f'{self._color_text("ERROR:", "red")} {message}', tb)
838
839 def write_debug(self, message, only_once=False):
840 '''Log debug message or Print message to stderr'''
841 if not self.params.get('verbose', False):
842 return
843 message = '[debug] %s' % message
844 if self.params.get('logger'):
845 self.params['logger'].debug(message)
846 else:
847 self.to_stderr(message, only_once)
848
849 def report_file_already_downloaded(self, file_name):
850 """Report file has already been fully downloaded."""
851 try:
852 self.to_screen('[download] %s has already been downloaded' % file_name)
853 except UnicodeEncodeError:
854 self.to_screen('[download] The file has already been downloaded')
855
856 def report_file_delete(self, file_name):
857 """Report that existing file will be deleted."""
858 try:
859 self.to_screen('Deleting existing file %s' % file_name)
860 except UnicodeEncodeError:
861 self.to_screen('Deleting existing file')
862
863 def raise_no_formats(self, info, forced=False):
864 has_drm = info.get('__has_drm')
865 msg = 'This video is DRM protected' if has_drm else 'No video formats found!'
866 expected = self.params.get('ignore_no_formats_error')
867 if forced or not expected:
868 raise ExtractorError(msg, video_id=info['id'], ie=info['extractor'],
869 expected=has_drm or expected)
870 else:
871 self.report_warning(msg)
872
873 def parse_outtmpl(self):
874 outtmpl_dict = self.params.get('outtmpl', {})
875 if not isinstance(outtmpl_dict, dict):
876 outtmpl_dict = {'default': outtmpl_dict}
877 outtmpl_dict.update({
878 k: v for k, v in DEFAULT_OUTTMPL.items()
879 if outtmpl_dict.get(k) is None})
880 for key, val in outtmpl_dict.items():
881 if isinstance(val, bytes):
882 self.report_warning(
883 'Parameter outtmpl is bytes, but should be a unicode string. '
884 'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.')
885 return outtmpl_dict
886
887 def get_output_path(self, dir_type='', filename=None):
888 paths = self.params.get('paths', {})
889 assert isinstance(paths, dict)
890 path = os.path.join(
891 expand_path(paths.get('home', '').strip()),
892 expand_path(paths.get(dir_type, '').strip()) if dir_type else '',
893 filename or '')
894
895 # Temporary fix for #4787
896 # 'Treat' all problem characters by passing filename through preferredencoding
897 # to workaround encoding issues with subprocess on python2 @ Windows
898 if sys.version_info < (3, 0) and sys.platform == 'win32':
899 path = encodeFilename(path, True).decode(preferredencoding())
900 return sanitize_path(path, force=self.params.get('windowsfilenames'))
901
902 @staticmethod
903 def _outtmpl_expandpath(outtmpl):
904 # expand_path translates '%%' into '%' and '$$' into '$'
905 # correspondingly that is not what we want since we need to keep
906 # '%%' intact for template dict substitution step. Working around
907 # with boundary-alike separator hack.
908 sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
909 outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
910
911 # outtmpl should be expand_path'ed before template dict substitution
912 # because meta fields may contain env variables we don't want to
913 # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
914 # title "Hello $PATH", we don't want `$PATH` to be expanded.
915 return expand_path(outtmpl).replace(sep, '')
916
917 @staticmethod
918 def escape_outtmpl(outtmpl):
919 ''' Escape any remaining strings like %s, %abc% etc. '''
920 return re.sub(
921 STR_FORMAT_RE_TMPL.format('', '(?![%(\0])'),
922 lambda mobj: ('' if mobj.group('has_key') else '%') + mobj.group(0),
923 outtmpl)
924
925 @classmethod
926 def validate_outtmpl(cls, outtmpl):
927 ''' @return None or Exception object '''
928 outtmpl = re.sub(
929 STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBU]'),
930 lambda mobj: f'{mobj.group(0)[:-1]}s',
931 cls._outtmpl_expandpath(outtmpl))
932 try:
933 cls.escape_outtmpl(outtmpl) % collections.defaultdict(int)
934 return None
935 except ValueError as err:
936 return err
937
    def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None):
        """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict """
        info_dict.setdefault('epoch', int(time.time()))  # keep epoch consistent once set

        info_dict = dict(info_dict)  # Do not sanitize so as not to consume LazyList
        # Internal bookkeeping keys must never leak into templates
        for key in ('__original_infodict', '__postprocessors'):
            info_dict.pop(key, None)
        info_dict['duration_string'] = (  # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
            formatSeconds(info_dict['duration'], '-' if sanitize else ':')
            if info_dict.get('duration', None) is not None
            else None)
        info_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
        if info_dict.get('resolution') is None:
            info_dict['resolution'] = self.format_resolution(info_dict, default=None)

        # For fields playlist_index, playlist_autonumber and autonumber convert all occurrences
        # of %(field)s to %(field)0Nd for backward compatibility
        field_size_compat_map = {
            'playlist_index': len(str(info_dict.get('_last_playlist_index') or '')),
            'playlist_autonumber': len(str(info_dict.get('n_entries') or '')),
            'autonumber': self.params.get('autonumber_size') or 5,
        }

        TMPL_DICT = {}
        EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBU]'))
        MATH_FUNCTIONS = {
            '+': float.__add__,
            '-': float.__sub__,
        }
        # Field is of the form key1.key2...
        # where keys (except first) can be string, int or slice
        FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)')
        # NOTE(review): the '.' in the decimal part below is unescaped, so it
        # matches any character — presumably should be r'\.'; confirm upstream
        MATH_FIELD_RE = r'''{field}|{num}'''.format(field=FIELD_RE, num=r'-?\d+(?:.\d+)?')
        MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys()))
        INTERNAL_FORMAT_RE = re.compile(r'''(?x)
            (?P<negate>-)?
            (?P<fields>{field})
            (?P<maths>(?:{math_op}{math_field})*)
            (?:>(?P<strf_format>.+?))?
            (?P<alternate>(?<!\\),[^|)]+)?
            (?:\|(?P<default>.*?))?
            $'''.format(field=FIELD_RE, math_op=MATH_OPERATORS_RE, math_field=MATH_FIELD_RE))

        def _traverse_infodict(k):
            # Dotted path lookup into info_dict; a leading '.' is tolerated
            k = k.split('.')
            if k[0] == '':
                k.pop(0)
            return traverse_obj(info_dict, k, is_user_input=True, traverse_string=True)

        def get_value(mdict):
            # Resolve one parsed field spec (groupdict of INTERNAL_FORMAT_RE)
            # to its value: traversal, negation, +/- arithmetic, strftime
            # Object traversal
            value = _traverse_infodict(mdict['fields'])
            # Negative
            if mdict['negate']:
                value = float_or_none(value)
                if value is not None:
                    value *= -1
            # Do maths
            offset_key = mdict['maths']
            if offset_key:
                value = float_or_none(value)
                operator = None
                # Alternate between reading an operator and reading an operand
                while offset_key:
                    item = re.match(
                        MATH_FIELD_RE if operator else MATH_OPERATORS_RE,
                        offset_key).group(0)
                    offset_key = offset_key[len(item):]
                    if operator is None:
                        operator = MATH_FUNCTIONS[item]
                        continue
                    item, multiplier = (item[1:], -1) if item[0] == '-' else (item, 1)
                    # Operand may be a literal number or another field reference
                    offset = float_or_none(item)
                    if offset is None:
                        offset = float_or_none(_traverse_infodict(item))
                    try:
                        value = operator(value, multiplier * offset)
                    except (TypeError, ZeroDivisionError):
                        return None
                    operator = None
            # Datetime formatting
            if mdict['strf_format']:
                value = strftime_or_none(value, mdict['strf_format'].replace('\\,', ','))

            return value

        na = self.params.get('outtmpl_na_placeholder', 'NA')

        def _dumpjson_default(obj):
            # Allow sets and LazyLists inside %(...)j conversions
            if isinstance(obj, (set, LazyList)):
                return list(obj)
            raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')

        def create_key(outer_mobj):
            # Substitution callback: rewrite one %(key)fmt occurrence and
            # register its computed value in TMPL_DICT
            if not outer_mobj.group('has_key'):
                return f'%{outer_mobj.group(0)}'
            key = outer_mobj.group('key')
            mobj = re.match(INTERNAL_FORMAT_RE, key)
            initial_field = mobj.group('fields').split('.')[-1] if mobj else ''
            value, default = None, na
            # Walk the comma-separated alternates until one yields a value
            while mobj:
                mobj = mobj.groupdict()
                default = mobj['default'] if mobj['default'] is not None else default
                value = get_value(mobj)
                if value is None and mobj['alternate']:
                    mobj = re.match(INTERNAL_FORMAT_RE, mobj['alternate'][1:])
                else:
                    break

            fmt = outer_mobj.group('format')
            if fmt == 's' and value is not None and key in field_size_compat_map.keys():
                fmt = '0{:d}d'.format(field_size_compat_map[key])

            value = default if value is None else value

            # Custom single-letter conversions are rewritten to plain '%...s'
            str_fmt = f'{fmt[:-1]}s'
            if fmt[-1] == 'l':  # list
                delim = '\n' if '#' in (outer_mobj.group('conversion') or '') else ', '
                value, fmt = delim.join(variadic(value)), str_fmt
            elif fmt[-1] == 'j':  # json
                value, fmt = json.dumps(value, default=_dumpjson_default), str_fmt
            elif fmt[-1] == 'q':  # quoted
                value, fmt = compat_shlex_quote(str(value)), str_fmt
            elif fmt[-1] == 'B':  # bytes
                value = f'%{str_fmt}'.encode('utf-8') % str(value).encode('utf-8')
                value, fmt = value.decode('utf-8', 'ignore'), 's'
            elif fmt[-1] == 'U':  # unicode normalized
                opts = outer_mobj.group('conversion') or ''
                value, fmt = unicodedata.normalize(
                    # "+" = compatibility equivalence, "#" = NFD
                    'NF%s%s' % ('K' if '+' in opts else '', 'D' if '#' in opts else 'C'),
                    value), str_fmt
            elif fmt[-1] == 'c':
                if value:
                    value = str(value)[0]
                else:
                    fmt = str_fmt
            elif fmt[-1] not in 'rs':  # numeric
                value = float_or_none(value)
                if value is None:
                    value, fmt = default, 's'

            if sanitize:
                if fmt[-1] == 'r':
                    # If value is an object, sanitize might convert it to a string
                    # So we convert it to repr first
                    value, fmt = repr(value), str_fmt
                if fmt[-1] in 'csr':
                    value = sanitize(initial_field, value)

            # Embed NULs in the key so escape_outtmpl (which skips '\0') will
            # not re-escape the rewritten placeholder
            key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format'))
            TMPL_DICT[key] = value
            return '{prefix}%({key}){fmt}'.format(key=key, fmt=fmt, prefix=outer_mobj.group('prefix'))

        return EXTERNAL_FORMAT_RE.sub(create_key, outtmpl), TMPL_DICT
1092
1093 def evaluate_outtmpl(self, outtmpl, info_dict, *args, **kwargs):
1094 outtmpl, info_dict = self.prepare_outtmpl(outtmpl, info_dict, *args, **kwargs)
1095 return self.escape_outtmpl(outtmpl) % info_dict
1096
1097 def _prepare_filename(self, info_dict, tmpl_type='default'):
1098 try:
1099 sanitize = lambda k, v: sanitize_filename(
1100 compat_str(v),
1101 restricted=self.params.get('restrictfilenames'),
1102 is_id=(k == 'id' or k.endswith('_id')))
1103 outtmpl = self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default'])
1104 outtmpl, template_dict = self.prepare_outtmpl(outtmpl, info_dict, sanitize)
1105 outtmpl = self.escape_outtmpl(self._outtmpl_expandpath(outtmpl))
1106 filename = outtmpl % template_dict
1107
1108 force_ext = OUTTMPL_TYPES.get(tmpl_type)
1109 if filename and force_ext is not None:
1110 filename = replace_extension(filename, force_ext, info_dict.get('ext'))
1111
1112 # https://github.com/blackjack4494/youtube-dlc/issues/85
1113 trim_file_name = self.params.get('trim_file_name', False)
1114 if trim_file_name:
1115 fn_groups = filename.rsplit('.')
1116 ext = fn_groups[-1]
1117 sub_ext = ''
1118 if len(fn_groups) > 2:
1119 sub_ext = fn_groups[-2]
1120 filename = '.'.join(filter(None, [fn_groups[0][:trim_file_name], sub_ext, ext]))
1121
1122 return filename
1123 except ValueError as err:
1124 self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
1125 return None
1126
1127 def prepare_filename(self, info_dict, dir_type='', warn=False):
1128 """Generate the output filename."""
1129
1130 filename = self._prepare_filename(info_dict, dir_type or 'default')
1131 if not filename and dir_type not in ('', 'temp'):
1132 return ''
1133
1134 if warn:
1135 if not self.params.get('paths'):
1136 pass
1137 elif filename == '-':
1138 self.report_warning('--paths is ignored when an outputting to stdout', only_once=True)
1139 elif os.path.isabs(filename):
1140 self.report_warning('--paths is ignored since an absolute path is given in output template', only_once=True)
1141 if filename == '-' or not filename:
1142 return filename
1143
1144 return self.get_output_path(dir_type, filename)
1145
    def _match_entry(self, info_dict, incomplete=False, silent=False):
        """ Returns None if the file should be downloaded """
        # `incomplete` marks flat/partial playlist entries; `silent` suppresses
        # the "[download] <reason>" message

        video_title = info_dict.get('title', info_dict.get('id', 'video'))

        def check_filter():
            # Returns a human-readable skip reason, or None to accept the video
            if 'title' in info_dict:
                # This can happen when we're just evaluating the playlist
                title = info_dict['title']
                matchtitle = self.params.get('matchtitle', False)
                if matchtitle:
                    if not re.search(matchtitle, title, re.IGNORECASE):
                        return '"' + title + '" title did not match pattern "' + matchtitle + '"'
                rejecttitle = self.params.get('rejecttitle', False)
                if rejecttitle:
                    if re.search(rejecttitle, title, re.IGNORECASE):
                        return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
            date = info_dict.get('upload_date')
            if date is not None:
                dateRange = self.params.get('daterange', DateRange())
                if date not in dateRange:
                    return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
            view_count = info_dict.get('view_count')
            if view_count is not None:
                min_views = self.params.get('min_views')
                if min_views is not None and view_count < min_views:
                    return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
                max_views = self.params.get('max_views')
                if max_views is not None and view_count > max_views:
                    return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
            if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
                return 'Skipping "%s" because it is age restricted' % video_title

            match_filter = self.params.get('match_filter')
            if match_filter is not None:
                try:
                    # Newer match_filter callables accept the `incomplete` kwarg
                    ret = match_filter(info_dict, incomplete=incomplete)
                except TypeError:
                    # For backward compatibility
                    ret = None if incomplete else match_filter(info_dict)
                if ret is not None:
                    return ret
            return None

        # Archive hits and filter rejections use different break options/exceptions
        if self.in_download_archive(info_dict):
            reason = '%s has already been recorded in the archive' % video_title
            break_opt, break_err = 'break_on_existing', ExistingVideoReached
        else:
            reason = check_filter()
            break_opt, break_err = 'break_on_reject', RejectedVideoReached
        if reason is not None:
            if not silent:
                self.to_screen('[download] ' + reason)
            if self.params.get(break_opt, False):
                raise break_err()
        return reason
1202
1203 @staticmethod
1204 def add_extra_info(info_dict, extra_info):
1205 '''Set the keys from extra_info in info dict if they are missing'''
1206 for key, value in extra_info.items():
1207 info_dict.setdefault(key, value)
1208
    def extract_info(self, url, download=True, ie_key=None, extra_info=None,
                     process=True, force_generic_extractor=False):
        """
        Return a list with a dictionary for each video extracted.

        Arguments:
        url -- URL to extract

        Keyword arguments:
        download -- whether to download videos during extraction
        ie_key -- extractor key hint
        extra_info -- dictionary containing the extra values to add to each result
        process -- whether to resolve all unresolved references (URLs, playlist items),
                   must be True for download to work.
        force_generic_extractor -- force using the generic extractor
        """

        if extra_info is None:
            extra_info = {}

        if not ie_key and force_generic_extractor:
            ie_key = 'Generic'

        # An explicit ie_key restricts the candidate extractors to exactly one
        if ie_key:
            ies = {ie_key: self._get_info_extractor_class(ie_key)}
        else:
            ies = self._ies

        for ie_key, ie in ies.items():
            if not ie.suitable(url):
                continue

            if not ie.working():
                self.report_warning('The program functionality for this site has been marked as broken, '
                                    'and will probably not work.')

            # If the id can be derived from the URL alone, the archive check
            # can short-circuit before any network request
            temp_id = ie.get_temp_id(url)
            if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
                self.to_screen("[%s] %s: has already been recorded in archive" % (
                    ie_key, temp_id))
                # `break` skips the for-else below: archive hit is not an error
                break
            return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process)
        else:
            # for-else: only reached when no extractor was suitable for the URL
            self.report_error('no suitable InfoExtractor for URL %s' % url)
1253
    def __handle_extraction_exceptions(func):
        # Decorator (applied at class-body time, so it receives the raw
        # function, not self) that converts extraction errors into
        # report_error() calls, honouring the 'ignoreerrors' option

        def wrapper(self, *args, **kwargs):
            try:
                return func(self, *args, **kwargs)
            except GeoRestrictedError as e:
                msg = e.msg
                if e.countries:
                    msg += '\nThis video is available in %s.' % ', '.join(
                        map(ISO3166Utils.short2full, e.countries))
                msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
                self.report_error(msg)
            except ExtractorError as e:  # An error we somewhat expected
                self.report_error(compat_str(e), e.format_traceback())
            except ThrottledDownload:
                self.to_stderr('\r')
                self.report_warning('The download speed is below throttle limit. Re-extracting data')
                # Retry the whole wrapped call from scratch
                return wrapper(self, *args, **kwargs)
            except (MaxDownloadsReached, ExistingVideoReached, RejectedVideoReached, LazyList.IndexError):
                # Control-flow exceptions must propagate to the caller
                raise
            except Exception as e:
                if self.params.get('ignoreerrors'):
                    self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
                else:
                    raise
        return wrapper
1280
1281 @__handle_extraction_exceptions
1282 def __extract_info(self, url, ie, download, extra_info, process):
1283 ie_result = ie.extract(url)
1284 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
1285 return
1286 if isinstance(ie_result, list):
1287 # Backwards compatibility: old IE result format
1288 ie_result = {
1289 '_type': 'compat_list',
1290 'entries': ie_result,
1291 }
1292 if extra_info.get('original_url'):
1293 ie_result.setdefault('original_url', extra_info['original_url'])
1294 self.add_default_extra_info(ie_result, ie, url)
1295 if process:
1296 return self.process_ie_result(ie_result, download, extra_info)
1297 else:
1298 return ie_result
1299
1300 def add_default_extra_info(self, ie_result, ie, url):
1301 if url is not None:
1302 self.add_extra_info(ie_result, {
1303 'webpage_url': url,
1304 'original_url': url,
1305 'webpage_url_basename': url_basename(url),
1306 })
1307 if ie is not None:
1308 self.add_extra_info(ie_result, {
1309 'extractor': ie.IE_NAME,
1310 'extractor_key': ie.ie_key(),
1311 })
1312
1313 def process_ie_result(self, ie_result, download=True, extra_info=None):
1314 """
1315 Take the result of the ie(may be modified) and resolve all unresolved
1316 references (URLs, playlist items).
1317
1318 It will also download the videos if 'download'.
1319 Returns the resolved ie_result.
1320 """
1321 if extra_info is None:
1322 extra_info = {}
1323 result_type = ie_result.get('_type', 'video')
1324
1325 if result_type in ('url', 'url_transparent'):
1326 ie_result['url'] = sanitize_url(ie_result['url'])
1327 if ie_result.get('original_url'):
1328 extra_info.setdefault('original_url', ie_result['original_url'])
1329
1330 extract_flat = self.params.get('extract_flat', False)
1331 if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
1332 or extract_flat is True):
1333 info_copy = ie_result.copy()
1334 ie = try_get(ie_result.get('ie_key'), self.get_info_extractor)
1335 if ie and not ie_result.get('id'):
1336 info_copy['id'] = ie.get_temp_id(ie_result['url'])
1337 self.add_default_extra_info(info_copy, ie, ie_result['url'])
1338 self.add_extra_info(info_copy, extra_info)
1339 self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True)
1340 if self.params.get('force_write_download_archive', False):
1341 self.record_download_archive(info_copy)
1342 return ie_result
1343
1344 if result_type == 'video':
1345 self.add_extra_info(ie_result, extra_info)
1346 ie_result = self.process_video_result(ie_result, download=download)
1347 additional_urls = (ie_result or {}).get('additional_urls')
1348 if additional_urls:
1349 # TODO: Improve MetadataParserPP to allow setting a list
1350 if isinstance(additional_urls, compat_str):
1351 additional_urls = [additional_urls]
1352 self.to_screen(
1353 '[info] %s: %d additional URL(s) requested' % (ie_result['id'], len(additional_urls)))
1354 self.write_debug('Additional URLs: "%s"' % '", "'.join(additional_urls))
1355 ie_result['additional_entries'] = [
1356 self.extract_info(
1357 url, download, extra_info,
1358 force_generic_extractor=self.params.get('force_generic_extractor'))
1359 for url in additional_urls
1360 ]
1361 return ie_result
1362 elif result_type == 'url':
1363 # We have to add extra_info to the results because it may be
1364 # contained in a playlist
1365 return self.extract_info(
1366 ie_result['url'], download,
1367 ie_key=ie_result.get('ie_key'),
1368 extra_info=extra_info)
1369 elif result_type == 'url_transparent':
1370 # Use the information from the embedding page
1371 info = self.extract_info(
1372 ie_result['url'], ie_key=ie_result.get('ie_key'),
1373 extra_info=extra_info, download=False, process=False)
1374
1375 # extract_info may return None when ignoreerrors is enabled and
1376 # extraction failed with an error, don't crash and return early
1377 # in this case
1378 if not info:
1379 return info
1380
1381 force_properties = dict(
1382 (k, v) for k, v in ie_result.items() if v is not None)
1383 for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
1384 if f in force_properties:
1385 del force_properties[f]
1386 new_result = info.copy()
1387 new_result.update(force_properties)
1388
1389 # Extracted info may not be a video result (i.e.
1390 # info.get('_type', 'video') != video) but rather an url or
1391 # url_transparent. In such cases outer metadata (from ie_result)
1392 # should be propagated to inner one (info). For this to happen
1393 # _type of info should be overridden with url_transparent. This
1394 # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
1395 if new_result.get('_type') == 'url':
1396 new_result['_type'] = 'url_transparent'
1397
1398 return self.process_ie_result(
1399 new_result, download=download, extra_info=extra_info)
1400 elif result_type in ('playlist', 'multi_video'):
1401 # Protect from infinite recursion due to recursively nested playlists
1402 # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
1403 webpage_url = ie_result['webpage_url']
1404 if webpage_url in self._playlist_urls:
1405 self.to_screen(
1406 '[download] Skipping already downloaded playlist: %s'
1407 % ie_result.get('title') or ie_result.get('id'))
1408 return
1409
1410 self._playlist_level += 1
1411 self._playlist_urls.add(webpage_url)
1412 self._sanitize_thumbnails(ie_result)
1413 try:
1414 return self.__process_playlist(ie_result, download)
1415 finally:
1416 self._playlist_level -= 1
1417 if not self._playlist_level:
1418 self._playlist_urls.clear()
1419 elif result_type == 'compat_list':
1420 self.report_warning(
1421 'Extractor %s returned a compat_list result. '
1422 'It needs to be updated.' % ie_result.get('extractor'))
1423
1424 def _fixup(r):
1425 self.add_extra_info(r, {
1426 'extractor': ie_result['extractor'],
1427 'webpage_url': ie_result['webpage_url'],
1428 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1429 'extractor_key': ie_result['extractor_key'],
1430 })
1431 return r
1432 ie_result['entries'] = [
1433 self.process_ie_result(_fixup(r), download, extra_info)
1434 for r in ie_result['entries']
1435 ]
1436 return ie_result
1437 else:
1438 raise Exception('Invalid result type: %s' % result_type)
1439
    def _ensure_dir_exists(self, path):
        # Create the directory for `path` if needed; failures go to report_error
        return make_dir(path, self.report_error)
1442
1443 def __process_playlist(self, ie_result, download):
1444 # We process each entry in the playlist
1445 playlist = ie_result.get('title') or ie_result.get('id')
1446 self.to_screen('[download] Downloading playlist: %s' % playlist)
1447
1448 if 'entries' not in ie_result:
1449 raise EntryNotInPlaylist()
1450 incomplete_entries = bool(ie_result.get('requested_entries'))
1451 if incomplete_entries:
1452 def fill_missing_entries(entries, indexes):
1453 ret = [None] * max(*indexes)
1454 for i, entry in zip(indexes, entries):
1455 ret[i - 1] = entry
1456 return ret
1457 ie_result['entries'] = fill_missing_entries(ie_result['entries'], ie_result['requested_entries'])
1458
1459 playlist_results = []
1460
1461 playliststart = self.params.get('playliststart', 1)
1462 playlistend = self.params.get('playlistend')
1463 # For backwards compatibility, interpret -1 as whole list
1464 if playlistend == -1:
1465 playlistend = None
1466
1467 playlistitems_str = self.params.get('playlist_items')
1468 playlistitems = None
1469 if playlistitems_str is not None:
1470 def iter_playlistitems(format):
1471 for string_segment in format.split(','):
1472 if '-' in string_segment:
1473 start, end = string_segment.split('-')
1474 for item in range(int(start), int(end) + 1):
1475 yield int(item)
1476 else:
1477 yield int(string_segment)
1478 playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
1479
1480 ie_entries = ie_result['entries']
1481 msg = (
1482 'Downloading %d videos' if not isinstance(ie_entries, list)
1483 else 'Collected %d videos; downloading %%d of them' % len(ie_entries))
1484
1485 if isinstance(ie_entries, list):
1486 def get_entry(i):
1487 return ie_entries[i - 1]
1488 else:
1489 if not isinstance(ie_entries, PagedList):
1490 ie_entries = LazyList(ie_entries)
1491
1492 def get_entry(i):
1493 return YoutubeDL.__handle_extraction_exceptions(
1494 lambda self, i: ie_entries[i - 1]
1495 )(self, i)
1496
1497 entries = []
1498 items = playlistitems if playlistitems is not None else itertools.count(playliststart)
1499 for i in items:
1500 if i == 0:
1501 continue
1502 if playlistitems is None and playlistend is not None and playlistend < i:
1503 break
1504 entry = None
1505 try:
1506 entry = get_entry(i)
1507 if entry is None:
1508 raise EntryNotInPlaylist()
1509 except (IndexError, EntryNotInPlaylist):
1510 if incomplete_entries:
1511 raise EntryNotInPlaylist()
1512 elif not playlistitems:
1513 break
1514 entries.append(entry)
1515 try:
1516 if entry is not None:
1517 self._match_entry(entry, incomplete=True, silent=True)
1518 except (ExistingVideoReached, RejectedVideoReached):
1519 break
1520 ie_result['entries'] = entries
1521
1522 # Save playlist_index before re-ordering
1523 entries = [
1524 ((playlistitems[i - 1] if playlistitems else i + playliststart - 1), entry)
1525 for i, entry in enumerate(entries, 1)
1526 if entry is not None]
1527 n_entries = len(entries)
1528
1529 if not playlistitems and (playliststart or playlistend):
1530 playlistitems = list(range(playliststart, playliststart + n_entries))
1531 ie_result['requested_entries'] = playlistitems
1532
1533 if self.params.get('allow_playlist_files', True):
1534 ie_copy = {
1535 'playlist': playlist,
1536 'playlist_id': ie_result.get('id'),
1537 'playlist_title': ie_result.get('title'),
1538 'playlist_uploader': ie_result.get('uploader'),
1539 'playlist_uploader_id': ie_result.get('uploader_id'),
1540 'playlist_index': 0,
1541 }
1542 ie_copy.update(dict(ie_result))
1543
1544 if self._write_info_json('playlist', ie_result,
1545 self.prepare_filename(ie_copy, 'pl_infojson')) is None:
1546 return
1547 if self._write_description('playlist', ie_result,
1548 self.prepare_filename(ie_copy, 'pl_description')) is None:
1549 return
1550 # TODO: This should be passed to ThumbnailsConvertor if necessary
1551 self._write_thumbnails('playlist', ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail'))
1552
1553 if self.params.get('playlistreverse', False):
1554 entries = entries[::-1]
1555 if self.params.get('playlistrandom', False):
1556 random.shuffle(entries)
1557
1558 x_forwarded_for = ie_result.get('__x_forwarded_for_ip')
1559
1560 self.to_screen('[%s] playlist %s: %s' % (ie_result['extractor'], playlist, msg % n_entries))
1561 failures = 0
1562 max_failures = self.params.get('skip_playlist_after_errors') or float('inf')
1563 for i, entry_tuple in enumerate(entries, 1):
1564 playlist_index, entry = entry_tuple
1565 if 'playlist-index' in self.params.get('compat_opts', []):
1566 playlist_index = playlistitems[i - 1] if playlistitems else i + playliststart - 1
1567 self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
1568 # This __x_forwarded_for_ip thing is a bit ugly but requires
1569 # minimal changes
1570 if x_forwarded_for:
1571 entry['__x_forwarded_for_ip'] = x_forwarded_for
1572 extra = {
1573 'n_entries': n_entries,
1574 '_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries),
1575 'playlist_index': playlist_index,
1576 'playlist_autonumber': i,
1577 'playlist': playlist,
1578 'playlist_id': ie_result.get('id'),
1579 'playlist_title': ie_result.get('title'),
1580 'playlist_uploader': ie_result.get('uploader'),
1581 'playlist_uploader_id': ie_result.get('uploader_id'),
1582 'extractor': ie_result['extractor'],
1583 'webpage_url': ie_result['webpage_url'],
1584 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1585 'extractor_key': ie_result['extractor_key'],
1586 }
1587
1588 if self._match_entry(entry, incomplete=True) is not None:
1589 continue
1590
1591 entry_result = self.__process_iterable_entry(entry, download, extra)
1592 if not entry_result:
1593 failures += 1
1594 if failures >= max_failures:
1595 self.report_error(
1596 'Skipping the remaining entries in playlist "%s" since %d items failed extraction' % (playlist, failures))
1597 break
1598 # TODO: skip failed (empty) entries?
1599 playlist_results.append(entry_result)
1600 ie_result['entries'] = playlist_results
1601 self.to_screen('[download] Finished downloading playlist: %s' % playlist)
1602 return ie_result
1603
    @__handle_extraction_exceptions
    def __process_iterable_entry(self, entry, download, extra_info):
        # Thin wrapper so per-entry errors are handled (and possibly ignored)
        # by __handle_extraction_exceptions instead of aborting the playlist
        return self.process_ie_result(
            entry, download=download, extra_info=extra_info)
1608
1609 def _build_format_filter(self, filter_spec):
1610 " Returns a function to filter the formats according to the filter_spec "
1611
1612 OPERATORS = {
1613 '<': operator.lt,
1614 '<=': operator.le,
1615 '>': operator.gt,
1616 '>=': operator.ge,
1617 '=': operator.eq,
1618 '!=': operator.ne,
1619 }
1620 operator_rex = re.compile(r'''(?x)\s*
1621 (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)\s*
1622 (?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1623 (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s*
1624 ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1625 m = operator_rex.fullmatch(filter_spec)
1626 if m:
1627 try:
1628 comparison_value = int(m.group('value'))
1629 except ValueError:
1630 comparison_value = parse_filesize(m.group('value'))
1631 if comparison_value is None:
1632 comparison_value = parse_filesize(m.group('value') + 'B')
1633 if comparison_value is None:
1634 raise ValueError(
1635 'Invalid value %r in format specification %r' % (
1636 m.group('value'), filter_spec))
1637 op = OPERATORS[m.group('op')]
1638
1639 if not m:
1640 STR_OPERATORS = {
1641 '=': operator.eq,
1642 '^=': lambda attr, value: attr.startswith(value),
1643 '$=': lambda attr, value: attr.endswith(value),
1644 '*=': lambda attr, value: value in attr,
1645 }
1646 str_operator_rex = re.compile(r'''(?x)\s*
1647 (?P<key>[a-zA-Z0-9._-]+)\s*
1648 (?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1649 (?P<value>[a-zA-Z0-9._-]+)\s*
1650 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1651 m = str_operator_rex.fullmatch(filter_spec)
1652 if m:
1653 comparison_value = m.group('value')
1654 str_op = STR_OPERATORS[m.group('op')]
1655 if m.group('negation'):
1656 op = lambda attr, value: not str_op(attr, value)
1657 else:
1658 op = str_op
1659
1660 if not m:
1661 raise SyntaxError('Invalid filter specification %r' % filter_spec)
1662
1663 def _filter(f):
1664 actual_value = f.get(m.group('key'))
1665 if actual_value is None:
1666 return m.group('none_inclusive')
1667 return op(actual_value, comparison_value)
1668 return _filter
1669
1670 def _default_format_spec(self, info_dict, download=True):
1671
1672 def can_merge():
1673 merger = FFmpegMergerPP(self)
1674 return merger.available and merger.can_merge()
1675
1676 prefer_best = (
1677 not self.params.get('simulate')
1678 and download
1679 and (
1680 not can_merge()
1681 or info_dict.get('is_live', False)
1682 or self.outtmpl_dict['default'] == '-'))
1683 compat = (
1684 prefer_best
1685 or self.params.get('allow_multiple_audio_streams', False)
1686 or 'format-spec' in self.params.get('compat_opts', []))
1687
1688 return (
1689 'best/bestvideo+bestaudio' if prefer_best
1690 else 'bestvideo*+bestaudio/best' if not compat
1691 else 'bestvideo+bestaudio/best')
1692
    def build_format_selector(self, format_spec):
        """Compile a format selection expression (e.g. 'bv*+ba/b') into a
        selector function.

        The returned function maps a context dict
        {'formats': [...], 'incomplete_formats': bool} to an iterable of the
        format dicts that should be downloaded. Raises SyntaxError on an
        invalid spec and ValueError on an invalid filter value.
        """
        def syntax_error(note, start):
            message = (
                'Invalid format specification: '
                '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
            return SyntaxError(message)

        # Node types of the parsed selector tree
        PICKFIRST = 'PICKFIRST'
        MERGE = 'MERGE'
        SINGLE = 'SINGLE'
        GROUP = 'GROUP'
        FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])

        allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False),
                                  'video': self.params.get('allow_multiple_video_streams', False)}

        check_formats = self.params.get('check_formats')

        def _parse_filter(tokens):
            # Collect everything up to the closing ']' into a raw filter string
            filter_parts = []
            for type, string, start, _, _ in tokens:
                if type == tokenize.OP and string == ']':
                    return ''.join(filter_parts)
                else:
                    filter_parts.append(string)

        def _remove_unused_ops(tokens):
            # Remove operators that we don't use and join them with the surrounding strings
            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
            ALLOWED_OPS = ('/', '+', ',', '(', ')')
            last_string, last_start, last_end, last_line = None, None, None, None
            for type, string, start, end, line in tokens:
                if type == tokenize.OP and string == '[':
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                    # everything inside brackets will be handled by _parse_filter
                    for type, string, start, end, line in tokens:
                        yield type, string, start, end, line
                        if type == tokenize.OP and string == ']':
                            break
                elif type == tokenize.OP and string in ALLOWED_OPS:
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
                    if not last_string:
                        last_string = string
                        last_start = start
                        last_end = end
                    else:
                        last_string += string
            if last_string:
                yield tokenize.NAME, last_string, last_start, last_end, last_line

        def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
            # Recursive-descent parser producing a list of FormatSelector trees
            selectors = []
            current_selector = None
            for type, string, start, _, _ in tokens:
                # ENCODING is only defined in python 3.x
                if type == getattr(tokenize, 'ENCODING', None):
                    continue
                elif type in [tokenize.NAME, tokenize.NUMBER]:
                    current_selector = FormatSelector(SINGLE, string, [])
                elif type == tokenize.OP:
                    if string == ')':
                        if not inside_group:
                            # ')' will be handled by the parentheses group
                            tokens.restore_last_token()
                        break
                    elif inside_merge and string in ['/', ',']:
                        tokens.restore_last_token()
                        break
                    elif inside_choice and string == ',':
                        tokens.restore_last_token()
                        break
                    elif string == ',':
                        if not current_selector:
                            raise syntax_error('"," must follow a format selector', start)
                        selectors.append(current_selector)
                        current_selector = None
                    elif string == '/':
                        if not current_selector:
                            raise syntax_error('"/" must follow a format selector', start)
                        first_choice = current_selector
                        second_choice = _parse_format_selection(tokens, inside_choice=True)
                        current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
                    elif string == '[':
                        if not current_selector:
                            current_selector = FormatSelector(SINGLE, 'best', [])
                        format_filter = _parse_filter(tokens)
                        current_selector.filters.append(format_filter)
                    elif string == '(':
                        if current_selector:
                            raise syntax_error('Unexpected "("', start)
                        group = _parse_format_selection(tokens, inside_group=True)
                        current_selector = FormatSelector(GROUP, group, [])
                    elif string == '+':
                        if not current_selector:
                            raise syntax_error('Unexpected "+"', start)
                        selector_1 = current_selector
                        selector_2 = _parse_format_selection(tokens, inside_merge=True)
                        if not selector_2:
                            raise syntax_error('Expected a selector', start)
                        current_selector = FormatSelector(MERGE, (selector_1, selector_2), [])
                    else:
                        raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
                elif type == tokenize.ENDMARKER:
                    break
            if current_selector:
                selectors.append(current_selector)
            return selectors

        def _merge(formats_pair):
            # Merge two formats (or already-merged groups, via their
            # 'requested_formats') into one synthetic format dict
            format_1, format_2 = formats_pair

            formats_info = []
            formats_info.extend(format_1.get('requested_formats', (format_1,)))
            formats_info.extend(format_2.get('requested_formats', (format_2,)))

            if not allow_multiple_streams['video'] or not allow_multiple_streams['audio']:
                # Keep at most one stream per disallowed kind, dropping extras
                get_no_more = {'video': False, 'audio': False}
                for (i, fmt_info) in enumerate(formats_info):
                    if fmt_info.get('acodec') == fmt_info.get('vcodec') == 'none':
                        formats_info.pop(i)
                        continue
                    for aud_vid in ['audio', 'video']:
                        if not allow_multiple_streams[aud_vid] and fmt_info.get(aud_vid[0] + 'codec') != 'none':
                            if get_no_more[aud_vid]:
                                formats_info.pop(i)
                                break
                            get_no_more[aud_vid] = True

            if len(formats_info) == 1:
                return formats_info[0]

            video_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('vcodec') != 'none']
            audio_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('acodec') != 'none']

            the_only_video = video_fmts[0] if len(video_fmts) == 1 else None
            the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None

            output_ext = self.params.get('merge_output_format')
            if not output_ext:
                if the_only_video:
                    output_ext = the_only_video['ext']
                elif the_only_audio and not video_fmts:
                    output_ext = the_only_audio['ext']
                else:
                    output_ext = 'mkv'

            new_dict = {
                'requested_formats': formats_info,
                'format': '+'.join(fmt_info.get('format') for fmt_info in formats_info),
                'format_id': '+'.join(fmt_info.get('format_id') for fmt_info in formats_info),
                'ext': output_ext,
            }

            if the_only_video:
                new_dict.update({
                    'width': the_only_video.get('width'),
                    'height': the_only_video.get('height'),
                    'resolution': the_only_video.get('resolution') or self.format_resolution(the_only_video),
                    'fps': the_only_video.get('fps'),
                    'vcodec': the_only_video.get('vcodec'),
                    'vbr': the_only_video.get('vbr'),
                    'stretched_ratio': the_only_video.get('stretched_ratio'),
                })

            if the_only_audio:
                new_dict.update({
                    'acodec': the_only_audio.get('acodec'),
                    'abr': the_only_audio.get('abr'),
                })

            return new_dict

        def _check_formats(formats):
            # With --check-formats, probe each format with a tiny test
            # download and drop the ones that fail; otherwise pass through
            if not check_formats:
                yield from formats
                return
            for f in formats:
                self.to_screen('[info] Testing format %s' % f['format_id'])
                temp_file = tempfile.NamedTemporaryFile(
                    suffix='.tmp', delete=False,
                    dir=self.get_output_path('temp') or None)
                temp_file.close()
                try:
                    success, _ = self.dl(temp_file.name, f, test=True)
                except (DownloadError, IOError, OSError, ValueError) + network_exceptions:
                    success = False
                finally:
                    if os.path.exists(temp_file.name):
                        try:
                            os.remove(temp_file.name)
                        except OSError:
                            self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
                if success:
                    yield f
                else:
                    self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])

        def _build_selector_function(selector):
            # Turn a FormatSelector tree (or a list of them) into a callable
            if isinstance(selector, list):  # ,
                fs = [_build_selector_function(s) for s in selector]

                def selector_function(ctx):
                    for f in fs:
                        yield from f(ctx)
                return selector_function

            elif selector.type == GROUP:  # ()
                selector_function = _build_selector_function(selector.selector)

            elif selector.type == PICKFIRST:  # /
                fs = [_build_selector_function(s) for s in selector.selector]

                def selector_function(ctx):
                    for f in fs:
                        picked_formats = list(f(ctx))
                        if picked_formats:
                            return picked_formats
                    return []

            elif selector.type == MERGE:  # +
                selector_1, selector_2 = map(_build_selector_function, selector.selector)

                def selector_function(ctx):
                    for pair in itertools.product(
                            selector_1(copy.deepcopy(ctx)), selector_2(copy.deepcopy(ctx))):
                        yield _merge(pair)

            elif selector.type == SINGLE:  # atom
                format_spec = selector.selector or 'best'

                # TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector
                if format_spec == 'all':
                    def selector_function(ctx):
                        yield from _check_formats(ctx['formats'])
                elif format_spec == 'mergeall':
                    def selector_function(ctx):
                        formats = list(_check_formats(ctx['formats']))
                        if not formats:
                            return
                        merged_format = formats[-1]
                        for f in formats[-2::-1]:
                            merged_format = _merge((merged_format, f))
                        yield merged_format

                else:
                    format_fallback, format_reverse, format_idx = False, True, 1
                    mobj = re.match(
                        r'(?P<bw>best|worst|b|w)(?P<type>video|audio|v|a)?(?P<mod>\*)?(?:\.(?P<n>[1-9]\d*))?$',
                        format_spec)
                    if mobj is not None:
                        format_idx = int_or_none(mobj.group('n'), default=1)
                        format_reverse = mobj.group('bw')[0] == 'b'
                        format_type = (mobj.group('type') or [None])[0]
                        not_format_type = {'v': 'a', 'a': 'v'}.get(format_type)
                        format_modified = mobj.group('mod') is not None

                        format_fallback = not format_type and not format_modified  # for b, w
                        _filter_f = (
                            (lambda f: f.get('%scodec' % format_type) != 'none')
                            if format_type and format_modified  # bv*, ba*, wv*, wa*
                            else (lambda f: f.get('%scodec' % not_format_type) == 'none')
                            if format_type  # bv, ba, wv, wa
                            else (lambda f: f.get('vcodec') != 'none' and f.get('acodec') != 'none')
                            if not format_modified  # b, w
                            else lambda f: True)  # b*, w*
                        filter_f = lambda f: _filter_f(f) and (
                            f.get('vcodec') != 'none' or f.get('acodec') != 'none')
                    else:
                        if format_spec in ('m4a', 'mp3', 'ogg', 'aac'):  # audio extension
                            filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none'
                        elif format_spec in ('mp4', 'flv', 'webm', '3gp'):  # video extension
                            filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none' and f.get('vcodec') != 'none'
                        elif format_spec in ('mhtml', ):  # storyboards extension
                            filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') == 'none' and f.get('vcodec') == 'none'
                        else:
                            filter_f = (lambda f: f.get('format_id') == format_spec)  # id

                    def selector_function(ctx):
                        formats = list(ctx['formats'])
                        matches = list(filter(filter_f, formats)) if filter_f is not None else formats
                        if format_fallback and ctx['incomplete_formats'] and not matches:
                            # for extractors with incomplete formats (audio only (soundcloud)
                            # or video only (imgur)) best/worst will fallback to
                            # best/worst {video,audio}-only format
                            matches = formats
                        matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1]))
                        try:
                            yield matches[format_idx - 1]
                        except IndexError:
                            return

            filters = [self._build_format_filter(f) for f in selector.filters]

            def final_selector(ctx):
                # Apply the [..] filters on a deep copy so siblings are unaffected
                ctx_copy = copy.deepcopy(ctx)
                for _filter in filters:
                    ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
                return selector_function(ctx_copy)
            return final_selector

        stream = io.BytesIO(format_spec.encode('utf-8'))
        try:
            tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
        except tokenize.TokenError:
            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

        class TokenIterator(object):
            # Iterator over tokens with one-token push-back support,
            # needed by the recursive parser
            def __init__(self, tokens):
                self.tokens = tokens
                self.counter = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.counter >= len(self.tokens):
                    raise StopIteration()
                value = self.tokens[self.counter]
                self.counter += 1
                return value

            next = __next__

            def restore_last_token(self):
                self.counter -= 1

        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
        return _build_selector_function(parsed_selector)
2028
2029 def _calc_headers(self, info_dict):
2030 res = std_headers.copy()
2031
2032 add_headers = info_dict.get('http_headers')
2033 if add_headers:
2034 res.update(add_headers)
2035
2036 cookies = self._calc_cookies(info_dict)
2037 if cookies:
2038 res['Cookie'] = cookies
2039
2040 if 'X-Forwarded-For' not in res:
2041 x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
2042 if x_forwarded_for_ip:
2043 res['X-Forwarded-For'] = x_forwarded_for_ip
2044
2045 return res
2046
    def _calc_cookies(self, info_dict):
        """Return the 'Cookie' header value for info_dict['url'] from the
        cookiejar, or None if no cookies match."""
        pr = sanitized_Request(info_dict['url'])
        self.cookiejar.add_cookie_header(pr)
        return pr.get_header('Cookie')
2051
    def _sanitize_thumbnails(self, info_dict):
        """Normalize info_dict['thumbnails'] in place: build the list from a
        lone 'thumbnail' field if needed, sort worst-to-best, fill in missing
        ids/resolutions, sanitize URLs and optionally test reachability."""
        thumbnails = info_dict.get('thumbnails')
        if thumbnails is None:
            # Fall back to the single 'thumbnail' field
            thumbnail = info_dict.get('thumbnail')
            if thumbnail:
                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
        if thumbnails:
            # Sort ascending, so the preferred thumbnail ends up last
            thumbnails.sort(key=lambda t: (
                t.get('preference') if t.get('preference') is not None else -1,
                t.get('width') if t.get('width') is not None else -1,
                t.get('height') if t.get('height') is not None else -1,
                t.get('id') if t.get('id') is not None else '',
                t.get('url')))

            def thumbnail_tester():
                # With check_formats every URL is tested; otherwise only those
                # the extractor marked with '_test_url'
                if self.params.get('check_formats'):
                    test_all = True
                    to_screen = lambda msg: self.to_screen(f'[info] {msg}')
                else:
                    test_all = False
                    to_screen = self.write_debug

                def test_thumbnail(t):
                    if not test_all and not t.get('_test_url'):
                        return True
                    to_screen('Testing thumbnail %s' % t['id'])
                    try:
                        self.urlopen(HEADRequest(t['url']))
                    except network_exceptions as err:
                        to_screen('Unable to connect to thumbnail %s URL "%s" - %s. Skipping...' % (
                            t['id'], t['url'], error_to_compat_str(err)))
                        return False
                    return True

                return test_thumbnail

            for i, t in enumerate(thumbnails):
                if t.get('id') is None:
                    t['id'] = '%d' % i
                if t.get('width') and t.get('height'):
                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
                t['url'] = sanitize_url(t['url'])

            if self.params.get('check_formats') is not False:
                # Iterate reversed (best first) so testing can stop early,
                # then restore ascending order lazily
                info_dict['thumbnails'] = LazyList(filter(thumbnail_tester(), thumbnails[::-1])).reverse()
            else:
                info_dict['thumbnails'] = thumbnails
2099
    def process_video_result(self, info_dict, download=True):
        """Sanitize a single video's info_dict, select the requested formats
        and subtitles and, if download is True, hand each selected format to
        process_info. Returns the (mutated) info_dict.

        Raises ExtractorError when mandatory fields are missing or when no
        format matches and 'ignore_no_formats_error' is not set.
        """
        assert info_dict.get('_type', 'video') == 'video'

        if 'id' not in info_dict:
            raise ExtractorError('Missing "id" field in extractor result')
        if 'title' not in info_dict:
            raise ExtractorError('Missing "title" field in extractor result',
                                 video_id=info_dict['id'], ie=info_dict['extractor'])

        def report_force_conversion(field, field_not, conversion):
            self.report_warning(
                '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
                % (field, field_not, conversion))

        def sanitize_string_field(info, string_field):
            # Coerce a non-string field to str, warning about the extractor bug
            field = info.get(string_field)
            if field is None or isinstance(field, compat_str):
                return
            report_force_conversion(string_field, 'a string', 'string')
            info[string_field] = compat_str(field)

        def sanitize_numeric_fields(info):
            # Coerce all known numeric fields to int, warning about extractor bugs
            for numeric_field in self._NUMERIC_FIELDS:
                field = info.get(numeric_field)
                if field is None or isinstance(field, compat_numeric_types):
                    continue
                report_force_conversion(numeric_field, 'numeric', 'int')
                info[numeric_field] = int_or_none(field)

        sanitize_string_field(info_dict, 'id')
        sanitize_numeric_fields(info_dict)

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        self._sanitize_thumbnails(info_dict)

        thumbnail = info_dict.get('thumbnail')
        thumbnails = info_dict.get('thumbnails')
        if thumbnail:
            info_dict['thumbnail'] = sanitize_url(thumbnail)
        elif thumbnails:
            # The list is sorted ascending, so the last entry is the best
            info_dict['thumbnail'] = thumbnails[-1]['url']

        if info_dict.get('display_id') is None and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        # Derive the date fields from the corresponding timestamps when missing
        for ts_key, date_key in (
                ('timestamp', 'upload_date'),
                ('release_timestamp', 'release_date'),
        ):
            if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
                # Working around out-of-range timestamp values (e.g. negative ones on Windows,
                # see http://bugs.python.org/issue1646728)
                try:
                    upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
                    info_dict[date_key] = upload_date.strftime('%Y%m%d')
                except (ValueError, OverflowError, OSError):
                    pass

        # Keep 'live_status' and the 'is_live'/'was_live' booleans consistent
        live_keys = ('is_live', 'was_live')
        live_status = info_dict.get('live_status')
        if live_status is None:
            for key in live_keys:
                if info_dict.get(key) is False:
                    continue
                if info_dict.get(key):
                    live_status = key
                break
            if all(info_dict.get(key) is False for key in live_keys):
                live_status = 'not_live'
        if live_status:
            info_dict['live_status'] = live_status
            for key in live_keys:
                if info_dict.get(key) is None:
                    info_dict[key] = (live_status == key)

        # Auto generate title fields corresponding to the *_number fields when missing
        # in order to always have clean titles. This is very common for TV series.
        for field in ('chapter', 'season', 'episode'):
            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])

        # Sanitize subtitle/caption URLs and fill in missing extensions
        for cc_kind in ('subtitles', 'automatic_captions'):
            cc = info_dict.get(cc_kind)
            if cc:
                for _, subtitle in cc.items():
                    for subtitle_format in subtitle:
                        if subtitle_format.get('url'):
                            subtitle_format['url'] = sanitize_url(subtitle_format['url'])
                        if subtitle_format.get('ext') is None:
                            subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()

        automatic_captions = info_dict.get('automatic_captions')
        subtitles = info_dict.get('subtitles')

        info_dict['requested_subtitles'] = self.process_subtitles(
            info_dict['id'], subtitles, automatic_captions)

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        info_dict['__has_drm'] = any(f.get('has_drm') for f in formats)
        if not self.params.get('allow_unplayable_formats'):
            formats = [f for f in formats if not f.get('has_drm')]

        if not formats:
            self.raise_no_formats(info_dict)

        def is_wellformed(f):
            url = f.get('url')
            if not url:
                self.report_warning(
                    '"url" field is missing or empty - skipping format, '
                    'there is an error in extractor')
                return False
            if isinstance(url, bytes):
                sanitize_string_field(f, 'url')
            return True

        # Filter out malformed formats for better extraction robustness
        formats = list(filter(is_wellformed, formats))

        formats_dict = {}

        # We check that all the formats have the format and format_id fields
        for i, format in enumerate(formats):
            sanitize_string_field(format, 'format_id')
            sanitize_numeric_fields(format)
            format['url'] = sanitize_url(format['url'])
            if not format.get('format_id'):
                format['format_id'] = compat_str(i)
            else:
                # Sanitize format_id from characters used in format selector expression
                format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
            format_id = format['format_id']
            if format_id not in formats_dict:
                formats_dict[format_id] = []
            formats_dict[format_id].append(format)

        # Make sure all formats have unique format_id
        for format_id, ambiguous_formats in formats_dict.items():
            if len(ambiguous_formats) > 1:
                for i, format in enumerate(ambiguous_formats):
                    format['format_id'] = '%s-%d' % (format_id, i)

        for i, format in enumerate(formats):
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=format_field(format, 'format_note', ' (%s)'),
                )
            # Automatically determine file extension if missing
            if format.get('ext') is None:
                format['ext'] = determine_ext(format['url']).lower()
            # Automatically determine protocol if missing (useful for format
            # selection purposes)
            if format.get('protocol') is None:
                format['protocol'] = determine_protocol(format)
            # Add HTTP headers, so that external programs can use them from the
            # json output
            full_format_info = info_dict.copy()
            full_format_info.update(format)
            format['http_headers'] = self._calc_headers(full_format_info)
        # Remove private housekeeping stuff
        if '__x_forwarded_for_ip' in info_dict:
            del info_dict['__x_forwarded_for_ip']

        # TODO Central sorting goes here

        if not formats or formats[0] is not info_dict:
            # only set the 'formats' fields if the original info_dict list them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats

        info_dict, _ = self.pre_process(info_dict)

        if self.params.get('list_thumbnails'):
            self.list_thumbnails(info_dict)
        if self.params.get('listformats'):
            if not info_dict.get('formats') and not info_dict.get('url'):
                self.to_screen('%s has no formats' % info_dict['id'])
            else:
                self.list_formats(info_dict)
        if self.params.get('listsubtitles'):
            if 'automatic_captions' in info_dict:
                self.list_subtitles(
                    info_dict['id'], automatic_captions, 'automatic captions')
            self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
        list_only = self.params.get('simulate') is None and (
            self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles'))
        if list_only:
            # Without this printing, -F --print-json will not work
            self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True)
            return

        format_selector = self.format_selector
        if format_selector is None:
            req_format = self._default_format_spec(info_dict, download=download)
            self.write_debug('Default format spec: %s' % req_format)
            format_selector = self.build_format_selector(req_format)

        # While in format selection we may need to have an access to the original
        # format set in order to calculate some metrics or do some processing.
        # For now we need to be able to guess whether original formats provided
        # by extractor are incomplete or not (i.e. whether extractor provides only
        # video-only or audio-only formats) for proper formats selection for
        # extractors with such incomplete formats (see
        # https://github.com/ytdl-org/youtube-dl/pull/5556).
        # Since formats may be filtered during format selection and may not match
        # the original formats the results may be incorrect. Thus original formats
        # or pre-calculated metrics should be passed to format selection routines
        # as well.
        # We will pass a context object containing all necessary additional data
        # instead of just formats.
        # This fixes incorrect format selection issue (see
        # https://github.com/ytdl-org/youtube-dl/issues/10083).
        incomplete_formats = (
            # All formats are video-only or
            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
            # all formats are audio-only
            or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))

        ctx = {
            'formats': formats,
            'incomplete_formats': incomplete_formats,
        }

        formats_to_download = list(format_selector(ctx))
        if not formats_to_download:
            if not self.params.get('ignore_no_formats_error'):
                raise ExtractorError('Requested format is not available', expected=True,
                                     video_id=info_dict['id'], ie=info_dict['extractor'])
            else:
                self.report_warning('Requested format is not available')
                # Process what we can, even without any available formats.
                self.process_info(dict(info_dict))
        elif download:
            self.to_screen(
                '[info] %s: Downloading %d format(s): %s' % (
                    info_dict['id'], len(formats_to_download),
                    ", ".join([f['format_id'] for f in formats_to_download])))
            for fmt in formats_to_download:
                new_info = dict(info_dict)
                # Save a reference to the original info_dict so that it can be modified in process_info if needed
                new_info['__original_infodict'] = info_dict
                new_info.update(fmt)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        if formats_to_download:
            info_dict.update(formats_to_download[-1])
        return info_dict
2361
2362 def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
2363 """Select the requested subtitles and their format"""
2364 available_subs = {}
2365 if normal_subtitles and self.params.get('writesubtitles'):
2366 available_subs.update(normal_subtitles)
2367 if automatic_captions and self.params.get('writeautomaticsub'):
2368 for lang, cap_info in automatic_captions.items():
2369 if lang not in available_subs:
2370 available_subs[lang] = cap_info
2371
2372 if (not self.params.get('writesubtitles') and not
2373 self.params.get('writeautomaticsub') or not
2374 available_subs):
2375 return None
2376
2377 all_sub_langs = available_subs.keys()
2378 if self.params.get('allsubtitles', False):
2379 requested_langs = all_sub_langs
2380 elif self.params.get('subtitleslangs', False):
2381 # A list is used so that the order of languages will be the same as
2382 # given in subtitleslangs. See https://github.com/yt-dlp/yt-dlp/issues/1041
2383 requested_langs = []
2384 for lang_re in self.params.get('subtitleslangs'):
2385 if lang_re == 'all':
2386 requested_langs.extend(all_sub_langs)
2387 continue
2388 discard = lang_re[0] == '-'
2389 if discard:
2390 lang_re = lang_re[1:]
2391 current_langs = filter(re.compile(lang_re + '$').match, all_sub_langs)
2392 if discard:
2393 for lang in current_langs:
2394 while lang in requested_langs:
2395 requested_langs.remove(lang)
2396 else:
2397 requested_langs.extend(current_langs)
2398 requested_langs = orderedSet(requested_langs)
2399 elif 'en' in available_subs:
2400 requested_langs = ['en']
2401 else:
2402 requested_langs = [list(all_sub_langs)[0]]
2403 if requested_langs:
2404 self.write_debug('Downloading subtitles: %s' % ', '.join(requested_langs))
2405
2406 formats_query = self.params.get('subtitlesformat', 'best')
2407 formats_preference = formats_query.split('/') if formats_query else []
2408 subs = {}
2409 for lang in requested_langs:
2410 formats = available_subs.get(lang)
2411 if formats is None:
2412 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
2413 continue
2414 for ext in formats_preference:
2415 if ext == 'best':
2416 f = formats[-1]
2417 break
2418 matches = list(filter(lambda f: f['ext'] == ext, formats))
2419 if matches:
2420 f = matches[-1]
2421 break
2422 else:
2423 f = formats[-1]
2424 self.report_warning(
2425 'No subtitle format found matching "%s" for language %s, '
2426 'using %s' % (formats_query, lang, f['ext']))
2427 subs[lang] = f
2428 return subs
2429
    def __forced_printings(self, info_dict, filename, incomplete):
        """Write the fields requested via the force* params (forcetitle,
        forceurl, forceprint, forcejson, ...) to stdout.

        @param info_dict    The (already extracted) info dict to print from
        @param filename     Final filename, exposed as %(filename)s; may be None
        @param incomplete   Whether extraction was partial, in which case
                            missing "mandatory" fields are tolerated
        """
        def print_mandatory(field, actual_field=None):
            # Mandatory fields are printed even when missing (raising KeyError),
            # unless extraction is known to be incomplete
            if actual_field is None:
                actual_field = field
            if (self.params.get('force%s' % field, False)
                    and (not incomplete or info_dict.get(actual_field) is not None)):
                self.to_stdout(info_dict[actual_field])

        def print_optional(field):
            # Optional fields are only printed when present
            if (self.params.get('force%s' % field, False)
                    and info_dict.get(field) is not None):
                self.to_stdout(info_dict[field])

        info_dict = info_dict.copy()  # avoid mutating the caller's dict
        if filename is not None:
            info_dict['filename'] = filename
        if info_dict.get('requested_formats') is not None:
            # For RTMP URLs, also include the playpath
            info_dict['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats'])
        elif 'url' in info_dict:
            info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '')

        if self.params.get('forceprint') or self.params.get('forcejson'):
            self.post_extract(info_dict)
        for tmpl in self.params.get('forceprint', []):
            # A bare field name is shorthand for the '%(field)s' output template
            self.to_stdout(self.evaluate_outtmpl(
                f'%({tmpl})s' if re.match(r'\w+$', tmpl) else tmpl, info_dict))

        print_mandatory('title')
        print_mandatory('id')
        print_mandatory('url', 'urls')
        print_optional('thumbnail')
        print_optional('description')
        print_optional('filename')
        if self.params.get('forceduration') and info_dict.get('duration') is not None:
            self.to_stdout(formatSeconds(info_dict['duration']))
        print_mandatory('format')

        if self.params.get('forcejson'):
            self.to_stdout(json.dumps(self.sanitize_info(info_dict)))
2470
    def dl(self, name, info, subtitle=False, test=False):
        """Download the single format described by `info` to the file `name`.

        @param name      Destination filename ('-' streams to stdout)
        @param info      Info dict of the format to download; must have 'url'
        @param subtitle  Whether this is a subtitle download
        @param test      Download only a small part, quietly, leaving no traces
        Returns whatever FileDownloader.download returns (callers unpack it as
        a (success, real_download) pair).
        """
        if not info.get('url'):
            self.raise_no_formats(info, True)

        if test:
            # Use a throw-away parameter set so the test download is quiet
            # and leaves no .part/.ytdl files behind
            verbose = self.params.get('verbose')
            params = {
                'test': True,
                'quiet': not verbose,
                'verbose': verbose,
                'noprogress': not verbose,
                'nopart': True,
                'skip_unavailable_fragments': False,
                'keep_fragments': False,
                'overwrites': True,
                '_no_ytdl_file': True,
            }
        else:
            params = self.params
        fd = get_suitable_downloader(info, params, to_stdout=(name == '-'))(self, params)
        if not test:
            for ph in self._progress_hooks:
                fd.add_progress_hook(ph)
        urls = '", "'.join([f['url'] for f in info.get('requested_formats', [])] or [info['url']])
        self.write_debug('Invoking downloader on "%s"' % urls)
        # Work on a copy so the caller's dict is not polluted with http_headers
        new_info = dict(info)
        if new_info.get('http_headers') is None:
            new_info['http_headers'] = self._calc_headers(new_info)
        return fd.download(name, new_info, subtitle)
2500
    def process_info(self, info_dict):
        """Process a single resolved IE result.

        Handles forced printing, writing of side files (description,
        subtitles, thumbnails, info-json, annotations, internet shortcuts),
        the actual download (including multi-format merge handling),
        fixups, post-processing and download-archive recording.
        """

        assert info_dict.get('_type', 'video') == 'video'

        max_downloads = self.params.get('max_downloads')
        if max_downloads is not None:
            if self._num_downloads >= int(max_downloads):
                raise MaxDownloadsReached()

        # TODO: backward compatibility, to be removed
        info_dict['fulltitle'] = info_dict['title']

        if 'format' not in info_dict and 'ext' in info_dict:
            info_dict['format'] = info_dict['ext']

        # A non-None return value means the video is to be skipped
        if self._match_entry(info_dict) is not None:
            return

        self.post_extract(info_dict)
        self._num_downloads += 1

        # info_dict['_filename'] needs to be set for backward compatibility
        info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
        temp_filename = self.prepare_filename(info_dict, 'temp')
        files_to_move = {}

        # Forced printings
        self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))

        if self.params.get('simulate'):
            if self.params.get('force_write_download_archive', False):
                self.record_download_archive(info_dict)
            # Do nothing else if in simulate mode
            return

        if full_filename is None:
            return
        if not self._ensure_dir_exists(encodeFilename(full_filename)):
            return
        if not self._ensure_dir_exists(encodeFilename(temp_filename)):
            return

        # The _write_* helpers return None on fatal error, falsy on skip
        if self._write_description('video', info_dict,
                                   self.prepare_filename(info_dict, 'description')) is None:
            return

        sub_files = self._write_subtitles(info_dict, temp_filename)
        if sub_files is None:
            return
        files_to_move.update(dict(sub_files))

        thumb_files = self._write_thumbnails(
            'video', info_dict, temp_filename, self.prepare_filename(info_dict, 'thumbnail'))
        if thumb_files is None:
            return
        files_to_move.update(dict(thumb_files))

        infofn = self.prepare_filename(info_dict, 'infojson')
        _infojson_written = self._write_info_json('video', info_dict, infofn)
        if _infojson_written:
            info_dict['__infojson_filename'] = infofn
        elif _infojson_written is None:
            return

        # Note: Annotations are deprecated
        annofn = None
        if self.params.get('writeannotations', False):
            annofn = self.prepare_filename(info_dict, 'annotation')
        if annofn:
            if not self._ensure_dir_exists(encodeFilename(annofn)):
                return
            if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(annofn)):
                self.to_screen('[info] Video annotations are already present')
            elif not info_dict.get('annotations'):
                self.report_warning('There are no annotations to write.')
            else:
                try:
                    self.to_screen('[info] Writing video annotations to: ' + annofn)
                    with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
                        annofile.write(info_dict['annotations'])
                except (KeyError, TypeError):
                    self.report_warning('There are no annotations to write.')
                except (OSError, IOError):
                    self.report_error('Cannot write annotations file: ' + annofn)
                    return

        # Write internet shortcut files
        url_link = webloc_link = desktop_link = False
        if self.params.get('writelink', False):
            if sys.platform == "darwin":  # macOS.
                webloc_link = True
            elif sys.platform.startswith("linux"):
                desktop_link = True
            else:  # if sys.platform in ['win32', 'cygwin']:
                url_link = True
        if self.params.get('writeurllink', False):
            url_link = True
        if self.params.get('writewebloclink', False):
            webloc_link = True
        if self.params.get('writedesktoplink', False):
            desktop_link = True

        if url_link or webloc_link or desktop_link:
            if 'webpage_url' not in info_dict:
                self.report_error('Cannot write internet shortcut file because the "webpage_url" field is missing in the media information')
                return
            ascii_url = iri_to_uri(info_dict['webpage_url'])

        def _write_link_file(extension, template, newline, embed_filename):
            linkfn = replace_extension(full_filename, extension, info_dict.get('ext'))
            # NOTE(review): unlike the annotations check above, this condition
            # lacks `not` — with overwrites enabled (the default) an existing
            # shortcut is never rewritten. Looks inverted; confirm intent.
            if self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)):
                self.to_screen('[info] Internet shortcut is already present')
            else:
                try:
                    self.to_screen('[info] Writing internet shortcut to: ' + linkfn)
                    with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', newline=newline) as linkfile:
                        template_vars = {'url': ascii_url}
                        if embed_filename:
                            template_vars['filename'] = linkfn[:-(len(extension) + 1)]
                        linkfile.write(template % template_vars)
                except (OSError, IOError):
                    self.report_error('Cannot write internet shortcut ' + linkfn)
                    return False
            return True

        if url_link:
            if not _write_link_file('url', DOT_URL_LINK_TEMPLATE, '\r\n', embed_filename=False):
                return
        if webloc_link:
            if not _write_link_file('webloc', DOT_WEBLOC_LINK_TEMPLATE, '\n', embed_filename=False):
                return
        if desktop_link:
            if not _write_link_file('desktop', DOT_DESKTOP_LINK_TEMPLATE, '\n', embed_filename=True):
                return

        try:
            info_dict, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
        except PostProcessingError as err:
            self.report_error('Preprocessing: %s' % str(err))
            return

        must_record_download_archive = False
        if self.params.get('skip_download', False):
            info_dict['filepath'] = temp_filename
            info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
            info_dict['__files_to_move'] = files_to_move
            info_dict = self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict)
        else:
            # Download
            info_dict.setdefault('__postprocessors', [])
            try:

                def existing_file(*filepaths):
                    # Return an already-downloaded file to reuse (possibly the
                    # post-conversion one), or None after deleting stale copies
                    ext = info_dict.get('ext')
                    final_ext = self.params.get('final_ext', ext)
                    existing_files = []
                    for file in orderedSet(filepaths):
                        if final_ext != ext:
                            converted = replace_extension(file, final_ext, ext)
                            if os.path.exists(encodeFilename(converted)):
                                existing_files.append(converted)
                        if os.path.exists(encodeFilename(file)):
                            existing_files.append(file)

                    if not existing_files or self.params.get('overwrites', False):
                        for file in orderedSet(existing_files):
                            self.report_file_delete(file)
                            os.remove(encodeFilename(file))
                        return None

                    info_dict['ext'] = os.path.splitext(existing_files[0])[1][1:]
                    return existing_files[0]

                success = True
                if info_dict.get('requested_formats') is not None:

                    def compatible_formats(formats):
                        # TODO: some formats actually allow this (mkv, webm, ogg, mp4), but not all of them.
                        video_formats = [format for format in formats if format.get('vcodec') != 'none']
                        audio_formats = [format for format in formats if format.get('acodec') != 'none']
                        if len(video_formats) > 2 or len(audio_formats) > 2:
                            return False

                        # Check extension
                        exts = set(format.get('ext') for format in formats)
                        COMPATIBLE_EXTS = (
                            set(('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma')),
                            set(('webm',)),
                        )
                        for ext_sets in COMPATIBLE_EXTS:
                            if ext_sets.issuperset(exts):
                                return True
                        # TODO: Check acodec/vcodec
                        return False

                    requested_formats = info_dict['requested_formats']
                    old_ext = info_dict['ext']
                    if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
                        info_dict['ext'] = 'mkv'
                        self.report_warning(
                            'Requested formats are incompatible for merge and will be merged into mkv.')
                    new_ext = info_dict['ext']

                    def correct_ext(filename, ext=new_ext):
                        # Swap the filename's extension to `ext` (no-op for stdout)
                        if filename == '-':
                            return filename
                        filename_real_ext = os.path.splitext(filename)[1][1:]
                        filename_wo_ext = (
                            os.path.splitext(filename)[0]
                            if filename_real_ext in (old_ext, new_ext)
                            else filename)
                        return '%s.%s' % (filename_wo_ext, ext)

                    # Ensure filename always has a correct extension for successful merge
                    full_filename = correct_ext(full_filename)
                    temp_filename = correct_ext(temp_filename)
                    dl_filename = existing_file(full_filename, temp_filename)
                    info_dict['__real_download'] = False

                    _protocols = set(determine_protocol(f) for f in requested_formats)
                    if len(_protocols) == 1:  # All requested formats have same protocol
                        info_dict['protocol'] = _protocols.pop()
                    directly_mergable = FFmpegFD.can_merge_formats(info_dict, self.params)
                    if dl_filename is not None:
                        self.report_file_already_downloaded(dl_filename)
                    elif (directly_mergable and get_suitable_downloader(
                            info_dict, self.params, to_stdout=(temp_filename == '-')) == FFmpegFD):
                        # ffmpeg can download and merge in one pass
                        info_dict['url'] = '\n'.join(f['url'] for f in requested_formats)
                        success, real_download = self.dl(temp_filename, info_dict)
                        info_dict['__real_download'] = real_download
                    else:
                        # Download each format separately, then merge (if possible)
                        downloaded = []
                        merger = FFmpegMergerPP(self)
                        if self.params.get('allow_unplayable_formats'):
                            self.report_warning(
                                'You have requested merging of multiple formats '
                                'while also allowing unplayable formats to be downloaded. '
                                'The formats won\'t be merged to prevent data corruption.')
                        elif not merger.available:
                            self.report_warning(
                                'You have requested merging of multiple formats but ffmpeg is not installed. '
                                'The formats won\'t be merged.')

                        if temp_filename == '-':
                            reason = ('using a downloader other than ffmpeg' if directly_mergable
                                      else 'but the formats are incompatible for simultaneous download' if merger.available
                                      else 'but ffmpeg is not installed')
                            self.report_warning(
                                f'You have requested downloading multiple formats to stdout {reason}. '
                                'The formats will be streamed one after the other')
                        fname = temp_filename
                        for f in requested_formats:
                            new_info = dict(info_dict)
                            del new_info['requested_formats']
                            new_info.update(f)
                            if temp_filename != '-':
                                fname = prepend_extension(
                                    correct_ext(temp_filename, new_info['ext']),
                                    'f%s' % f['format_id'], new_info['ext'])
                                if not self._ensure_dir_exists(fname):
                                    return
                                f['filepath'] = fname
                                downloaded.append(fname)
                            partial_success, real_download = self.dl(fname, new_info)
                            info_dict['__real_download'] = info_dict['__real_download'] or real_download
                            success = success and partial_success
                        if merger.available and not self.params.get('allow_unplayable_formats'):
                            info_dict['__postprocessors'].append(merger)
                            info_dict['__files_to_merge'] = downloaded
                            # Even if there were no downloads, it is being merged only now
                            info_dict['__real_download'] = True
                        else:
                            for file in downloaded:
                                files_to_move[file] = None
                else:
                    # Just a single file
                    dl_filename = existing_file(full_filename, temp_filename)
                    if dl_filename is None or dl_filename == temp_filename:
                        # dl_filename == temp_filename could mean that the file was partially downloaded with --no-part.
                        # So we should try to resume the download
                        success, real_download = self.dl(temp_filename, info_dict)
                        info_dict['__real_download'] = real_download
                    else:
                        self.report_file_already_downloaded(dl_filename)

                dl_filename = dl_filename or temp_filename
                info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))

            except network_exceptions as err:
                self.report_error('unable to download video data: %s' % error_to_compat_str(err))
                return
            except (OSError, IOError) as err:
                raise UnavailableVideoError(err)
            except (ContentTooShortError, ) as err:
                self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                return

        if success and full_filename != '-':

            def fixup():
                # Queue ffmpeg-based fixup postprocessors (or just warn)
                # according to the 'fixup' policy
                do_fixup = True
                fixup_policy = self.params.get('fixup')
                vid = info_dict['id']

                if fixup_policy in ('ignore', 'never'):
                    return
                elif fixup_policy == 'warn':
                    do_fixup = False
                elif fixup_policy != 'force':
                    assert fixup_policy in ('detect_or_warn', None)
                    if not info_dict.get('__real_download'):
                        do_fixup = False

                def ffmpeg_fixup(cndn, msg, cls):
                    if not cndn:
                        return
                    if not do_fixup:
                        self.report_warning(f'{vid}: {msg}')
                        return
                    pp = cls(self)
                    if pp.available:
                        info_dict['__postprocessors'].append(pp)
                    else:
                        self.report_warning(f'{vid}: {msg}. Install ffmpeg to fix this automatically')

                stretched_ratio = info_dict.get('stretched_ratio')
                ffmpeg_fixup(
                    stretched_ratio not in (1, None),
                    f'Non-uniform pixel ratio {stretched_ratio}',
                    FFmpegFixupStretchedPP)

                ffmpeg_fixup(
                    (info_dict.get('requested_formats') is None
                     and info_dict.get('container') == 'm4a_dash'
                     and info_dict.get('ext') == 'm4a'),
                    'writing DASH m4a. Only some players support this container',
                    FFmpegFixupM4aPP)

                downloader = (get_suitable_downloader(info_dict, self.params).__name__
                              if 'protocol' in info_dict else None)
                ffmpeg_fixup(info_dict.get('requested_formats') is None and downloader == 'HlsFD',
                             'malformed AAC bitstream detected', FFmpegFixupM3u8PP)
                ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed timestamps detected', FFmpegFixupTimestampPP)
                ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed duration detected', FFmpegFixupDurationPP)

            fixup()
            try:
                info_dict = self.post_process(dl_filename, info_dict, files_to_move)
            except PostProcessingError as err:
                self.report_error('Postprocessing: %s' % str(err))
                return
            try:
                for ph in self._post_hooks:
                    ph(info_dict['filepath'])
            except Exception as err:
                self.report_error('post hooks: %s' % str(err))
                return
            must_record_download_archive = True

        if must_record_download_archive or self.params.get('force_write_download_archive', False):
            self.record_download_archive(info_dict)
        max_downloads = self.params.get('max_downloads')
        if max_downloads is not None and self._num_downloads >= int(max_downloads):
            raise MaxDownloadsReached()
2866
    def download(self, url_list):
        """Download a given list of URLs."""
        outtmpl = self.outtmpl_dict['default']
        # Multiple URLs with a static output template (no '%' fields) would
        # all be written to the same file
        if (len(url_list) > 1
                and outtmpl != '-'
                and '%' not in outtmpl
                and self.params.get('max_downloads') != 1):
            raise SameFileError(outtmpl)

        for url in url_list:
            try:
                # It also downloads the videos
                res = self.extract_info(
                    url, force_generic_extractor=self.params.get('force_generic_extractor', False))
            except UnavailableVideoError:
                self.report_error('unable to download video')
            except MaxDownloadsReached:
                self.to_screen('[info] Maximum number of downloads reached')
                raise
            except ExistingVideoReached:
                self.to_screen('[info] Encountered a video that is already in the archive, stopping due to --break-on-existing')
                raise
            except RejectedVideoReached:
                self.to_screen('[info] Encountered a video that did not match filter, stopping due to --break-on-reject')
                raise
            else:
                if self.params.get('dump_single_json', False):
                    self.post_extract(res)
                    self.to_stdout(json.dumps(self.sanitize_info(res)))

        return self._download_retcode
2898
    def download_with_info_file(self, info_filename):
        """Download using the info dict stored in a previously written
        info-json file, falling back to re-extracting from 'webpage_url'
        if that fails."""
        with contextlib.closing(fileinput.FileInput(
                [info_filename], mode='r',
                openhook=fileinput.hook_encoded('utf-8'))) as f:
            # FileInput doesn't have a read method, we can't call json.load
            info = self.sanitize_info(json.loads('\n'.join(f)), self.params.get('clean_infojson', True))
        try:
            self.process_ie_result(info, download=True)
        except (DownloadError, EntryNotInPlaylist, ThrottledDownload):
            webpage_url = info.get('webpage_url')
            if webpage_url is not None:
                self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
                return self.download([webpage_url])
            else:
                raise
        return self._download_retcode
2915
2916 @staticmethod
2917 def sanitize_info(info_dict, remove_private_keys=False):
2918 ''' Sanitize the infodict for converting to json '''
2919 if info_dict is None:
2920 return info_dict
2921 info_dict.setdefault('epoch', int(time.time()))
2922 remove_keys = {'__original_infodict'} # Always remove this since this may contain a copy of the entire dict
2923 keep_keys = ['_type'], # Always keep this to facilitate load-info-json
2924 if remove_private_keys:
2925 remove_keys |= {
2926 'requested_formats', 'requested_subtitles', 'requested_entries',
2927 'filepath', 'entries', 'original_url', 'playlist_autonumber',
2928 }
2929 empty_values = (None, {}, [], set(), tuple())
2930 reject = lambda k, v: k not in keep_keys and (
2931 k.startswith('_') or k in remove_keys or v in empty_values)
2932 else:
2933 reject = lambda k, v: k in remove_keys
2934 filter_fn = lambda obj: (
2935 list(map(filter_fn, obj)) if isinstance(obj, (LazyList, list, tuple, set))
2936 else obj if not isinstance(obj, dict)
2937 else dict((k, filter_fn(v)) for k, v in obj.items() if not reject(k, v)))
2938 return filter_fn(info_dict)
2939
2940 @staticmethod
2941 def filter_requested_info(info_dict, actually_filter=True):
2942 ''' Alias of sanitize_info for backward compatibility '''
2943 return YoutubeDL.sanitize_info(info_dict, actually_filter)
2944
    def run_pp(self, pp, infodict):
        """Run a single postprocessor `pp` on `infodict` and return the
        (possibly replaced) info dict, deleting or scheduling the move of
        any intermediate files the PP reports."""
        files_to_delete = []
        if '__files_to_move' not in infodict:
            infodict['__files_to_move'] = {}
        try:
            files_to_delete, infodict = pp.run(infodict)
        except PostProcessingError as e:
            # Must be True and not 'only_download'
            if self.params.get('ignoreerrors') is True:
                self.report_error(e)
                return infodict
            raise

        if not files_to_delete:
            return infodict
        if self.params.get('keepvideo', False):
            # With -k, schedule the intermediate files to be moved along
            # instead of deleting them
            for f in files_to_delete:
                infodict['__files_to_move'].setdefault(f, '')
        else:
            for old_filename in set(files_to_delete):
                self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
                try:
                    os.remove(encodeFilename(old_filename))
                except (IOError, OSError):
                    self.report_warning('Unable to remove downloaded original file')
                # A deleted file must not be moved later on
                if old_filename in infodict['__files_to_move']:
                    del infodict['__files_to_move'][old_filename]
        return infodict
2973
2974 @staticmethod
2975 def post_extract(info_dict):
2976 def actual_post_extract(info_dict):
2977 if info_dict.get('_type') in ('playlist', 'multi_video'):
2978 for video_dict in info_dict.get('entries', {}):
2979 actual_post_extract(video_dict or {})
2980 return
2981
2982 post_extractor = info_dict.get('__post_extractor') or (lambda: {})
2983 extra = post_extractor().items()
2984 info_dict.update(extra)
2985 info_dict.pop('__post_extractor', None)
2986
2987 original_infodict = info_dict.get('__original_infodict') or {}
2988 original_infodict.update(extra)
2989 original_infodict.pop('__post_extractor', None)
2990
2991 actual_post_extract(info_dict or {})
2992
2993 def pre_process(self, ie_info, key='pre_process', files_to_move=None):
2994 info = dict(ie_info)
2995 info['__files_to_move'] = files_to_move or {}
2996 for pp in self._pps[key]:
2997 info = self.run_pp(pp, info)
2998 return info, info.pop('__files_to_move', None)
2999
3000 def post_process(self, filename, ie_info, files_to_move=None):
3001 """Run all the postprocessors on the given file."""
3002 info = dict(ie_info)
3003 info['filepath'] = filename
3004 info['__files_to_move'] = files_to_move or {}
3005
3006 for pp in ie_info.get('__postprocessors', []) + self._pps['post_process']:
3007 info = self.run_pp(pp, info)
3008 info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
3009 del info['__files_to_move']
3010 for pp in self._pps['after_move']:
3011 info = self.run_pp(pp, info)
3012 return info
3013
3014 def _make_archive_id(self, info_dict):
3015 video_id = info_dict.get('id')
3016 if not video_id:
3017 return
3018 # Future-proof against any change in case
3019 # and backwards compatibility with prior versions
3020 extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist
3021 if extractor is None:
3022 url = str_or_none(info_dict.get('url'))
3023 if not url:
3024 return
3025 # Try to find matching extractor for the URL and take its ie_key
3026 for ie_key, ie in self._ies.items():
3027 if ie.suitable(url):
3028 extractor = ie_key
3029 break
3030 else:
3031 return
3032 return '%s %s' % (extractor.lower(), video_id)
3033
3034 def in_download_archive(self, info_dict):
3035 fn = self.params.get('download_archive')
3036 if fn is None:
3037 return False
3038
3039 vid_id = self._make_archive_id(info_dict)
3040 if not vid_id:
3041 return False # Incomplete video information
3042
3043 return vid_id in self.archive
3044
3045 def record_download_archive(self, info_dict):
3046 fn = self.params.get('download_archive')
3047 if fn is None:
3048 return
3049 vid_id = self._make_archive_id(info_dict)
3050 assert vid_id
3051 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
3052 archive_file.write(vid_id + '\n')
3053 self.archive.add(vid_id)
3054
3055 @staticmethod
3056 def format_resolution(format, default='unknown'):
3057 if format.get('vcodec') == 'none' and format.get('acodec') != 'none':
3058 return 'audio only'
3059 if format.get('resolution') is not None:
3060 return format['resolution']
3061 if format.get('width') and format.get('height'):
3062 res = '%dx%d' % (format['width'], format['height'])
3063 elif format.get('height'):
3064 res = '%sp' % format['height']
3065 elif format.get('width'):
3066 res = '%dx?' % format['width']
3067 else:
3068 res = default
3069 if format.get('vcodec') == 'none' and format.get('acodec') == 'none':
3070 res += ' (images)'
3071 return res
3072
    def _format_note(self, fdict):
        """Build the free-form 'note' column for a format dict, as used by
        the old-style (non-table) format listing."""
        res = ''
        if fdict.get('ext') in ['f4f', 'f4m']:
            res += '(unsupported) '
        if fdict.get('language'):
            if res:
                res += ' '
            res += '[%s] ' % fdict['language']
        if fdict.get('format_note') is not None:
            res += fdict['format_note'] + ' '
        if fdict.get('tbr') is not None:
            res += '%4dk ' % fdict['tbr']
        if fdict.get('container') is not None:
            if res:
                res += ', '
            res += '%s container' % fdict['container']
        if (fdict.get('vcodec') is not None
                and fdict.get('vcodec') != 'none'):
            if res:
                res += ', '
            res += fdict['vcodec']
            if fdict.get('vbr') is not None:
                # The bitrate value itself is appended below
                res += '@'
        elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
            res += 'video@'
        if fdict.get('vbr') is not None:
            res += '%4dk' % fdict['vbr']
        if fdict.get('fps') is not None:
            if res:
                res += ', '
            res += '%sfps' % fdict['fps']
        if fdict.get('acodec') is not None:
            if res:
                res += ', '
            if fdict['acodec'] == 'none':
                res += 'video only'
            else:
                res += '%-5s' % fdict['acodec']
        elif fdict.get('abr') is not None:
            # Audio bitrate known but codec unknown
            if res:
                res += ', '
            res += 'audio'
        if fdict.get('abr') is not None:
            res += '@%3dk' % fdict['abr']
        if fdict.get('asr') is not None:
            res += ' (%5dHz)' % fdict['asr']
        if fdict.get('filesize') is not None:
            if res:
                res += ', '
            res += format_bytes(fdict['filesize'])
        elif fdict.get('filesize_approx') is not None:
            if res:
                res += ', '
            res += '~' + format_bytes(fdict['filesize_approx'])
        return res
3128
    def list_formats(self, info_dict):
        """Print the table of available formats for a video (--list-formats)."""
        formats = info_dict.get('formats', [info_dict])
        # The new table layout can be disabled via compat_opts or
        # --no-list-formats-as-table
        new_format = (
            'list-formats' not in self.params.get('compat_opts', [])
            and self.params.get('listformats_table', True) is not False)
        if new_format:
            table = [
                [
                    format_field(f, 'format_id'),
                    format_field(f, 'ext'),
                    self.format_resolution(f),
                    format_field(f, 'fps', '%d'),
                    '|',
                    format_field(f, 'filesize', ' %s', func=format_bytes) + format_field(f, 'filesize_approx', '~%s', func=format_bytes),
                    format_field(f, 'tbr', '%4dk'),
                    shorten_protocol_name(f.get('protocol', '').replace("native", "n")),
                    '|',
                    format_field(f, 'vcodec', default='unknown').replace('none', ''),
                    format_field(f, 'vbr', '%4dk'),
                    format_field(f, 'acodec', default='unknown').replace('none', ''),
                    format_field(f, 'abr', '%3dk'),
                    format_field(f, 'asr', '%5dHz'),
                    ', '.join(filter(None, (
                        'UNSUPPORTED' if f.get('ext') in ('f4f', 'f4m') else '',
                        format_field(f, 'language', '[%s]'),
                        format_field(f, 'format_note'),
                        format_field(f, 'container', ignore=(None, f.get('ext'))),
                    ))),
                ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
            header_line = ['ID', 'EXT', 'RESOLUTION', 'FPS', '|', ' FILESIZE', ' TBR', 'PROTO',
                           '|', 'VCODEC', ' VBR', 'ACODEC', ' ABR', ' ASR', 'MORE INFO']
        else:
            table = [
                [
                    format_field(f, 'format_id'),
                    format_field(f, 'ext'),
                    self.format_resolution(f),
                    self._format_note(f)]
                for f in formats
                if f.get('preference') is None or f['preference'] >= -1000]
            header_line = ['format code', 'extension', 'resolution', 'note']

        self.to_screen(
            '[info] Available formats for %s:' % info_dict['id'])
        self.to_stdout(render_table(
            header_line, table, delim=new_format, extraGap=(0 if new_format else 1), hideEmpty=new_format))
3175
3176 def list_thumbnails(self, info_dict):
3177 thumbnails = list(info_dict.get('thumbnails'))
3178 if not thumbnails:
3179 self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
3180 return
3181
3182 self.to_screen(
3183 '[info] Thumbnails for %s:' % info_dict['id'])
3184 self.to_stdout(render_table(
3185 ['ID', 'width', 'height', 'URL'],
3186 [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
3187
3188 def list_subtitles(self, video_id, subtitles, name='subtitles'):
3189 if not subtitles:
3190 self.to_screen('%s has no %s' % (video_id, name))
3191 return
3192 self.to_screen(
3193 'Available %s for %s:' % (name, video_id))
3194
3195 def _row(lang, formats):
3196 exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats)))
3197 if len(set(names)) == 1:
3198 names = [] if names[0] == 'unknown' else names[:1]
3199 return [lang, ', '.join(names), ', '.join(exts)]
3200
3201 self.to_stdout(render_table(
3202 ['Language', 'Name', 'Formats'],
3203 [_row(lang, formats) for lang, formats in subtitles.items()],
3204 hideEmpty=True))
3205
3206 def urlopen(self, req):
3207 """ Start an HTTP download """
3208 if isinstance(req, compat_basestring):
3209 req = sanitized_Request(req)
3210 return self._opener.open(req, timeout=self._socket_timeout)
3211
    def print_debug_header(self):
        """Write verbose debug information (encodings, versions, plugins,
        optional libraries, proxy map) when --verbose is given."""
        if not self.params.get('verbose'):
            return

        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        write_string(encoding_str, encoding=None)

        source = detect_variant()
        self._write_string('[debug] yt-dlp version %s%s\n' % (__version__, '' if source == 'unknown' else f' ({source})'))
        if _LAZY_LOADER:
            self._write_string('[debug] Lazy loading extractors enabled\n')
        if plugin_extractors or plugin_postprocessors:
            self._write_string('[debug] Plugins: %s\n' % [
                '%s%s' % (klass.__name__, '' if klass.__name__ == name else f' as {name}')
                for name, klass in itertools.chain(plugin_extractors.items(), plugin_postprocessors.items())])
        if self.params.get('compat_opts'):
            self._write_string(
                '[debug] Compatibility options: %s\n' % ', '.join(self.params.get('compat_opts')))
        try:
            # Best-effort: only meaningful when running from a git checkout
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = process_communicate_or_kill(sp)
            out = out.decode().strip()
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: %s\n' % out)
        except Exception:
            try:
                # sys.exc_clear only exists on Python 2; harmless no-op here
                sys.exc_clear()
            except Exception:
                pass

        def python_implementation():
            impl_name = platform.python_implementation()
            if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'):
                return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3]
            return impl_name

        self._write_string('[debug] Python version %s (%s %s) - %s\n' % (
            platform.python_version(),
            python_implementation(),
            platform.architecture()[0],
            platform_name()))

        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        exe_versions['phantomjs'] = PhantomJSwrapper._version()
        exe_str = ', '.join(
            f'{exe} {v}' for exe, v in sorted(exe_versions.items()) if v
        ) or 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        from .downloader.websocket import has_websockets
        from .postprocessor.embedthumbnail import has_mutagen
        from .cookies import SQLITE_AVAILABLE, KEYRING_AVAILABLE

        lib_str = ', '.join(sorted(filter(None, (
            compat_pycrypto_AES and compat_pycrypto_AES.__name__.split('.')[0],
            has_websockets and 'websockets',
            has_mutagen and 'mutagen',
            SQLITE_AVAILABLE and 'sqlite',
            KEYRING_AVAILABLE and 'keyring',
        )))) or 'none'
        self._write_string('[debug] Optional libraries: %s\n' % lib_str)

        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        if self.params.get('call_home', False):
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            return
            # NOTE(review): the code below is unreachable due to the `return`
            # above — the yt-dl.org version check appears deliberately
            # disabled; confirm before removing
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
3302
    def _setup_opener(self):
        """Build the urllib opener used for all HTTP(S) requests.

        Reads 'socket_timeout', 'cookiefile'/'cookiesfrombrowser', 'proxy'
        and 'debug_printtraffic' from self.params.

        Side effects: sets self._socket_timeout, self.cookiejar and
        self._opener.
        """
        timeout_val = self.params.get('socket_timeout')
        # Default timeout is 10 minutes; float() also accepts string values
        self._socket_timeout = 600 if timeout_val is None else float(timeout_val)

        opts_cookiesfrombrowser = self.params.get('cookiesfrombrowser')
        opts_cookiefile = self.params.get('cookiefile')
        opts_proxy = self.params.get('proxy')

        self.cookiejar = load_cookies(opts_cookiefile, opts_cookiesfrombrowser, self)

        cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
        if opts_proxy is not None:
            # An explicit empty string disables proxying entirely; any other
            # value is applied to both http and https
            if opts_proxy == '':
                proxies = {}
            else:
                proxies = {'http': opts_proxy, 'https': opts_proxy}
        else:
            # No explicit option: fall back to environment proxies (HTTP_PROXY etc.)
            proxies = compat_urllib_request.getproxies()
            # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805)
            if 'http' in proxies and 'https' not in proxies:
                proxies['https'] = proxies['http']
        proxy_handler = PerRequestProxyHandler(proxies)

        debuglevel = 1 if self.params.get('debug_printtraffic') else 0
        https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
        ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
        redirect_handler = YoutubeDLRedirectHandler()
        data_handler = compat_urllib_request_DataHandler()

        # When passing our own FileHandler instance, build_opener won't add the
        # default FileHandler and allows us to disable the file protocol, which
        # can be used for malicious purposes (see
        # https://github.com/ytdl-org/youtube-dl/issues/8227)
        file_handler = compat_urllib_request.FileHandler()

        def file_open(*args, **kwargs):
            raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in yt-dlp for security reasons')
        file_handler.file_open = file_open

        opener = compat_urllib_request.build_opener(
            proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        self._opener = opener
3350
3351 def encode(self, s):
3352 if isinstance(s, bytes):
3353 return s # Already encoded
3354
3355 try:
3356 return s.encode(self.get_encoding())
3357 except UnicodeEncodeError as err:
3358 err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
3359 raise
3360
3361 def get_encoding(self):
3362 encoding = self.params.get('encoding')
3363 if encoding is None:
3364 encoding = preferredencoding()
3365 return encoding
3366
3367 def _write_info_json(self, label, ie_result, infofn):
3368 ''' Write infojson and returns True = written, False = skip, None = error '''
3369 if not self.params.get('writeinfojson'):
3370 return False
3371 elif not infofn:
3372 self.write_debug(f'Skipping writing {label} infojson')
3373 return False
3374 elif not self._ensure_dir_exists(infofn):
3375 return None
3376 elif not self.params.get('overwrites', True) and os.path.exists(infofn):
3377 self.to_screen(f'[info] {label.title()} metadata is already present')
3378 else:
3379 self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}')
3380 try:
3381 write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn)
3382 except (OSError, IOError):
3383 self.report_error(f'Cannot write {label} metadata to JSON file {infofn}')
3384 return None
3385 return True
3386
3387 def _write_description(self, label, ie_result, descfn):
3388 ''' Write description and returns True = written, False = skip, None = error '''
3389 if not self.params.get('writedescription'):
3390 return False
3391 elif not descfn:
3392 self.write_debug(f'Skipping writing {label} description')
3393 return False
3394 elif not self._ensure_dir_exists(descfn):
3395 return None
3396 elif not self.params.get('overwrites', True) and os.path.exists(descfn):
3397 self.to_screen(f'[info] {label.title()} description is already present')
3398 elif ie_result.get('description') is None:
3399 self.report_warning(f'There\'s no {label} description to write')
3400 return False
3401 else:
3402 try:
3403 self.to_screen(f'[info] Writing {label} description to: {descfn}')
3404 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
3405 descfile.write(ie_result['description'])
3406 except (OSError, IOError):
3407 self.report_error(f'Cannot write {label} description file {descfn}')
3408 return None
3409 return True
3410
    def _write_subtitles(self, info_dict, filename):
        ''' Write subtitles to file and return list of (sub_filename, final_sub_filename); or None if error'''
        ret = []
        subtitles = info_dict.get('requested_subtitles')
        if not subtitles or not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')):
            # subtitles download errors are already managed as troubles in relevant IE
            # that way it will silently go on when used with unsupporting IE
            return ret

        sub_filename_base = self.prepare_filename(info_dict, 'subtitle')
        if not sub_filename_base:
            self.to_screen('[info] Skipping writing video subtitles')
            return ret
        for sub_lang, sub_info in subtitles.items():
            sub_format = sub_info['ext']
            # sub_filename is derived from the media filename; sub_filename_final
            # from the 'subtitle' output template (they may differ)
            sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
            sub_filename_final = subtitles_filename(sub_filename_base, sub_lang, sub_format, info_dict.get('ext'))
            if not self.params.get('overwrites', True) and os.path.exists(sub_filename):
                # Keep the existing file but still record it in the result list
                self.to_screen(f'[info] Video subtitle {sub_lang}.{sub_format} is already present')
                sub_info['filepath'] = sub_filename
                ret.append((sub_filename, sub_filename_final))
                continue

            self.to_screen(f'[info] Writing video subtitles to: {sub_filename}')
            if sub_info.get('data') is not None:
                # Subtitle content was extracted inline; write it directly
                try:
                    # Use newline='' to prevent conversion of newline characters
                    # See https://github.com/ytdl-org/youtube-dl/issues/10268
                    with io.open(sub_filename, 'w', encoding='utf-8', newline='') as subfile:
                        subfile.write(sub_info['data'])
                    sub_info['filepath'] = sub_filename
                    ret.append((sub_filename, sub_filename_final))
                    continue
                except (OSError, IOError):
                    # A local write failure is fatal for the whole call (None = error)
                    self.report_error(f'Cannot write video subtitles file {sub_filename}')
                    return None

            try:
                # No inline data: download the subtitle like a regular media file,
                # copying the dict so the original sub_info keeps its headers intact
                sub_copy = sub_info.copy()
                sub_copy.setdefault('http_headers', info_dict.get('http_headers'))
                self.dl(sub_filename, sub_copy, subtitle=True)
                sub_info['filepath'] = sub_filename
                ret.append((sub_filename, sub_filename_final))
            except (ExtractorError, IOError, OSError, ValueError) + network_exceptions as err:
                # Download failures are non-fatal: warn and continue with the next language
                self.report_warning(f'Unable to download video subtitles for {sub_lang!r}: {err}')
                continue
        return ret
3458
3459 def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None):
3460 ''' Write thumbnails to file and return list of (thumb_filename, final_thumb_filename) '''
3461 write_all = self.params.get('write_all_thumbnails', False)
3462 thumbnails, ret = [], []
3463 if write_all or self.params.get('writethumbnail', False):
3464 thumbnails = info_dict.get('thumbnails') or []
3465 multiple = write_all and len(thumbnails) > 1
3466
3467 if thumb_filename_base is None:
3468 thumb_filename_base = filename
3469 if thumbnails and not thumb_filename_base:
3470 self.write_debug(f'Skipping writing {label} thumbnail')
3471 return ret
3472
3473 for t in thumbnails[::-1]:
3474 thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg')
3475 thumb_display_id = f'{label} thumbnail' + (f' {t["id"]}' if multiple else '')
3476 thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext'))
3477 thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext'))
3478
3479 if not self.params.get('overwrites', True) and os.path.exists(thumb_filename):
3480 ret.append((thumb_filename, thumb_filename_final))
3481 t['filepath'] = thumb_filename
3482 self.to_screen(f'[info] {thumb_display_id.title()} is already present')
3483 else:
3484 self.to_screen(f'[info] Downloading {thumb_display_id} ...')
3485 try:
3486 uf = self.urlopen(t['url'])
3487 self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}')
3488 with open(encodeFilename(thumb_filename), 'wb') as thumbf:
3489 shutil.copyfileobj(uf, thumbf)
3490 ret.append((thumb_filename, thumb_filename_final))
3491 t['filepath'] = thumb_filename
3492 except network_exceptions as err:
3493 self.report_warning(f'Unable to download {thumb_display_id}: {err}')
3494 if ret and not write_all:
3495 break
3496 return ret