]> jfr.im git - yt-dlp.git/blob - yt_dlp/YoutubeDL.py
Handle more playlist errors with `-i`
[yt-dlp.git] / yt_dlp / YoutubeDL.py
1 #!/usr/bin/env python3
2 # coding: utf-8
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import copy
9 import datetime
10 import errno
11 import fileinput
12 import io
13 import itertools
14 import json
15 import locale
16 import operator
17 import os
18 import platform
19 import re
20 import shutil
21 import subprocess
22 import sys
23 import tempfile
24 import time
25 import tokenize
26 import traceback
27 import random
28
29 from string import ascii_letters
30 from zipimport import zipimporter
31
32 from .compat import (
33 compat_basestring,
34 compat_get_terminal_size,
35 compat_kwargs,
36 compat_numeric_types,
37 compat_os_name,
38 compat_shlex_quote,
39 compat_str,
40 compat_tokenize_tokenize,
41 compat_urllib_error,
42 compat_urllib_request,
43 compat_urllib_request_DataHandler,
44 )
45 from .cookies import load_cookies
46 from .utils import (
47 age_restricted,
48 args_to_str,
49 ContentTooShortError,
50 date_from_str,
51 DateRange,
52 DEFAULT_OUTTMPL,
53 determine_ext,
54 determine_protocol,
55 DOT_DESKTOP_LINK_TEMPLATE,
56 DOT_URL_LINK_TEMPLATE,
57 DOT_WEBLOC_LINK_TEMPLATE,
58 DownloadError,
59 encode_compat_str,
60 encodeFilename,
61 EntryNotInPlaylist,
62 error_to_compat_str,
63 ExistingVideoReached,
64 expand_path,
65 ExtractorError,
66 float_or_none,
67 format_bytes,
68 format_field,
69 STR_FORMAT_RE_TMPL,
70 STR_FORMAT_TYPES,
71 formatSeconds,
72 GeoRestrictedError,
73 HEADRequest,
74 int_or_none,
75 iri_to_uri,
76 ISO3166Utils,
77 LazyList,
78 locked_file,
79 make_dir,
80 make_HTTPS_handler,
81 MaxDownloadsReached,
82 network_exceptions,
83 orderedSet,
84 OUTTMPL_TYPES,
85 PagedList,
86 parse_filesize,
87 PerRequestProxyHandler,
88 platform_name,
89 PostProcessingError,
90 preferredencoding,
91 prepend_extension,
92 process_communicate_or_kill,
93 register_socks_protocols,
94 RejectedVideoReached,
95 render_table,
96 replace_extension,
97 SameFileError,
98 sanitize_filename,
99 sanitize_path,
100 sanitize_url,
101 sanitized_Request,
102 std_headers,
103 str_or_none,
104 strftime_or_none,
105 subtitles_filename,
106 ThrottledDownload,
107 to_high_limit_path,
108 traverse_obj,
109 try_get,
110 UnavailableVideoError,
111 url_basename,
112 variadic,
113 version_tuple,
114 write_json_file,
115 write_string,
116 YoutubeDLCookieProcessor,
117 YoutubeDLHandler,
118 YoutubeDLRedirectHandler,
119 )
120 from .cache import Cache
121 from .extractor import (
122 gen_extractor_classes,
123 get_info_extractor,
124 _LAZY_LOADER,
125 _PLUGIN_CLASSES
126 )
127 from .extractor.openload import PhantomJSwrapper
128 from .downloader import (
129 FFmpegFD,
130 get_suitable_downloader,
131 shorten_protocol_name
132 )
133 from .downloader.rtmp import rtmpdump_version
134 from .postprocessor import (
135 get_postprocessor,
136 FFmpegFixupDurationPP,
137 FFmpegFixupM3u8PP,
138 FFmpegFixupM4aPP,
139 FFmpegFixupStretchedPP,
140 FFmpegFixupTimestampPP,
141 FFmpegMergerPP,
142 FFmpegPostProcessor,
143 MoveFilesAfterDownloadPP,
144 )
145 from .version import __version__
146
147 if compat_os_name == 'nt':
148 import ctypes
149
150
151 class YoutubeDL(object):
152 """YoutubeDL class.
153
154 YoutubeDL objects are the ones responsible of downloading the
155 actual video file and writing it to disk if the user has requested
156 it, among some other tasks. In most cases there should be one per
157 program. As, given a video URL, the downloader doesn't know how to
158 extract all the needed information, task that InfoExtractors do, it
159 has to pass the URL to one of them.
160
161 For this, YoutubeDL objects have a method that allows
162 InfoExtractors to be registered in a given order. When it is passed
163 a URL, the YoutubeDL object handles it to the first InfoExtractor it
164 finds that reports being able to handle it. The InfoExtractor extracts
165 all the information about the video or videos the URL refers to, and
166 YoutubeDL process the extracted information, possibly using a File
167 Downloader to download the video.
168
169 YoutubeDL objects accept a lot of parameters. In order not to saturate
170 the object constructor with arguments, it receives a dictionary of
171 options instead. These options are available through the params
172 attribute for the InfoExtractors to use. The YoutubeDL also
173 registers itself as the downloader in charge for the InfoExtractors
174 that are added to it, so this is a "mutual registration".
175
176 Available options:
177
178 username: Username for authentication purposes.
179 password: Password for authentication purposes.
180 videopassword: Password for accessing a video.
181 ap_mso: Adobe Pass multiple-system operator identifier.
182 ap_username: Multiple-system operator account username.
183 ap_password: Multiple-system operator account password.
184 usenetrc: Use netrc for authentication instead.
185 verbose: Print additional info to stdout.
186 quiet: Do not print messages to stdout.
187 no_warnings: Do not print out anything for warnings.
188 forceprint: A list of templates to force print
189 forceurl: Force printing final URL. (Deprecated)
190 forcetitle: Force printing title. (Deprecated)
191 forceid: Force printing ID. (Deprecated)
192 forcethumbnail: Force printing thumbnail URL. (Deprecated)
193 forcedescription: Force printing description. (Deprecated)
194 forcefilename: Force printing final filename. (Deprecated)
195 forceduration: Force printing duration. (Deprecated)
196 forcejson: Force printing info_dict as JSON.
197 dump_single_json: Force printing the info_dict of the whole playlist
198 (or video) as a single JSON line.
199 force_write_download_archive: Force writing download archive regardless
200 of 'skip_download' or 'simulate'.
201 simulate: Do not download the video files. If unset (or None),
202 simulate only if listsubtitles, listformats or list_thumbnails is used
203 format: Video format code. see "FORMAT SELECTION" for more details.
204 allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded.
205 ignore_no_formats_error: Ignore "No video formats" error. Useful for
206 extracting metadata even if the video is not actually
207 available for download (experimental)
208 format_sort: How to sort the video formats. see "Sorting Formats"
209 for more details.
210 format_sort_force: Force the given format_sort. see "Sorting Formats"
211 for more details.
212 allow_multiple_video_streams: Allow multiple video streams to be merged
213 into a single file
214 allow_multiple_audio_streams: Allow multiple audio streams to be merged
215 into a single file
216 check_formats Whether to test if the formats are downloadable.
217 Can be True (check all), False (check none)
218 or None (check only if requested by extractor)
219 paths: Dictionary of output paths. The allowed keys are 'home'
220 'temp' and the keys of OUTTMPL_TYPES (in utils.py)
221 outtmpl: Dictionary of templates for output names. Allowed keys
222 are 'default' and the keys of OUTTMPL_TYPES (in utils.py).
223 For compatibility with youtube-dl, a single string can also be used
224 outtmpl_na_placeholder: Placeholder for unavailable meta fields.
225 restrictfilenames: Do not allow "&" and spaces in file names
226 trim_file_name: Limit length of filename (extension excluded)
227 windowsfilenames: Force the filenames to be windows compatible
228 ignoreerrors: Do not stop on download errors
229 (Default True when running yt-dlp,
230 but False when directly accessing YoutubeDL class)
231 skip_playlist_after_errors: Number of allowed failures until the rest of
232 the playlist is skipped
233 force_generic_extractor: Force downloader to use the generic extractor
234 overwrites: Overwrite all video and metadata files if True,
235 overwrite only non-video files if None
236 and don't overwrite any file if False
237 For compatibility with youtube-dl,
238 "nooverwrites" may also be used instead
239 playliststart: Playlist item to start at.
240 playlistend: Playlist item to end at.
241 playlist_items: Specific indices of playlist to download.
242 playlistreverse: Download playlist items in reverse order.
243 playlistrandom: Download playlist items in random order.
244 matchtitle: Download only matching titles.
245 rejecttitle: Reject downloads for matching titles.
246 logger: Log messages to a logging.Logger instance.
247 logtostderr: Log messages to stderr instead of stdout.
248 writedescription: Write the video description to a .description file
249 writeinfojson: Write the video description to a .info.json file
250 clean_infojson: Remove private fields from the infojson
251 getcomments: Extract video comments. This will not be written to disk
252 unless writeinfojson is also given
253 writeannotations: Write the video annotations to a .annotations.xml file
254 writethumbnail: Write the thumbnail image to a file
255 allow_playlist_files: Whether to write playlists' description, infojson etc
256 also to disk when using the 'write*' options
257 write_all_thumbnails: Write all thumbnail formats to files
258 writelink: Write an internet shortcut file, depending on the
259 current platform (.url/.webloc/.desktop)
260 writeurllink: Write a Windows internet shortcut file (.url)
261 writewebloclink: Write a macOS internet shortcut file (.webloc)
262 writedesktoplink: Write a Linux internet shortcut file (.desktop)
263 writesubtitles: Write the video subtitles to a file
264 writeautomaticsub: Write the automatically generated subtitles to a file
265 allsubtitles: Deprecated - Use subtitleslangs = ['all']
266 Downloads all the subtitles of the video
267 (requires writesubtitles or writeautomaticsub)
268 listsubtitles: Lists all available subtitles for the video
269 subtitlesformat: The format code for subtitles
270 subtitleslangs: List of languages of the subtitles to download (can be regex).
271 The list may contain "all" to refer to all the available
272 subtitles. The language can be prefixed with a "-" to
273 exclude it from the requested languages. Eg: ['all', '-live_chat']
274 keepvideo: Keep the video file after post-processing
275 daterange: A DateRange object, download only if the upload_date is in the range.
276 skip_download: Skip the actual download of the video file
277 cachedir: Location of the cache files in the filesystem.
278 False to disable filesystem cache.
279 noplaylist: Download single video instead of a playlist if in doubt.
280 age_limit: An integer representing the user's age in years.
281 Unsuitable videos for the given age are skipped.
282 min_views: An integer representing the minimum view count the video
283 must have in order to not be skipped.
284 Videos without view count information are always
285 downloaded. None for no limit.
286 max_views: An integer representing the maximum view count.
287 Videos that are more popular than that are not
288 downloaded.
289 Videos without view count information are always
290 downloaded. None for no limit.
291 download_archive: File name of a file where all downloads are recorded.
292 Videos already present in the file are not downloaded
293 again.
294 break_on_existing: Stop the download process after attempting to download a
295 file that is in the archive.
296 break_on_reject: Stop the download process when encountering a video that
297 has been filtered out.
298 cookiefile: File name where cookies should be read from and dumped to
299 cookiesfrombrowser: A tuple containing the name of the browser and the profile
300 name/path from where cookies are loaded.
301 Eg: ('chrome', ) or ('vivaldi', 'default')
302 nocheckcertificate:Do not verify SSL certificates
303 prefer_insecure: Use HTTP instead of HTTPS to retrieve information.
304 At the moment, this is only supported by YouTube.
305 proxy: URL of the proxy server to use
306 geo_verification_proxy: URL of the proxy to use for IP address verification
307 on geo-restricted sites.
308 socket_timeout: Time to wait for unresponsive hosts, in seconds
309 bidi_workaround: Work around buggy terminals without bidirectional text
310 support, using fribidi
311 debug_printtraffic:Print out sent and received HTTP traffic
312 include_ads: Download ads as well
313 default_search: Prepend this string if an input url is not valid.
314 'auto' for elaborate guessing
315 encoding: Use this encoding instead of the system-specified.
316 extract_flat: Do not resolve URLs, return the immediate result.
317 Pass in 'in_playlist' to only show this behavior for
318 playlist items.
319 postprocessors: A list of dictionaries, each with an entry
320 * key: The name of the postprocessor. See
321 yt_dlp/postprocessor/__init__.py for a list.
322 * when: When to run the postprocessor. Can be one of
323 pre_process|before_dl|post_process|after_move.
324 Assumed to be 'post_process' if not given
325 post_hooks: A list of functions that get called as the final step
326 for each video file, after all postprocessors have been
327 called. The filename will be passed as the only argument.
328 progress_hooks: A list of functions that get called on download
329 progress, with a dictionary with the entries
330 * status: One of "downloading", "error", or "finished".
331 Check this first and ignore unknown values.
332 * info_dict: The extracted info_dict
333
334 If status is one of "downloading", or "finished", the
335 following properties may also be present:
336 * filename: The final filename (always present)
337 * tmpfilename: The filename we're currently writing to
338 * downloaded_bytes: Bytes on disk
339 * total_bytes: Size of the whole file, None if unknown
340 * total_bytes_estimate: Guess of the eventual file size,
341 None if unavailable.
342 * elapsed: The number of seconds since download started.
343 * eta: The estimated time in seconds, None if unknown
344 * speed: The download speed in bytes/second, None if
345 unknown
346 * fragment_index: The counter of the currently
347 downloaded video fragment.
348 * fragment_count: The number of fragments (= individual
349 files that will be merged)
350
351 Progress hooks are guaranteed to be called at least once
352 (with status "finished") if the download is successful.
353 merge_output_format: Extension to use when merging formats.
354 final_ext: Expected final extension; used to detect when the file was
355 already downloaded and converted. "merge_output_format" is
356 replaced by this extension when given
357 fixup: Automatically correct known faults of the file.
358 One of:
359 - "never": do nothing
360 - "warn": only emit a warning
361 - "detect_or_warn": check whether we can do anything
362 about it, warn otherwise (default)
363 source_address: Client-side IP address to bind to.
364 call_home: Boolean, true iff we are allowed to contact the
365 yt-dlp servers for debugging. (BROKEN)
366 sleep_interval_requests: Number of seconds to sleep between requests
367 during extraction
368 sleep_interval: Number of seconds to sleep before each download when
369 used alone or a lower bound of a range for randomized
370 sleep before each download (minimum possible number
371 of seconds to sleep) when used along with
372 max_sleep_interval.
373 max_sleep_interval:Upper bound of a range for randomized sleep before each
374 download (maximum possible number of seconds to sleep).
375 Must only be used along with sleep_interval.
376 Actual sleep time will be a random float from range
377 [sleep_interval; max_sleep_interval].
378 sleep_interval_subtitles: Number of seconds to sleep before each subtitle download
379 listformats: Print an overview of available video formats and exit.
380 list_thumbnails: Print a table of all thumbnails and exit.
381 match_filter: A function that gets called with the info_dict of
382 every video.
383 If it returns a message, the video is ignored.
384 If it returns None, the video is downloaded.
385 match_filter_func in utils.py is one example for this.
386 no_color: Do not emit color codes in output.
387 geo_bypass: Bypass geographic restriction via faking X-Forwarded-For
388 HTTP header
389 geo_bypass_country:
390 Two-letter ISO 3166-2 country code that will be used for
391 explicit geographic restriction bypassing via faking
392 X-Forwarded-For HTTP header
393 geo_bypass_ip_block:
394 IP range in CIDR notation that will be used similarly to
395 geo_bypass_country
396
397 The following options determine which downloader is picked:
398 external_downloader: A dictionary of protocol keys and the executable of the
399 external downloader to use for it. The allowed protocols
400 are default|http|ftp|m3u8|dash|rtsp|rtmp|mms.
401 Set the value to 'native' to use the native downloader
402 hls_prefer_native: Deprecated - Use external_downloader = {'m3u8': 'native'}
403 or {'m3u8': 'ffmpeg'} instead.
404 Use the native HLS downloader instead of ffmpeg/avconv
405 if True, otherwise use ffmpeg/avconv if False, otherwise
406 use downloader suggested by extractor if None.
407 compat_opts: Compatibility options. See "Differences in default behavior".
408 The following options do not work when used through the API:
409 filename, abort-on-error, multistreams, no-live-chat,
410 no-clean-infojson, no-playlist-metafiles, no-keep-subs.
411 Refer __init__.py for their implementation
412
413 The following parameters are not used by YoutubeDL itself, they are used by
414 the downloader (see yt_dlp/downloader/common.py):
415 nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize,
416 max_filesize, test, noresizebuffer, retries, continuedl, noprogress, consoletitle,
417 xattr_set_filesize, external_downloader_args, hls_use_mpegts, http_chunk_size.
418
419 The following options are used by the post processors:
420 prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available,
421 otherwise prefer ffmpeg. (avconv support is deprecated)
422 ffmpeg_location: Location of the ffmpeg/avconv binary; either the path
423 to the binary or its containing directory.
424 postprocessor_args: A dictionary of postprocessor/executable keys (in lower case)
425 and a list of additional command-line arguments for the
426 postprocessor/executable. The dict can also have "PP+EXE" keys
427 which are used when the given exe is used by the given PP.
428 Use 'default' as the name for arguments to passed to all PP
429 For compatibility with youtube-dl, a single list of args
430 can also be used
431
432 The following options are used by the extractors:
433 extractor_retries: Number of times to retry for known errors
434 dynamic_mpd: Whether to process dynamic DASH manifests (default: True)
435 hls_split_discontinuity: Split HLS playlists to different formats at
436 discontinuities such as ad breaks (default: False)
437 extractor_args: A dictionary of arguments to be passed to the extractors.
438 See "EXTRACTOR ARGUMENTS" for details.
439 Eg: {'youtube': {'skip': ['dash', 'hls']}}
440 youtube_include_dash_manifest: Deprecated - Use extractor_args instead.
441 If True (default), DASH manifests and related
442 data will be downloaded and processed by extractor.
443 You can reduce network I/O by disabling it if you don't
444 care about DASH. (only for youtube)
445 youtube_include_hls_manifest: Deprecated - Use extractor_args instead.
446 If True (default), HLS manifests and related
447 data will be downloaded and processed by extractor.
448 You can reduce network I/O by disabling it if you don't
449 care about HLS. (only for youtube)
450 """
451
    # info_dict fields that hold numeric values — presumably used elsewhere to
    # coerce template/metadata values to numbers (TODO confirm against the
    # consumer of this set, which is outside this chunk)
    _NUMERIC_FIELDS = set((
        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
        'timestamp', 'upload_year', 'upload_month', 'upload_day',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
        'playlist_index',
    ))

    # Class-level defaults; each is re-bound per instance in __init__
    params = None  # options dictionary passed to the constructor
    _ies = {}  # registered InfoExtractors, keyed by ie_key
    _pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}  # postprocessors per stage
    _printed_messages = set()  # messages already written with only_once=True
    _first_webpage_request = True
    _download_retcode = None  # becomes 0 in __init__; set to 1 by trouble() on ignored errors
    _num_downloads = None
    _playlist_level = 0
    _playlist_urls = set()
    _screen_file = None  # stdout or stderr, depending on 'logtostderr'
473
    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options.

        params:    Dictionary of options (see the class docstring). Copied into
                   self.params on top of the built-in defaults.
        auto_init: When True, print the debug header and register the default
                   InfoExtractors immediately.
        """
        if params is None:
            params = {}
        # Per-instance re-initialization of the class-level containers
        self._ies = {}
        self._ies_instances = {}
        self._pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
        self._printed_messages = set()
        self._first_webpage_request = True
        self._post_hooks = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # Index with the boolean: False -> stdout, True -> stderr
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = {
            # Default parameters
            'nocheckcertificate': False,
        }
        self.params.update(params)
        self.cache = Cache(self)

        if sys.version_info < (3, 6):
            self.report_warning(
                'Python version %d.%d is not supported! Please update to Python 3.6 or above' % sys.version_info[:2])

        if self.params.get('allow_unplayable_formats'):
            self.report_warning(
                'You have asked for unplayable formats to be listed/downloaded. '
                'This is a developer option intended for debugging. '
                'If you experience any issues while using this option, DO NOT open a bug report')

        # Warn once per deprecated option; returns whether the option was set
        def check_deprecated(param, option, suggestion):
            if self.params.get(param) is not None:
                self.report_warning('%s is deprecated. Use %s instead' % (option, suggestion))
                return True
            return False

        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
        check_deprecated('useid', '--id', '-o "%(id)s.%(ext)s"')

        for msg in self.params.get('warnings', []):
            self.report_warning(msg)

        # Keep 'overwrites' and 'nooverwrites' mutually consistent
        if self.params.get('overwrites') is None:
            self.params.pop('overwrites', None)
        elif self.params.get('nooverwrites') is not None:
            # nooverwrites was unnecessarily changed to overwrites
            # in 0c3d0f51778b153f65c21906031c2e091fcfb641
            # This ensures compatibility with both keys
            self.params['overwrites'] = not self.params['nooverwrites']
        else:
            self.params['nooverwrites'] = not self.params['overwrites']

        if params.get('bidi_workaround', False):
            try:
                import pty
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    # Prefer 'bidiv'; fall back to 'fribidi' if it is missing
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.platform != 'win32'
                and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
                and not params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        self.outtmpl_dict = self.parse_outtmpl()

        # Creating format selector here allows us to catch syntax errors before the extraction
        self.format_selector = (
            None if self.params.get('format') is None
            else self.build_format_selector(self.params['format']))

        self._setup_opener()

        """Preload the archive, if any is specified"""
        def preload_download_archive(fn):
            # Returns True if an archive file was read; missing files are tolerated
            if fn is None:
                return False
            self.write_debug('Loading archive file %r\n' % fn)
            try:
                with locked_file(fn, 'r', encoding='utf-8') as archive_file:
                    for line in archive_file:
                        self.archive.add(line.strip())
            except IOError as ioe:
                if ioe.errno != errno.ENOENT:
                    raise
                return False
            return True

        self.archive = set()
        preload_download_archive(self.params.get('download_archive'))

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        # Instantiate and register configured postprocessors
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_def = dict(pp_def_raw)  # copy so the pops don't mutate the caller's dict
            when = pp_def.pop('when', 'post_process')
            pp_class = get_postprocessor(pp_def.pop('key'))
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp, when=when)

        for ph in self.params.get('post_hooks', []):
            self.add_post_hook(ph)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)

        register_socks_protocols()
615
616 def warn_if_short_id(self, argv):
617 # short YouTube ID starting with dash?
618 idxs = [
619 i for i, a in enumerate(argv)
620 if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
621 if idxs:
622 correct_argv = (
623 ['yt-dlp']
624 + [a for i, a in enumerate(argv) if i not in idxs]
625 + ['--'] + [argv[i] for i in idxs]
626 )
627 self.report_warning(
628 'Long argument string detected. '
629 'Use -- to separate parameters and URLs, like this:\n%s\n' %
630 args_to_str(correct_argv))
631
632 def add_info_extractor(self, ie):
633 """Add an InfoExtractor object to the end of the list."""
634 ie_key = ie.ie_key()
635 self._ies[ie_key] = ie
636 if not isinstance(ie, type):
637 self._ies_instances[ie_key] = ie
638 ie.set_downloader(self)
639
640 def _get_info_extractor_class(self, ie_key):
641 ie = self._ies.get(ie_key)
642 if ie is None:
643 ie = get_info_extractor(ie_key)
644 self.add_info_extractor(ie)
645 return ie
646
647 def get_info_extractor(self, ie_key):
648 """
649 Get an instance of an IE with name ie_key, it will try to get one from
650 the _ies list, if there's no instance it will create a new one and add
651 it to the extractor list.
652 """
653 ie = self._ies_instances.get(ie_key)
654 if ie is None:
655 ie = get_info_extractor(ie_key)()
656 self.add_info_extractor(ie)
657 return ie
658
659 def add_default_info_extractors(self):
660 """
661 Add the InfoExtractors returned by gen_extractors to the end of the list
662 """
663 for ie in gen_extractor_classes():
664 self.add_info_extractor(ie)
665
666 def add_post_processor(self, pp, when='post_process'):
667 """Add a PostProcessor object to the end of the chain."""
668 self._pps[when].append(pp)
669 pp.set_downloader(self)
670
671 def add_post_hook(self, ph):
672 """Add the post hook"""
673 self._post_hooks.append(ph)
674
675 def add_progress_hook(self, ph):
676 """Add the progress hook (currently only for the file downloader)"""
677 self._progress_hooks.append(ph)
678
679 def _bidi_workaround(self, message):
680 if not hasattr(self, '_output_channel'):
681 return message
682
683 assert hasattr(self, '_output_process')
684 assert isinstance(message, compat_str)
685 line_count = message.count('\n') + 1
686 self._output_process.stdin.write((message + '\n').encode('utf-8'))
687 self._output_process.stdin.flush()
688 res = ''.join(self._output_channel.readline().decode('utf-8')
689 for _ in range(line_count))
690 return res[:-len('\n')]
691
692 def _write_string(self, message, out=None, only_once=False):
693 if only_once:
694 if message in self._printed_messages:
695 return
696 self._printed_messages.add(message)
697 write_string(message, out=out, encoding=self.params.get('encoding'))
698
699 def to_stdout(self, message, skip_eol=False, quiet=False):
700 """Print message to stdout"""
701 if self.params.get('logger'):
702 self.params['logger'].debug(message)
703 elif not quiet or self.params.get('verbose'):
704 self._write_string(
705 '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')),
706 self._err_file if quiet else self._screen_file)
707
708 def to_stderr(self, message, only_once=False):
709 """Print message to stderr"""
710 assert isinstance(message, compat_str)
711 if self.params.get('logger'):
712 self.params['logger'].error(message)
713 else:
714 self._write_string('%s\n' % self._bidi_workaround(message), self._err_file, only_once=only_once)
715
    def to_console_title(self, message):
        # Set the terminal/console window title; no-op unless the
        # 'consoletitle' option is enabled.
        if not self.params.get('consoletitle', False):
            return
        if compat_os_name == 'nt':
            # Only touch the title when a real console window exists
            if ctypes.windll.kernel32.GetConsoleWindow():
                # c_wchar_p() might not be necessary if `message` is
                # already of type unicode()
                ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm OSC 0 escape sequence sets the window title
            self._write_string('\033]0;%s\007' % message, self._screen_file)
726
727 def save_console_title(self):
728 if not self.params.get('consoletitle', False):
729 return
730 if self.params.get('simulate'):
731 return
732 if compat_os_name != 'nt' and 'TERM' in os.environ:
733 # Save the title on stack
734 self._write_string('\033[22;0t', self._screen_file)
735
736 def restore_console_title(self):
737 if not self.params.get('consoletitle', False):
738 return
739 if self.params.get('simulate'):
740 return
741 if compat_os_name != 'nt' and 'TERM' in os.environ:
742 # Restore the title from stack
743 self._write_string('\033[23;0t', self._screen_file)
744
745 def __enter__(self):
746 self.save_console_title()
747 return self
748
749 def __exit__(self, *args):
750 self.restore_console_title()
751
752 if self.params.get('cookiefile') is not None:
753 self.cookiejar.save(ignore_discard=True, ignore_expires=True)
754
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # Wrapped exceptions carry the original exc_info — show it first
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    # Not inside an except block: show the current call stack instead
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            if tb:
                self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Prefer the wrapped exception's exc_info when available, so the
            # DownloadError points at the root cause
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        # Errors are being ignored: record failure via the return code
        self._download_retcode = 1
785
786 def to_screen(self, message, skip_eol=False):
787 """Print message to stdout if not in quiet mode"""
788 self.to_stdout(
789 message, skip_eol, quiet=self.params.get('quiet', False))
790
791 def report_warning(self, message, only_once=False):
792 '''
793 Print the message to stderr, it will be prefixed with 'WARNING:'
794 If stderr is a tty file the 'WARNING:' will be colored
795 '''
796 if self.params.get('logger') is not None:
797 self.params['logger'].warning(message)
798 else:
799 if self.params.get('no_warnings'):
800 return
801 if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
802 _msg_header = '\033[0;33mWARNING:\033[0m'
803 else:
804 _msg_header = 'WARNING:'
805 warning_message = '%s %s' % (_msg_header, message)
806 self.to_stderr(warning_message, only_once)
807
808 def report_error(self, message, tb=None):
809 '''
810 Do the same as trouble, but prefixes the message with 'ERROR:', colored
811 in red if stderr is a tty file.
812 '''
813 if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
814 _msg_header = '\033[0;31mERROR:\033[0m'
815 else:
816 _msg_header = 'ERROR:'
817 error_message = '%s %s' % (_msg_header, message)
818 self.trouble(error_message, tb)
819
820 def write_debug(self, message, only_once=False):
821 '''Log debug message or Print message to stderr'''
822 if not self.params.get('verbose', False):
823 return
824 message = '[debug] %s' % message
825 if self.params.get('logger'):
826 self.params['logger'].debug(message)
827 else:
828 self.to_stderr(message, only_once)
829
830 def report_file_already_downloaded(self, file_name):
831 """Report file has already been fully downloaded."""
832 try:
833 self.to_screen('[download] %s has already been downloaded' % file_name)
834 except UnicodeEncodeError:
835 self.to_screen('[download] The file has already been downloaded')
836
837 def report_file_delete(self, file_name):
838 """Report that existing file will be deleted."""
839 try:
840 self.to_screen('Deleting existing file %s' % file_name)
841 except UnicodeEncodeError:
842 self.to_screen('Deleting existing file')
843
844 def raise_no_formats(self, info, forced=False):
845 has_drm = info.get('__has_drm')
846 msg = 'This video is DRM protected' if has_drm else 'No video formats found!'
847 expected = self.params.get('ignore_no_formats_error')
848 if forced or not expected:
849 raise ExtractorError(msg, video_id=info['id'], ie=info['extractor'],
850 expected=has_drm or expected)
851 else:
852 self.report_warning(msg)
853
854 def parse_outtmpl(self):
855 outtmpl_dict = self.params.get('outtmpl', {})
856 if not isinstance(outtmpl_dict, dict):
857 outtmpl_dict = {'default': outtmpl_dict}
858 outtmpl_dict.update({
859 k: v for k, v in DEFAULT_OUTTMPL.items()
860 if not outtmpl_dict.get(k)})
861 for key, val in outtmpl_dict.items():
862 if isinstance(val, bytes):
863 self.report_warning(
864 'Parameter outtmpl is bytes, but should be a unicode string. '
865 'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.')
866 return outtmpl_dict
867
868 def get_output_path(self, dir_type='', filename=None):
869 paths = self.params.get('paths', {})
870 assert isinstance(paths, dict)
871 path = os.path.join(
872 expand_path(paths.get('home', '').strip()),
873 expand_path(paths.get(dir_type, '').strip()) if dir_type else '',
874 filename or '')
875
876 # Temporary fix for #4787
877 # 'Treat' all problem characters by passing filename through preferredencoding
878 # to workaround encoding issues with subprocess on python2 @ Windows
879 if sys.version_info < (3, 0) and sys.platform == 'win32':
880 path = encodeFilename(path, True).decode(preferredencoding())
881 return sanitize_path(path, force=self.params.get('windowsfilenames'))
882
883 @staticmethod
884 def _outtmpl_expandpath(outtmpl):
885 # expand_path translates '%%' into '%' and '$$' into '$'
886 # correspondingly that is not what we want since we need to keep
887 # '%%' intact for template dict substitution step. Working around
888 # with boundary-alike separator hack.
889 sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
890 outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
891
892 # outtmpl should be expand_path'ed before template dict substitution
893 # because meta fields may contain env variables we don't want to
894 # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
895 # title "Hello $PATH", we don't want `$PATH` to be expanded.
896 return expand_path(outtmpl).replace(sep, '')
897
898 @staticmethod
899 def escape_outtmpl(outtmpl):
900 ''' Escape any remaining strings like %s, %abc% etc. '''
901 return re.sub(
902 STR_FORMAT_RE_TMPL.format('', '(?![%(\0])'),
903 lambda mobj: ('' if mobj.group('has_key') else '%') + mobj.group(0),
904 outtmpl)
905
906 @classmethod
907 def validate_outtmpl(cls, outtmpl):
908 ''' @return None or Exception object '''
909 outtmpl = re.sub(
910 STR_FORMAT_RE_TMPL.format('[^)]*', '[ljq]'),
911 lambda mobj: f'{mobj.group(0)[:-1]}s',
912 cls._outtmpl_expandpath(outtmpl))
913 try:
914 cls.escape_outtmpl(outtmpl) % collections.defaultdict(int)
915 return None
916 except ValueError as err:
917 return err
918
    def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None):
        """ Make the template and info_dict suitable for substitution : ydl.outtmpl_escape(outtmpl) % info_dict """
        info_dict.setdefault('epoch', int(time.time()))  # keep epoch consistent once set

        info_dict = dict(info_dict)  # Do not sanitize so as not to consume LazyList
        for key in ('__original_infodict', '__postprocessors'):
            info_dict.pop(key, None)
        info_dict['duration_string'] = (  # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
            formatSeconds(info_dict['duration'], '-' if sanitize else ':')
            if info_dict.get('duration', None) is not None
            else None)
        info_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
        if info_dict.get('resolution') is None:
            info_dict['resolution'] = self.format_resolution(info_dict, default=None)

        # For fields playlist_index and autonumber convert all occurrences
        # of %(field)s to %(field)0Nd for backward compatibility
        field_size_compat_map = {
            'playlist_index': len(str(info_dict.get('_last_playlist_index') or '')),
            'autonumber': self.params.get('autonumber_size') or 5,
        }

        # Mapping of mangled keys -> resolved values; filled by create_key below
        TMPL_DICT = {}
        EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljq]'))
        MATH_FUNCTIONS = {
            '+': float.__add__,
            '-': float.__sub__,
        }
        # Field is of the form key1.key2...
        # where keys (except first) can be string, int or slice
        FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)')
        MATH_FIELD_RE = r'''{field}|{num}'''.format(field=FIELD_RE, num=r'-?\d+(?:.\d+)?')
        MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys()))
        # Full inner syntax: [-]field[+/-term...][>strftime][|default]
        INTERNAL_FORMAT_RE = re.compile(r'''(?x)
            (?P<negate>-)?
            (?P<fields>{field})
            (?P<maths>(?:{math_op}{math_field})*)
            (?:>(?P<strf_format>.+?))?
            (?:\|(?P<default>.*?))?
            $'''.format(field=FIELD_RE, math_op=MATH_OPERATORS_RE, math_field=MATH_FIELD_RE))

        def _traverse_infodict(k):
            # Resolve a dotted field path against info_dict
            k = k.split('.')
            if k[0] == '':
                k.pop(0)
            return traverse_obj(info_dict, k, is_user_input=True, traverse_string=True)

        def get_value(mdict):
            # Evaluate one parsed inner template (groupdict of INTERNAL_FORMAT_RE)
            # Object traversal
            value = _traverse_infodict(mdict['fields'])
            # Negative
            if mdict['negate']:
                value = float_or_none(value)
                if value is not None:
                    value *= -1
            # Do maths
            offset_key = mdict['maths']
            if offset_key:
                value = float_or_none(value)
                operator = None
                # Alternately consume an operator and an operand from the tail
                while offset_key:
                    item = re.match(
                        MATH_FIELD_RE if operator else MATH_OPERATORS_RE,
                        offset_key).group(0)
                    offset_key = offset_key[len(item):]
                    if operator is None:
                        operator = MATH_FUNCTIONS[item]
                        continue
                    item, multiplier = (item[1:], -1) if item[0] == '-' else (item, 1)
                    offset = float_or_none(item)
                    if offset is None:
                        # Operand is itself a field name, not a literal number
                        offset = float_or_none(_traverse_infodict(item))
                    try:
                        value = operator(value, multiplier * offset)
                    except (TypeError, ZeroDivisionError):
                        return None
                    operator = None
            # Datetime formatting
            if mdict['strf_format']:
                value = strftime_or_none(value, mdict['strf_format'])

            return value

        na = self.params.get('outtmpl_na_placeholder', 'NA')

        def _dumpjson_default(obj):
            # Allow %(...)j to serialize sets and LazyLists as JSON arrays
            if isinstance(obj, (set, LazyList)):
                return list(obj)
            raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')

        def create_key(outer_mobj):
            # Substitution callback: resolves one outer %(...)X occurrence,
            # stores its value in TMPL_DICT under a NUL-mangled key, and
            # rewrites the template to reference that key
            if not outer_mobj.group('has_key'):
                return f'%{outer_mobj.group(0)}'
            key = outer_mobj.group('key')
            mobj = re.match(INTERNAL_FORMAT_RE, key)
            if mobj is None:
                value, default, mobj = None, na, {'fields': ''}
            else:
                mobj = mobj.groupdict()
                default = mobj['default'] if mobj['default'] is not None else na
                value = get_value(mobj)

            fmt = outer_mobj.group('format')
            if fmt == 's' and value is not None and key in field_size_compat_map.keys():
                fmt = '0{:d}d'.format(field_size_compat_map[key])

            value = default if value is None else value

            str_fmt = f'{fmt[:-1]}s'
            if fmt[-1] == 'l':
                value, fmt = ', '.join(variadic(value)), str_fmt
            elif fmt[-1] == 'j':
                value, fmt = json.dumps(value, default=_dumpjson_default), str_fmt
            elif fmt[-1] == 'q':
                value, fmt = compat_shlex_quote(str(value)), str_fmt
            elif fmt[-1] == 'c':
                value = str(value)
                # NOTE(review): str(value) is never None, so this branch
                # appears unreachable — confirm intent
                if value is None:
                    value, fmt = default, 's'
                else:
                    value = value[0]
            elif fmt[-1] not in 'rs':  # numeric
                value = float_or_none(value)
                if value is None:
                    value, fmt = default, 's'

            if sanitize:
                if fmt[-1] == 'r':
                    # If value is an object, sanitize might convert it to a string
                    # So we convert it to repr first
                    value, fmt = repr(value), str_fmt
                if fmt[-1] in 'csr':
                    value = sanitize(mobj['fields'].split('.')[-1], value)

            # Mangle the key with NULs so it cannot collide with literal text
            key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format'))
            TMPL_DICT[key] = value
            return '{prefix}%({key}){fmt}'.format(key=key, fmt=fmt, prefix=outer_mobj.group('prefix'))

        return EXTERNAL_FORMAT_RE.sub(create_key, outtmpl), TMPL_DICT
1058
    def _prepare_filename(self, info_dict, tmpl_type='default'):
        # Expand the output template for tmpl_type using info_dict.
        # Returns the filename, or None if the template is invalid.
        try:
            # Per-field sanitizer; id-like fields are flagged so the
            # sanitizer can treat them specially
            sanitize = lambda k, v: sanitize_filename(
                compat_str(v),
                restricted=self.params.get('restrictfilenames'),
                is_id=(k == 'id' or k.endswith('_id')))
            outtmpl = self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default'])
            outtmpl, template_dict = self.prepare_outtmpl(outtmpl, info_dict, sanitize)
            outtmpl = self.escape_outtmpl(self._outtmpl_expandpath(outtmpl))
            filename = outtmpl % template_dict

            # Some template types (e.g. infojson, thumbnail) carry a fixed extension
            force_ext = OUTTMPL_TYPES.get(tmpl_type)
            if force_ext is not None:
                filename = replace_extension(filename, force_ext, info_dict.get('ext'))

            # https://github.com/blackjack4494/youtube-dlc/issues/85
            trim_file_name = self.params.get('trim_file_name', False)
            if trim_file_name:
                fn_groups = filename.rsplit('.')
                ext = fn_groups[-1]
                sub_ext = ''
                if len(fn_groups) > 2:
                    sub_ext = fn_groups[-2]
                # NOTE(review): rsplit('.') with no maxsplit splits on every
                # dot but only fn_groups[0] is kept, so names containing more
                # than two dots lose their middle segments — confirm intended
                filename = '.'.join(filter(None, [fn_groups[0][:trim_file_name], sub_ext, ext]))

            return filename
        except ValueError as err:
            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
            return None
1088
1089 def prepare_filename(self, info_dict, dir_type='', warn=False):
1090 """Generate the output filename."""
1091
1092 filename = self._prepare_filename(info_dict, dir_type or 'default')
1093
1094 if warn:
1095 if not self.params.get('paths'):
1096 pass
1097 elif filename == '-':
1098 self.report_warning('--paths is ignored when an outputting to stdout', only_once=True)
1099 elif os.path.isabs(filename):
1100 self.report_warning('--paths is ignored since an absolute path is given in output template', only_once=True)
1101 if filename == '-' or not filename:
1102 return filename
1103
1104 return self.get_output_path(dir_type, filename)
1105
    def _match_entry(self, info_dict, incomplete=False, silent=False):
        """ Returns None if the file should be downloaded """

        video_title = info_dict.get('title', info_dict.get('id', 'video'))

        def check_filter():
            # Returns a human-readable reason for skipping, or None to accept
            if 'title' in info_dict:
                # This can happen when we're just evaluating the playlist
                title = info_dict['title']
                matchtitle = self.params.get('matchtitle', False)
                if matchtitle:
                    if not re.search(matchtitle, title, re.IGNORECASE):
                        return '"' + title + '" title did not match pattern "' + matchtitle + '"'
                rejecttitle = self.params.get('rejecttitle', False)
                if rejecttitle:
                    if re.search(rejecttitle, title, re.IGNORECASE):
                        return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
            date = info_dict.get('upload_date')
            if date is not None:
                dateRange = self.params.get('daterange', DateRange())
                if date not in dateRange:
                    return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
            view_count = info_dict.get('view_count')
            if view_count is not None:
                min_views = self.params.get('min_views')
                if min_views is not None and view_count < min_views:
                    return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
                max_views = self.params.get('max_views')
                if max_views is not None and view_count > max_views:
                    return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
            if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
                return 'Skipping "%s" because it is age restricted' % video_title

            match_filter = self.params.get('match_filter')
            if match_filter is not None:
                try:
                    # New-style filters accept the `incomplete` keyword
                    ret = match_filter(info_dict, incomplete=incomplete)
                except TypeError:
                    # For backward compatibility
                    ret = None if incomplete else match_filter(info_dict)
                if ret is not None:
                    return ret
            return None

        # Archive check takes precedence over the user filters
        if self.in_download_archive(info_dict):
            reason = '%s has already been recorded in the archive' % video_title
            break_opt, break_err = 'break_on_existing', ExistingVideoReached
        else:
            reason = check_filter()
            break_opt, break_err = 'break_on_reject', RejectedVideoReached
        if reason is not None:
            if not silent:
                self.to_screen('[download] ' + reason)
            # With --break-on-existing / --break-on-reject, abort the whole run
            if self.params.get(break_opt, False):
                raise break_err()
        return reason
1162
1163 @staticmethod
1164 def add_extra_info(info_dict, extra_info):
1165 '''Set the keys from extra_info in info dict if they are missing'''
1166 for key, value in extra_info.items():
1167 info_dict.setdefault(key, value)
1168
    def extract_info(self, url, download=True, ie_key=None, extra_info=None,
                     process=True, force_generic_extractor=False):
        """
        Return a list with a dictionary for each video extracted.

        Arguments:
        url -- URL to extract

        Keyword arguments:
        download -- whether to download videos during extraction
        ie_key -- extractor key hint
        extra_info -- dictionary containing the extra values to add to each result
        process -- whether to resolve all unresolved references (URLs, playlist items),
                   must be True for download to work.
        force_generic_extractor -- force using the generic extractor
        """

        if extra_info is None:
            extra_info = {}

        if not ie_key and force_generic_extractor:
            ie_key = 'Generic'

        if ie_key:
            # Only try the explicitly requested extractor
            ies = {ie_key: self._get_info_extractor_class(ie_key)}
        else:
            ies = self._ies

        for ie_key, ie in ies.items():
            if not ie.suitable(url):
                continue

            if not ie.working():
                self.report_warning('The program functionality for this site has been marked as broken, '
                                    'and will probably not work.')

            # Skip extraction entirely if the id derivable from the URL alone
            # is already in the download archive
            temp_id = ie.get_temp_id(url)
            if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
                self.to_screen("[%s] %s: has already been recorded in archive" % (
                    ie_key, temp_id))
                # break skips the for-else below; returns None implicitly
                break
            return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process)
        else:
            # for-else: runs only when no suitable extractor was found
            self.report_error('no suitable InfoExtractor for URL %s' % url)
1213
    def __handle_extraction_exceptions(func):
        # Decorator applied to extraction methods of this class: turns the
        # various extraction exceptions into user-facing reports, honouring
        # the 'ignoreerrors' param. Note: `func` is the wrapped method;
        # there is no `self` at this level.

        def wrapper(self, *args, **kwargs):
            try:
                return func(self, *args, **kwargs)
            except GeoRestrictedError as e:
                msg = e.msg
                if e.countries:
                    msg += '\nThis video is available in %s.' % ', '.join(
                        map(ISO3166Utils.short2full, e.countries))
                msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
                self.report_error(msg)
            except ExtractorError as e:  # An error we somewhat expected
                self.report_error(compat_str(e), e.format_traceback())
            except ThrottledDownload:
                self.to_stderr('\r')
                self.report_warning('The download speed is below throttle limit. Re-extracting data')
                # Retry by calling the wrapper itself again (recursion)
                return wrapper(self, *args, **kwargs)
            except (MaxDownloadsReached, ExistingVideoReached, RejectedVideoReached, LazyList.IndexError):
                # Control-flow exceptions: always propagate to the caller
                raise
            except Exception as e:
                if self.params.get('ignoreerrors', False):
                    self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
                else:
                    raise
        return wrapper
1240
1241 @__handle_extraction_exceptions
1242 def __extract_info(self, url, ie, download, extra_info, process):
1243 ie_result = ie.extract(url)
1244 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
1245 return
1246 if isinstance(ie_result, list):
1247 # Backwards compatibility: old IE result format
1248 ie_result = {
1249 '_type': 'compat_list',
1250 'entries': ie_result,
1251 }
1252 if extra_info.get('original_url'):
1253 ie_result.setdefault('original_url', extra_info['original_url'])
1254 self.add_default_extra_info(ie_result, ie, url)
1255 if process:
1256 return self.process_ie_result(ie_result, download, extra_info)
1257 else:
1258 return ie_result
1259
1260 def add_default_extra_info(self, ie_result, ie, url):
1261 if url is not None:
1262 self.add_extra_info(ie_result, {
1263 'webpage_url': url,
1264 'original_url': url,
1265 'webpage_url_basename': url_basename(url),
1266 })
1267 if ie is not None:
1268 self.add_extra_info(ie_result, {
1269 'extractor': ie.IE_NAME,
1270 'extractor_key': ie.ie_key(),
1271 })
1272
1273 def process_ie_result(self, ie_result, download=True, extra_info=None):
1274 """
1275 Take the result of the ie(may be modified) and resolve all unresolved
1276 references (URLs, playlist items).
1277
1278 It will also download the videos if 'download'.
1279 Returns the resolved ie_result.
1280 """
1281 if extra_info is None:
1282 extra_info = {}
1283 result_type = ie_result.get('_type', 'video')
1284
1285 if result_type in ('url', 'url_transparent'):
1286 ie_result['url'] = sanitize_url(ie_result['url'])
1287 if ie_result.get('original_url'):
1288 extra_info.setdefault('original_url', ie_result['original_url'])
1289
1290 extract_flat = self.params.get('extract_flat', False)
1291 if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
1292 or extract_flat is True):
1293 info_copy = ie_result.copy()
1294 self.add_extra_info(info_copy, extra_info)
1295 ie = try_get(ie_result.get('ie_key'), self.get_info_extractor)
1296 self.add_default_extra_info(info_copy, ie, ie_result['url'])
1297 self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True)
1298 return ie_result
1299
1300 if result_type == 'video':
1301 self.add_extra_info(ie_result, extra_info)
1302 ie_result = self.process_video_result(ie_result, download=download)
1303 additional_urls = (ie_result or {}).get('additional_urls')
1304 if additional_urls:
1305 # TODO: Improve MetadataParserPP to allow setting a list
1306 if isinstance(additional_urls, compat_str):
1307 additional_urls = [additional_urls]
1308 self.to_screen(
1309 '[info] %s: %d additional URL(s) requested' % (ie_result['id'], len(additional_urls)))
1310 self.write_debug('Additional URLs: "%s"' % '", "'.join(additional_urls))
1311 ie_result['additional_entries'] = [
1312 self.extract_info(
1313 url, download, extra_info,
1314 force_generic_extractor=self.params.get('force_generic_extractor'))
1315 for url in additional_urls
1316 ]
1317 return ie_result
1318 elif result_type == 'url':
1319 # We have to add extra_info to the results because it may be
1320 # contained in a playlist
1321 return self.extract_info(
1322 ie_result['url'], download,
1323 ie_key=ie_result.get('ie_key'),
1324 extra_info=extra_info)
1325 elif result_type == 'url_transparent':
1326 # Use the information from the embedding page
1327 info = self.extract_info(
1328 ie_result['url'], ie_key=ie_result.get('ie_key'),
1329 extra_info=extra_info, download=False, process=False)
1330
1331 # extract_info may return None when ignoreerrors is enabled and
1332 # extraction failed with an error, don't crash and return early
1333 # in this case
1334 if not info:
1335 return info
1336
1337 force_properties = dict(
1338 (k, v) for k, v in ie_result.items() if v is not None)
1339 for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
1340 if f in force_properties:
1341 del force_properties[f]
1342 new_result = info.copy()
1343 new_result.update(force_properties)
1344
1345 # Extracted info may not be a video result (i.e.
1346 # info.get('_type', 'video') != video) but rather an url or
1347 # url_transparent. In such cases outer metadata (from ie_result)
1348 # should be propagated to inner one (info). For this to happen
1349 # _type of info should be overridden with url_transparent. This
1350 # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
1351 if new_result.get('_type') == 'url':
1352 new_result['_type'] = 'url_transparent'
1353
1354 return self.process_ie_result(
1355 new_result, download=download, extra_info=extra_info)
1356 elif result_type in ('playlist', 'multi_video'):
1357 # Protect from infinite recursion due to recursively nested playlists
1358 # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
1359 webpage_url = ie_result['webpage_url']
1360 if webpage_url in self._playlist_urls:
1361 self.to_screen(
1362 '[download] Skipping already downloaded playlist: %s'
1363 % ie_result.get('title') or ie_result.get('id'))
1364 return
1365
1366 self._playlist_level += 1
1367 self._playlist_urls.add(webpage_url)
1368 self._sanitize_thumbnails(ie_result)
1369 try:
1370 return self.__process_playlist(ie_result, download)
1371 finally:
1372 self._playlist_level -= 1
1373 if not self._playlist_level:
1374 self._playlist_urls.clear()
1375 elif result_type == 'compat_list':
1376 self.report_warning(
1377 'Extractor %s returned a compat_list result. '
1378 'It needs to be updated.' % ie_result.get('extractor'))
1379
1380 def _fixup(r):
1381 self.add_extra_info(r, {
1382 'extractor': ie_result['extractor'],
1383 'webpage_url': ie_result['webpage_url'],
1384 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1385 'extractor_key': ie_result['extractor_key'],
1386 })
1387 return r
1388 ie_result['entries'] = [
1389 self.process_ie_result(_fixup(r), download, extra_info)
1390 for r in ie_result['entries']
1391 ]
1392 return ie_result
1393 else:
1394 raise Exception('Invalid result type: %s' % result_type)
1395
1396 def _ensure_dir_exists(self, path):
1397 return make_dir(path, self.report_error)
1398
1399 def __process_playlist(self, ie_result, download):
1400 # We process each entry in the playlist
1401 playlist = ie_result.get('title') or ie_result.get('id')
1402 self.to_screen('[download] Downloading playlist: %s' % playlist)
1403
1404 if 'entries' not in ie_result:
1405 raise EntryNotInPlaylist()
1406 incomplete_entries = bool(ie_result.get('requested_entries'))
1407 if incomplete_entries:
1408 def fill_missing_entries(entries, indexes):
1409 ret = [None] * max(*indexes)
1410 for i, entry in zip(indexes, entries):
1411 ret[i - 1] = entry
1412 return ret
1413 ie_result['entries'] = fill_missing_entries(ie_result['entries'], ie_result['requested_entries'])
1414
1415 playlist_results = []
1416
1417 playliststart = self.params.get('playliststart', 1)
1418 playlistend = self.params.get('playlistend')
1419 # For backwards compatibility, interpret -1 as whole list
1420 if playlistend == -1:
1421 playlistend = None
1422
1423 playlistitems_str = self.params.get('playlist_items')
1424 playlistitems = None
1425 if playlistitems_str is not None:
1426 def iter_playlistitems(format):
1427 for string_segment in format.split(','):
1428 if '-' in string_segment:
1429 start, end = string_segment.split('-')
1430 for item in range(int(start), int(end) + 1):
1431 yield int(item)
1432 else:
1433 yield int(string_segment)
1434 playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
1435
1436 ie_entries = ie_result['entries']
1437 msg = (
1438 'Downloading %d videos' if not isinstance(ie_entries, list)
1439 else 'Collected %d videos; downloading %%d of them' % len(ie_entries))
1440
1441 if isinstance(ie_entries, list):
1442 def get_entry(i):
1443 return ie_entries[i - 1]
1444 else:
1445 if not isinstance(ie_entries, PagedList):
1446 ie_entries = LazyList(ie_entries)
1447
1448 def get_entry(i):
1449 return YoutubeDL.__handle_extraction_exceptions(
1450 lambda self, i: ie_entries[i - 1]
1451 )(self, i)
1452
1453 entries = []
1454 for i in playlistitems or itertools.count(playliststart):
1455 if playlistitems is None and playlistend is not None and playlistend < i:
1456 break
1457 entry = None
1458 try:
1459 entry = get_entry(i)
1460 if entry is None:
1461 raise EntryNotInPlaylist()
1462 except (IndexError, EntryNotInPlaylist):
1463 if incomplete_entries:
1464 raise EntryNotInPlaylist()
1465 elif not playlistitems:
1466 break
1467 entries.append(entry)
1468 try:
1469 if entry is not None:
1470 self._match_entry(entry, incomplete=True, silent=True)
1471 except (ExistingVideoReached, RejectedVideoReached):
1472 break
1473 ie_result['entries'] = entries
1474
1475 # Save playlist_index before re-ordering
1476 entries = [
1477 ((playlistitems[i - 1] if playlistitems else i + playliststart - 1), entry)
1478 for i, entry in enumerate(entries, 1)
1479 if entry is not None]
1480 n_entries = len(entries)
1481
1482 if not playlistitems and (playliststart or playlistend):
1483 playlistitems = list(range(playliststart, playliststart + n_entries))
1484 ie_result['requested_entries'] = playlistitems
1485
1486 if self.params.get('allow_playlist_files', True):
1487 ie_copy = {
1488 'playlist': playlist,
1489 'playlist_id': ie_result.get('id'),
1490 'playlist_title': ie_result.get('title'),
1491 'playlist_uploader': ie_result.get('uploader'),
1492 'playlist_uploader_id': ie_result.get('uploader_id'),
1493 'playlist_index': 0,
1494 }
1495 ie_copy.update(dict(ie_result))
1496
1497 if self.params.get('writeinfojson', False):
1498 infofn = self.prepare_filename(ie_copy, 'pl_infojson')
1499 if not self._ensure_dir_exists(encodeFilename(infofn)):
1500 return
1501 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(infofn)):
1502 self.to_screen('[info] Playlist metadata is already present')
1503 else:
1504 self.to_screen('[info] Writing playlist metadata as JSON to: ' + infofn)
1505 try:
1506 write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn)
1507 except (OSError, IOError):
1508 self.report_error('Cannot write playlist metadata to JSON file ' + infofn)
1509
1510 # TODO: This should be passed to ThumbnailsConvertor if necessary
1511 self._write_thumbnails(ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail'))
1512
1513 if self.params.get('writedescription', False):
1514 descfn = self.prepare_filename(ie_copy, 'pl_description')
1515 if not self._ensure_dir_exists(encodeFilename(descfn)):
1516 return
1517 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(descfn)):
1518 self.to_screen('[info] Playlist description is already present')
1519 elif ie_result.get('description') is None:
1520 self.report_warning('There\'s no playlist description to write.')
1521 else:
1522 try:
1523 self.to_screen('[info] Writing playlist description to: ' + descfn)
1524 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1525 descfile.write(ie_result['description'])
1526 except (OSError, IOError):
1527 self.report_error('Cannot write playlist description file ' + descfn)
1528 return
1529
1530 if self.params.get('playlistreverse', False):
1531 entries = entries[::-1]
1532 if self.params.get('playlistrandom', False):
1533 random.shuffle(entries)
1534
1535 x_forwarded_for = ie_result.get('__x_forwarded_for_ip')
1536
1537 self.to_screen('[%s] playlist %s: %s' % (ie_result['extractor'], playlist, msg % n_entries))
1538 failures = 0
1539 max_failures = self.params.get('skip_playlist_after_errors') or float('inf')
1540 for i, entry_tuple in enumerate(entries, 1):
1541 playlist_index, entry = entry_tuple
1542 if 'playlist-index' in self.params.get('compat_opts', []):
1543 playlist_index = playlistitems[i - 1] if playlistitems else i + playliststart - 1
1544 self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
1545 # This __x_forwarded_for_ip thing is a bit ugly but requires
1546 # minimal changes
1547 if x_forwarded_for:
1548 entry['__x_forwarded_for_ip'] = x_forwarded_for
1549 extra = {
1550 'n_entries': n_entries,
1551 '_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries),
1552 'playlist_index': playlist_index,
1553 'playlist_autonumber': i,
1554 'playlist': playlist,
1555 'playlist_id': ie_result.get('id'),
1556 'playlist_title': ie_result.get('title'),
1557 'playlist_uploader': ie_result.get('uploader'),
1558 'playlist_uploader_id': ie_result.get('uploader_id'),
1559 'extractor': ie_result['extractor'],
1560 'webpage_url': ie_result['webpage_url'],
1561 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1562 'extractor_key': ie_result['extractor_key'],
1563 }
1564
1565 if self._match_entry(entry, incomplete=True) is not None:
1566 continue
1567
1568 entry_result = self.__process_iterable_entry(entry, download, extra)
1569 if not entry_result:
1570 failures += 1
1571 if failures >= max_failures:
1572 self.report_error(
1573 'Skipping the remaining entries in playlist "%s" since %d items failed extraction' % (playlist, failures))
1574 break
1575 # TODO: skip failed (empty) entries?
1576 playlist_results.append(entry_result)
1577 ie_result['entries'] = playlist_results
1578 self.to_screen('[download] Finished downloading playlist: %s' % playlist)
1579 return ie_result
1580
    @__handle_extraction_exceptions
    def __process_iterable_entry(self, entry, download, extra_info):
        # Thin wrapper so that errors from a single playlist entry are handled
        # by __handle_extraction_exceptions instead of aborting the playlist
        return self.process_ie_result(
            entry, download=download, extra_info=extra_info)
1585
    def _build_format_filter(self, filter_spec):
        " Returns a function to filter the formats according to the filter_spec "

        OPERATORS = {
            '<': operator.lt,
            '<=': operator.le,
            '>': operator.gt,
            '>=': operator.ge,
            '=': operator.eq,
            '!=': operator.ne,
        }
        # Numeric comparison, e.g. 'height<=720' or 'filesize>100M'
        operator_rex = re.compile(r'''(?x)\s*
            (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)\s*
            (?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
            (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s*
            ''' % '|'.join(map(re.escape, OPERATORS.keys())))
        m = operator_rex.fullmatch(filter_spec)
        if m:
            try:
                comparison_value = int(m.group('value'))
            except ValueError:
                # Not a plain integer: try parsing as a size ('100M', '1.2GiB')
                comparison_value = parse_filesize(m.group('value'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('value') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid value %r in format specification %r' % (
                            m.group('value'), filter_spec))
            op = OPERATORS[m.group('op')]

        if not m:
            # String comparison, e.g. 'vcodec^=avc1'; '!' before the operator negates
            STR_OPERATORS = {
                '=': operator.eq,
                '^=': lambda attr, value: attr.startswith(value),
                '$=': lambda attr, value: attr.endswith(value),
                '*=': lambda attr, value: value in attr,
            }
            str_operator_rex = re.compile(r'''(?x)\s*
                (?P<key>[a-zA-Z0-9._-]+)\s*
                (?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
                (?P<value>[a-zA-Z0-9._-]+)\s*
                ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
            m = str_operator_rex.fullmatch(filter_spec)
            if m:
                comparison_value = m.group('value')
                str_op = STR_OPERATORS[m.group('op')]
                if m.group('negation'):
                    op = lambda attr, value: not str_op(attr, value)
                else:
                    op = str_op

        if not m:
            raise SyntaxError('Invalid filter specification %r' % filter_spec)

        def _filter(f):
            actual_value = f.get(m.group('key'))
            if actual_value is None:
                # A trailing '?' keeps formats that are missing the key
                return m.group('none_inclusive')
            return op(actual_value, comparison_value)
        return _filter
1646
1647 def _default_format_spec(self, info_dict, download=True):
1648
1649 def can_merge():
1650 merger = FFmpegMergerPP(self)
1651 return merger.available and merger.can_merge()
1652
1653 prefer_best = (
1654 not self.params.get('simulate')
1655 and download
1656 and (
1657 not can_merge()
1658 or info_dict.get('is_live', False)
1659 or self.outtmpl_dict['default'] == '-'))
1660 compat = (
1661 prefer_best
1662 or self.params.get('allow_multiple_audio_streams', False)
1663 or 'format-spec' in self.params.get('compat_opts', []))
1664
1665 return (
1666 'best/bestvideo+bestaudio' if prefer_best
1667 else 'bestvideo*+bestaudio/best' if not compat
1668 else 'bestvideo+bestaudio/best')
1669
    def build_format_selector(self, format_spec):
        """Compile the --format specification *format_spec* into a selector.

        The returned callable maps a context dict
        ({'formats': [...], 'incomplete_formats': bool}) to an iterable of the
        chosen format dicts. The spec is tokenized with the stdlib tokenizer,
        parsed into a tree of FormatSelector nodes
        (SINGLE / GROUP / PICKFIRST / MERGE) and compiled bottom-up into
        nested generator functions.
        """
        def syntax_error(note, start):
            # Build (not raise) a SyntaxError whose message points at
            # column start[1] of the spec
            message = (
                'Invalid format specification: '
                '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
            return SyntaxError(message)

        # Node type tags for the parse tree
        PICKFIRST = 'PICKFIRST'
        MERGE = 'MERGE'
        SINGLE = 'SINGLE'
        GROUP = 'GROUP'
        FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])

        allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False),
                                  'video': self.params.get('allow_multiple_video_streams', False)}

        check_formats = self.params.get('check_formats')

        def _parse_filter(tokens):
            # Consume tokens up to (and including) the closing ']' and return
            # the raw filter text; returns None if the stream ends first
            filter_parts = []
            for type, string, start, _, _ in tokens:
                if type == tokenize.OP and string == ']':
                    return ''.join(filter_parts)
                else:
                    filter_parts.append(string)

        def _remove_unused_ops(tokens):
            # Remove operators that we don't use and join them with the surrounding strings
            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
            ALLOWED_OPS = ('/', '+', ',', '(', ')')
            last_string, last_start, last_end, last_line = None, None, None, None
            for type, string, start, end, line in tokens:
                if type == tokenize.OP and string == '[':
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                    # everything inside brackets will be handled by _parse_filter
                    for type, string, start, end, line in tokens:
                        yield type, string, start, end, line
                        if type == tokenize.OP and string == ']':
                            break
                elif type == tokenize.OP and string in ALLOWED_OPS:
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
                    # Accumulate adjacent name/number/op tokens into one name
                    if not last_string:
                        last_string = string
                        last_start = start
                        last_end = end
                    else:
                        last_string += string
            if last_string:
                yield tokenize.NAME, last_string, last_start, last_end, last_line

        def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
            # Recursive-descent parser; the inside_* flags tell it which
            # delimiters end the current production
            selectors = []
            current_selector = None
            for type, string, start, _, _ in tokens:
                # ENCODING is only defined in python 3.x
                if type == getattr(tokenize, 'ENCODING', None):
                    continue
                elif type in [tokenize.NAME, tokenize.NUMBER]:
                    current_selector = FormatSelector(SINGLE, string, [])
                elif type == tokenize.OP:
                    if string == ')':
                        if not inside_group:
                            # ')' will be handled by the parentheses group
                            tokens.restore_last_token()
                        break
                    elif inside_merge and string in ['/', ',']:
                        tokens.restore_last_token()
                        break
                    elif inside_choice and string == ',':
                        tokens.restore_last_token()
                        break
                    elif string == ',':
                        if not current_selector:
                            raise syntax_error('"," must follow a format selector', start)
                        selectors.append(current_selector)
                        current_selector = None
                    elif string == '/':
                        if not current_selector:
                            raise syntax_error('"/" must follow a format selector', start)
                        first_choice = current_selector
                        second_choice = _parse_format_selection(tokens, inside_choice=True)
                        current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
                    elif string == '[':
                        if not current_selector:
                            current_selector = FormatSelector(SINGLE, 'best', [])
                        format_filter = _parse_filter(tokens)
                        current_selector.filters.append(format_filter)
                    elif string == '(':
                        if current_selector:
                            raise syntax_error('Unexpected "("', start)
                        group = _parse_format_selection(tokens, inside_group=True)
                        current_selector = FormatSelector(GROUP, group, [])
                    elif string == '+':
                        if not current_selector:
                            raise syntax_error('Unexpected "+"', start)
                        selector_1 = current_selector
                        selector_2 = _parse_format_selection(tokens, inside_merge=True)
                        if not selector_2:
                            raise syntax_error('Expected a selector', start)
                        current_selector = FormatSelector(MERGE, (selector_1, selector_2), [])
                    else:
                        raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
                elif type == tokenize.ENDMARKER:
                    break
            if current_selector:
                selectors.append(current_selector)
            return selectors

        def _merge(formats_pair):
            # Combine two formats (or previously-merged format groups) into a
            # single synthetic format dict carrying 'requested_formats'
            format_1, format_2 = formats_pair

            formats_info = []
            formats_info.extend(format_1.get('requested_formats', (format_1,)))
            formats_info.extend(format_2.get('requested_formats', (format_2,)))

            if not allow_multiple_streams['video'] or not allow_multiple_streams['audio']:
                get_no_more = {'video': False, 'audio': False}
                # NOTE(review): pops while enumerating the same list — entries
                # after a removed index shift and may be skipped; verify intended
                for (i, fmt_info) in enumerate(formats_info):
                    if fmt_info.get('acodec') == fmt_info.get('vcodec') == 'none':
                        formats_info.pop(i)
                        continue
                    for aud_vid in ['audio', 'video']:
                        if not allow_multiple_streams[aud_vid] and fmt_info.get(aud_vid[0] + 'codec') != 'none':
                            if get_no_more[aud_vid]:
                                formats_info.pop(i)
                                break
                            get_no_more[aud_vid] = True

            if len(formats_info) == 1:
                return formats_info[0]

            video_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('vcodec') != 'none']
            audio_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('acodec') != 'none']

            the_only_video = video_fmts[0] if len(video_fmts) == 1 else None
            the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None

            output_ext = self.params.get('merge_output_format')
            if not output_ext:
                if the_only_video:
                    output_ext = the_only_video['ext']
                elif the_only_audio and not video_fmts:
                    output_ext = the_only_audio['ext']
                else:
                    output_ext = 'mkv'

            new_dict = {
                'requested_formats': formats_info,
                'format': '+'.join(fmt_info.get('format') for fmt_info in formats_info),
                'format_id': '+'.join(fmt_info.get('format_id') for fmt_info in formats_info),
                'ext': output_ext,
            }

            if the_only_video:
                new_dict.update({
                    'width': the_only_video.get('width'),
                    'height': the_only_video.get('height'),
                    'resolution': the_only_video.get('resolution') or self.format_resolution(the_only_video),
                    'fps': the_only_video.get('fps'),
                    'vcodec': the_only_video.get('vcodec'),
                    'vbr': the_only_video.get('vbr'),
                    'stretched_ratio': the_only_video.get('stretched_ratio'),
                })

            if the_only_audio:
                new_dict.update({
                    'acodec': the_only_audio.get('acodec'),
                    'abr': the_only_audio.get('abr'),
                })

            return new_dict

        def _check_formats(formats):
            # With --check-formats, probe each format via a tiny test download
            # and yield only the ones that work; otherwise pass through as-is
            if not check_formats:
                yield from formats
                return
            for f in formats:
                self.to_screen('[info] Testing format %s' % f['format_id'])
                temp_file = tempfile.NamedTemporaryFile(
                    suffix='.tmp', delete=False,
                    dir=self.get_output_path('temp') or None)
                temp_file.close()
                try:
                    success, _ = self.dl(temp_file.name, f, test=True)
                except (DownloadError, IOError, OSError, ValueError) + network_exceptions:
                    success = False
                finally:
                    if os.path.exists(temp_file.name):
                        try:
                            os.remove(temp_file.name)
                        except OSError:
                            self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
                if success:
                    yield f
                else:
                    self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])

        def _build_selector_function(selector):
            # Recursively compile a FormatSelector tree into a generator
            # function over the format-selection context
            if isinstance(selector, list):  # ,
                fs = [_build_selector_function(s) for s in selector]

                def selector_function(ctx):
                    for f in fs:
                        yield from f(ctx)
                return selector_function

            elif selector.type == GROUP:  # ()
                selector_function = _build_selector_function(selector.selector)

            elif selector.type == PICKFIRST:  # /
                fs = [_build_selector_function(s) for s in selector.selector]

                def selector_function(ctx):
                    for f in fs:
                        picked_formats = list(f(ctx))
                        if picked_formats:
                            return picked_formats
                    return []

            elif selector.type == MERGE:  # +
                selector_1, selector_2 = map(_build_selector_function, selector.selector)

                def selector_function(ctx):
                    # deepcopy so each operand selects from an unmodified context
                    for pair in itertools.product(
                            selector_1(copy.deepcopy(ctx)), selector_2(copy.deepcopy(ctx))):
                        yield _merge(pair)

            elif selector.type == SINGLE:  # atom
                format_spec = selector.selector or 'best'

                # TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector
                if format_spec == 'all':
                    def selector_function(ctx):
                        yield from _check_formats(ctx['formats'])
                elif format_spec == 'mergeall':
                    def selector_function(ctx):
                        formats = list(_check_formats(ctx['formats']))
                        if not formats:
                            return
                        merged_format = formats[-1]
                        for f in formats[-2::-1]:
                            merged_format = _merge((merged_format, f))
                        yield merged_format

                else:
                    format_fallback, format_reverse, format_idx = False, True, 1
                    mobj = re.match(
                        r'(?P<bw>best|worst|b|w)(?P<type>video|audio|v|a)?(?P<mod>\*)?(?:\.(?P<n>[1-9]\d*))?$',
                        format_spec)
                    if mobj is not None:
                        format_idx = int_or_none(mobj.group('n'), default=1)
                        format_reverse = mobj.group('bw')[0] == 'b'
                        format_type = (mobj.group('type') or [None])[0]
                        not_format_type = {'v': 'a', 'a': 'v'}.get(format_type)
                        format_modified = mobj.group('mod') is not None

                        format_fallback = not format_type and not format_modified  # for b, w
                        _filter_f = (
                            (lambda f: f.get('%scodec' % format_type) != 'none')
                            if format_type and format_modified  # bv*, ba*, wv*, wa*
                            else (lambda f: f.get('%scodec' % not_format_type) == 'none')
                            if format_type  # bv, ba, wv, wa
                            else (lambda f: f.get('vcodec') != 'none' and f.get('acodec') != 'none')
                            if not format_modified  # b, w
                            else lambda f: True)  # b*, w*
                        filter_f = lambda f: _filter_f(f) and (
                            f.get('vcodec') != 'none' or f.get('acodec') != 'none')
                    else:
                        filter_f = ((lambda f: f.get('ext') == format_spec)
                                    if format_spec in ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']  # extension
                                    else (lambda f: f.get('format_id') == format_spec))  # id

                    def selector_function(ctx):
                        formats = list(ctx['formats'])
                        matches = list(filter(filter_f, formats)) if filter_f is not None else formats
                        if format_fallback and ctx['incomplete_formats'] and not matches:
                            # for extractors with incomplete formats (audio only (soundcloud)
                            # or video only (imgur)) best/worst will fallback to
                            # best/worst {video,audio}-only format
                            matches = formats
                        matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1]))
                        try:
                            yield matches[format_idx - 1]
                        except IndexError:
                            return

            # Wrap the node's selector with its attached [..] filters
            filters = [self._build_format_filter(f) for f in selector.filters]

            def final_selector(ctx):
                ctx_copy = copy.deepcopy(ctx)
                for _filter in filters:
                    ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
                return selector_function(ctx_copy)
            return final_selector

        stream = io.BytesIO(format_spec.encode('utf-8'))
        try:
            tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
        except tokenize.TokenError:
            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

        class TokenIterator(object):
            # Token stream with one-token push-back, needed by the parser
            def __init__(self, tokens):
                self.tokens = tokens
                self.counter = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.counter >= len(self.tokens):
                    raise StopIteration()
                value = self.tokens[self.counter]
                self.counter += 1
                return value

            # Python 2 compatibility alias
            next = __next__

            def restore_last_token(self):
                self.counter -= 1

        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
        return _build_selector_function(parsed_selector)
2000
2001 def _calc_headers(self, info_dict):
2002 res = std_headers.copy()
2003
2004 add_headers = info_dict.get('http_headers')
2005 if add_headers:
2006 res.update(add_headers)
2007
2008 cookies = self._calc_cookies(info_dict)
2009 if cookies:
2010 res['Cookie'] = cookies
2011
2012 if 'X-Forwarded-For' not in res:
2013 x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
2014 if x_forwarded_for_ip:
2015 res['X-Forwarded-For'] = x_forwarded_for_ip
2016
2017 return res
2018
2019 def _calc_cookies(self, info_dict):
2020 pr = sanitized_Request(info_dict['url'])
2021 self.cookiejar.add_cookie_header(pr)
2022 return pr.get_header('Cookie')
2023
    def _sanitize_thumbnails(self, info_dict):
        """Normalize info_dict['thumbnails'] in place.

        Builds the list from a bare 'thumbnail' field if needed, sorts it so
        the last entry is the preferred one, fills in missing ids/resolutions,
        sanitizes URLs and (unless check_formats is False) lazily drops
        thumbnails whose URL is unreachable.
        """
        thumbnails = info_dict.get('thumbnails')
        if thumbnails is None:
            # Fall back to the single 'thumbnail' field
            thumbnail = info_dict.get('thumbnail')
            if thumbnail:
                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
        if thumbnails:
            # Ascending sort; missing numeric keys sort first (-1), so the
            # highest-preference/largest thumbnail ends up last
            thumbnails.sort(key=lambda t: (
                t.get('preference') if t.get('preference') is not None else -1,
                t.get('width') if t.get('width') is not None else -1,
                t.get('height') if t.get('height') is not None else -1,
                t.get('id') if t.get('id') is not None else '',
                t.get('url')))

            def thumbnail_tester():
                # Returns a predicate that HEAD-checks a thumbnail URL.
                # With check_formats every URL is tested (and reported on
                # screen); otherwise only entries marked '_test_url' are
                # tested, with results going to the debug log.
                if self.params.get('check_formats'):
                    test_all = True
                    to_screen = lambda msg: self.to_screen(f'[info] {msg}')
                else:
                    test_all = False
                    to_screen = self.write_debug

                def test_thumbnail(t):
                    if not test_all and not t.get('_test_url'):
                        return True
                    to_screen('Testing thumbnail %s' % t['id'])
                    try:
                        self.urlopen(HEADRequest(t['url']))
                    except network_exceptions as err:
                        to_screen('Unable to connect to thumbnail %s URL "%s" - %s. Skipping...' % (
                            t['id'], t['url'], error_to_compat_str(err)))
                        return False
                    return True

                return test_thumbnail

            for i, t in enumerate(thumbnails):
                if t.get('id') is None:
                    t['id'] = '%d' % i
                if t.get('width') and t.get('height'):
                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
                t['url'] = sanitize_url(t['url'])

            if self.params.get('check_formats') is not False:
                # Test lazily, best-first (reversed list), then restore the
                # original worst-to-best order
                info_dict['thumbnails'] = LazyList(filter(thumbnail_tester(), thumbnails[::-1])).reverse()
            else:
                info_dict['thumbnails'] = thumbnails
2071
    def process_video_result(self, info_dict, download=True):
        """Sanitize a single 'video' extractor result in place, select the
        requested subtitles and formats and, if *download* is true, hand each
        selected format to process_info(). Returns the mutated info_dict.

        Raises ExtractorError when mandatory fields are missing or when no
        format matches (unless ignore_no_formats_error is set).
        """
        assert info_dict.get('_type', 'video') == 'video'

        if 'id' not in info_dict:
            raise ExtractorError('Missing "id" field in extractor result')
        if 'title' not in info_dict:
            raise ExtractorError('Missing "title" field in extractor result',
                                 video_id=info_dict['id'], ie=info_dict['extractor'])

        def report_force_conversion(field, field_not, conversion):
            self.report_warning(
                '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
                % (field, field_not, conversion))

        def sanitize_string_field(info, string_field):
            # Coerce a non-string field to str, warning about the extractor bug
            field = info.get(string_field)
            if field is None or isinstance(field, compat_str):
                return
            report_force_conversion(string_field, 'a string', 'string')
            info[string_field] = compat_str(field)

        def sanitize_numeric_fields(info):
            # Coerce known numeric fields to int, warning about extractor bugs
            for numeric_field in self._NUMERIC_FIELDS:
                field = info.get(numeric_field)
                if field is None or isinstance(field, compat_numeric_types):
                    continue
                report_force_conversion(numeric_field, 'numeric', 'int')
                info[numeric_field] = int_or_none(field)

        sanitize_string_field(info_dict, 'id')
        sanitize_numeric_fields(info_dict)

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        self._sanitize_thumbnails(info_dict)

        thumbnail = info_dict.get('thumbnail')
        thumbnails = info_dict.get('thumbnails')
        if thumbnail:
            info_dict['thumbnail'] = sanitize_url(thumbnail)
        elif thumbnails:
            # thumbnails are sorted worst-to-best, so take the last
            info_dict['thumbnail'] = thumbnails[-1]['url']

        if info_dict.get('display_id') is None and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        # Derive the date fields from their timestamp counterparts when absent
        for ts_key, date_key in (
                ('timestamp', 'upload_date'),
                ('release_timestamp', 'release_date'),
        ):
            if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
                # Working around out-of-range timestamp values (e.g. negative ones on Windows,
                # see http://bugs.python.org/issue1646728)
                try:
                    upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
                    info_dict[date_key] = upload_date.strftime('%Y%m%d')
                except (ValueError, OverflowError, OSError):
                    pass

        # Keep live_status and the is_live/was_live booleans consistent,
        # deriving whichever side is missing from the other
        live_keys = ('is_live', 'was_live')
        live_status = info_dict.get('live_status')
        if live_status is None:
            for key in live_keys:
                if info_dict.get(key) is False:
                    continue
                if info_dict.get(key):
                    live_status = key
                break
            if all(info_dict.get(key) is False for key in live_keys):
                live_status = 'not_live'
        if live_status:
            info_dict['live_status'] = live_status
            for key in live_keys:
                if info_dict.get(key) is None:
                    info_dict[key] = (live_status == key)

        # Auto generate title fields corresponding to the *_number fields when missing
        # in order to always have clean titles. This is very common for TV series.
        for field in ('chapter', 'season', 'episode'):
            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])

        # Sanitize subtitle/caption URLs and fill in missing extensions
        for cc_kind in ('subtitles', 'automatic_captions'):
            cc = info_dict.get(cc_kind)
            if cc:
                for _, subtitle in cc.items():
                    for subtitle_format in subtitle:
                        if subtitle_format.get('url'):
                            subtitle_format['url'] = sanitize_url(subtitle_format['url'])
                        if subtitle_format.get('ext') is None:
                            subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()

        automatic_captions = info_dict.get('automatic_captions')
        subtitles = info_dict.get('subtitles')

        info_dict['requested_subtitles'] = self.process_subtitles(
            info_dict['id'], subtitles, automatic_captions)

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        info_dict['__has_drm'] = any(f.get('has_drm') for f in formats)
        if not self.params.get('allow_unplayable_formats'):
            formats = [f for f in formats if not f.get('has_drm')]

        if not formats:
            self.raise_no_formats(info_dict)

        def is_wellformed(f):
            url = f.get('url')
            if not url:
                self.report_warning(
                    '"url" field is missing or empty - skipping format, '
                    'there is an error in extractor')
                return False
            if isinstance(url, bytes):
                sanitize_string_field(f, 'url')
            return True

        # Filter out malformed formats for better extraction robustness
        formats = list(filter(is_wellformed, formats))

        formats_dict = {}

        # We check that all the formats have the format and format_id fields
        for i, format in enumerate(formats):
            sanitize_string_field(format, 'format_id')
            sanitize_numeric_fields(format)
            format['url'] = sanitize_url(format['url'])
            if not format.get('format_id'):
                format['format_id'] = compat_str(i)
            else:
                # Sanitize format_id from characters used in format selector expression
                format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
            format_id = format['format_id']
            if format_id not in formats_dict:
                formats_dict[format_id] = []
            formats_dict[format_id].append(format)

        # Make sure all formats have unique format_id
        for format_id, ambiguous_formats in formats_dict.items():
            if len(ambiguous_formats) > 1:
                for i, format in enumerate(ambiguous_formats):
                    format['format_id'] = '%s-%d' % (format_id, i)

        for i, format in enumerate(formats):
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=format_field(format, 'format_note', ' (%s)'),
                )
            # Automatically determine file extension if missing
            if format.get('ext') is None:
                format['ext'] = determine_ext(format['url']).lower()
            # Automatically determine protocol if missing (useful for format
            # selection purposes)
            if format.get('protocol') is None:
                format['protocol'] = determine_protocol(format)
            # Add HTTP headers, so that external programs can use them from the
            # json output
            full_format_info = info_dict.copy()
            full_format_info.update(format)
            format['http_headers'] = self._calc_headers(full_format_info)
        # Remove private housekeeping stuff
        if '__x_forwarded_for_ip' in info_dict:
            del info_dict['__x_forwarded_for_ip']

        # TODO Central sorting goes here

        if not formats or formats[0] is not info_dict:
            # only set the 'formats' fields if the original info_dict list them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats

        info_dict, _ = self.pre_process(info_dict)

        # Handle the --list-* options; they short-circuit before downloading
        if self.params.get('list_thumbnails'):
            self.list_thumbnails(info_dict)
        if self.params.get('listformats'):
            if not info_dict.get('formats') and not info_dict.get('url'):
                self.to_screen('%s has no formats' % info_dict['id'])
            else:
                self.list_formats(info_dict)
        if self.params.get('listsubtitles'):
            if 'automatic_captions' in info_dict:
                self.list_subtitles(
                    info_dict['id'], automatic_captions, 'automatic captions')
            self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
        list_only = self.params.get('simulate') is None and (
            self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles'))
        if list_only:
            # Without this printing, -F --print-json will not work
            self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True)
            return

        format_selector = self.format_selector
        if format_selector is None:
            req_format = self._default_format_spec(info_dict, download=download)
            self.write_debug('Default format spec: %s' % req_format)
            format_selector = self.build_format_selector(req_format)

        # While in format selection we may need to have an access to the original
        # format set in order to calculate some metrics or do some processing.
        # For now we need to be able to guess whether original formats provided
        # by extractor are incomplete or not (i.e. whether extractor provides only
        # video-only or audio-only formats) for proper formats selection for
        # extractors with such incomplete formats (see
        # https://github.com/ytdl-org/youtube-dl/pull/5556).
        # Since formats may be filtered during format selection and may not match
        # the original formats the results may be incorrect. Thus original formats
        # or pre-calculated metrics should be passed to format selection routines
        # as well.
        # We will pass a context object containing all necessary additional data
        # instead of just formats.
        # This fixes incorrect format selection issue (see
        # https://github.com/ytdl-org/youtube-dl/issues/10083).
        incomplete_formats = (
            # All formats are video-only or
            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
            # all formats are audio-only
            or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))

        ctx = {
            'formats': formats,
            'incomplete_formats': incomplete_formats,
        }

        formats_to_download = list(format_selector(ctx))
        if not formats_to_download:
            if not self.params.get('ignore_no_formats_error'):
                raise ExtractorError('Requested format is not available', expected=True,
                                     video_id=info_dict['id'], ie=info_dict['extractor'])
            else:
                self.report_warning('Requested format is not available')
                # Process what we can, even without any available formats.
                self.process_info(dict(info_dict))
        elif download:
            self.to_screen(
                '[info] %s: Downloading %d format(s): %s' % (
                    info_dict['id'], len(formats_to_download),
                    ", ".join([f['format_id'] for f in formats_to_download])))
            for fmt in formats_to_download:
                new_info = dict(info_dict)
                # Save a reference to the original info_dict so that it can be modified in process_info if needed
                new_info['__original_infodict'] = info_dict
                new_info.update(fmt)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        if formats_to_download:
            info_dict.update(formats_to_download[-1])
        return info_dict
2333
2334 def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
2335 """Select the requested subtitles and their format"""
2336 available_subs = {}
2337 if normal_subtitles and self.params.get('writesubtitles'):
2338 available_subs.update(normal_subtitles)
2339 if automatic_captions and self.params.get('writeautomaticsub'):
2340 for lang, cap_info in automatic_captions.items():
2341 if lang not in available_subs:
2342 available_subs[lang] = cap_info
2343
2344 if (not self.params.get('writesubtitles') and not
2345 self.params.get('writeautomaticsub') or not
2346 available_subs):
2347 return None
2348
2349 all_sub_langs = available_subs.keys()
2350 if self.params.get('allsubtitles', False):
2351 requested_langs = all_sub_langs
2352 elif self.params.get('subtitleslangs', False):
2353 requested_langs = set()
2354 for lang in self.params.get('subtitleslangs'):
2355 if lang == 'all':
2356 requested_langs.update(all_sub_langs)
2357 continue
2358 discard = lang[0] == '-'
2359 if discard:
2360 lang = lang[1:]
2361 current_langs = filter(re.compile(lang + '$').match, all_sub_langs)
2362 if discard:
2363 for lang in current_langs:
2364 requested_langs.discard(lang)
2365 else:
2366 requested_langs.update(current_langs)
2367 elif 'en' in available_subs:
2368 requested_langs = ['en']
2369 else:
2370 requested_langs = [list(all_sub_langs)[0]]
2371 if requested_langs:
2372 self.write_debug('Downloading subtitles: %s' % ', '.join(requested_langs))
2373
2374 formats_query = self.params.get('subtitlesformat', 'best')
2375 formats_preference = formats_query.split('/') if formats_query else []
2376 subs = {}
2377 for lang in requested_langs:
2378 formats = available_subs.get(lang)
2379 if formats is None:
2380 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
2381 continue
2382 for ext in formats_preference:
2383 if ext == 'best':
2384 f = formats[-1]
2385 break
2386 matches = list(filter(lambda f: f['ext'] == ext, formats))
2387 if matches:
2388 f = matches[-1]
2389 break
2390 else:
2391 f = formats[-1]
2392 self.report_warning(
2393 'No subtitle format found matching "%s" for language %s, '
2394 'using %s' % (formats_query, lang, f['ext']))
2395 subs[lang] = f
2396 return subs
2397
2398 def __forced_printings(self, info_dict, filename, incomplete):
2399 def print_mandatory(field, actual_field=None):
2400 if actual_field is None:
2401 actual_field = field
2402 if (self.params.get('force%s' % field, False)
2403 and (not incomplete or info_dict.get(actual_field) is not None)):
2404 self.to_stdout(info_dict[actual_field])
2405
2406 def print_optional(field):
2407 if (self.params.get('force%s' % field, False)
2408 and info_dict.get(field) is not None):
2409 self.to_stdout(info_dict[field])
2410
2411 info_dict = info_dict.copy()
2412 if filename is not None:
2413 info_dict['filename'] = filename
2414 if info_dict.get('requested_formats') is not None:
2415 # For RTMP URLs, also include the playpath
2416 info_dict['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats'])
2417 elif 'url' in info_dict:
2418 info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '')
2419
2420 if self.params.get('forceprint') or self.params.get('forcejson'):
2421 self.post_extract(info_dict)
2422 for tmpl in self.params.get('forceprint', []):
2423 if re.match(r'\w+$', tmpl):
2424 tmpl = '%({})s'.format(tmpl)
2425 tmpl, info_copy = self.prepare_outtmpl(tmpl, info_dict)
2426 self.to_stdout(self.escape_outtmpl(tmpl) % info_copy)
2427
2428 print_mandatory('title')
2429 print_mandatory('id')
2430 print_mandatory('url', 'urls')
2431 print_optional('thumbnail')
2432 print_optional('description')
2433 print_optional('filename')
2434 if self.params.get('forceduration') and info_dict.get('duration') is not None:
2435 self.to_stdout(formatSeconds(info_dict['duration']))
2436 print_mandatory('format')
2437
2438 if self.params.get('forcejson'):
2439 self.to_stdout(json.dumps(self.sanitize_info(info_dict)))
2440
2441 def dl(self, name, info, subtitle=False, test=False):
2442 if not info.get('url'):
2443 self.raise_no_formats(info, True)
2444
2445 if test:
2446 verbose = self.params.get('verbose')
2447 params = {
2448 'test': True,
2449 'quiet': not verbose,
2450 'verbose': verbose,
2451 'noprogress': not verbose,
2452 'nopart': True,
2453 'skip_unavailable_fragments': False,
2454 'keep_fragments': False,
2455 'overwrites': True,
2456 '_no_ytdl_file': True,
2457 }
2458 else:
2459 params = self.params
2460 fd = get_suitable_downloader(info, params, to_stdout=(name == '-'))(self, params)
2461 if not test:
2462 for ph in self._progress_hooks:
2463 fd.add_progress_hook(ph)
2464 urls = '", "'.join([f['url'] for f in info.get('requested_formats', [])] or [info['url']])
2465 self.write_debug('Invoking downloader on "%s"' % urls)
2466 new_info = dict(info)
2467 if new_info.get('http_headers') is None:
2468 new_info['http_headers'] = self._calc_headers(new_info)
2469 return fd.download(name, new_info, subtitle)
2470
2471 def process_info(self, info_dict):
2472 """Process a single resolved IE result."""
2473
2474 assert info_dict.get('_type', 'video') == 'video'
2475
2476 max_downloads = self.params.get('max_downloads')
2477 if max_downloads is not None:
2478 if self._num_downloads >= int(max_downloads):
2479 raise MaxDownloadsReached()
2480
2481 # TODO: backward compatibility, to be removed
2482 info_dict['fulltitle'] = info_dict['title']
2483
2484 if 'format' not in info_dict and 'ext' in info_dict:
2485 info_dict['format'] = info_dict['ext']
2486
2487 if self._match_entry(info_dict) is not None:
2488 return
2489
2490 self.post_extract(info_dict)
2491 self._num_downloads += 1
2492
2493 # info_dict['_filename'] needs to be set for backward compatibility
2494 info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
2495 temp_filename = self.prepare_filename(info_dict, 'temp')
2496 files_to_move = {}
2497
2498 # Forced printings
2499 self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))
2500
2501 if self.params.get('simulate'):
2502 if self.params.get('force_write_download_archive', False):
2503 self.record_download_archive(info_dict)
2504
2505 # Do nothing else if in simulate mode
2506 return
2507
2508 if full_filename is None:
2509 return
2510
2511 if not self._ensure_dir_exists(encodeFilename(full_filename)):
2512 return
2513 if not self._ensure_dir_exists(encodeFilename(temp_filename)):
2514 return
2515
2516 if self.params.get('writedescription', False):
2517 descfn = self.prepare_filename(info_dict, 'description')
2518 if not self._ensure_dir_exists(encodeFilename(descfn)):
2519 return
2520 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(descfn)):
2521 self.to_screen('[info] Video description is already present')
2522 elif info_dict.get('description') is None:
2523 self.report_warning('There\'s no description to write.')
2524 else:
2525 try:
2526 self.to_screen('[info] Writing video description to: ' + descfn)
2527 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
2528 descfile.write(info_dict['description'])
2529 except (OSError, IOError):
2530 self.report_error('Cannot write description file ' + descfn)
2531 return
2532
2533 if self.params.get('writeannotations', False):
2534 annofn = self.prepare_filename(info_dict, 'annotation')
2535 if not self._ensure_dir_exists(encodeFilename(annofn)):
2536 return
2537 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(annofn)):
2538 self.to_screen('[info] Video annotations are already present')
2539 elif not info_dict.get('annotations'):
2540 self.report_warning('There are no annotations to write.')
2541 else:
2542 try:
2543 self.to_screen('[info] Writing video annotations to: ' + annofn)
2544 with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
2545 annofile.write(info_dict['annotations'])
2546 except (KeyError, TypeError):
2547 self.report_warning('There are no annotations to write.')
2548 except (OSError, IOError):
2549 self.report_error('Cannot write annotations file: ' + annofn)
2550 return
2551
2552 subtitles_are_requested = any([self.params.get('writesubtitles', False),
2553 self.params.get('writeautomaticsub')])
2554
2555 if subtitles_are_requested and info_dict.get('requested_subtitles'):
2556 # subtitles download errors are already managed as troubles in relevant IE
2557 # that way it will silently go on when used with unsupporting IE
2558 subtitles = info_dict['requested_subtitles']
2559 # ie = self.get_info_extractor(info_dict['extractor_key'])
2560 for sub_lang, sub_info in subtitles.items():
2561 sub_format = sub_info['ext']
2562 sub_filename = subtitles_filename(temp_filename, sub_lang, sub_format, info_dict.get('ext'))
2563 sub_filename_final = subtitles_filename(
2564 self.prepare_filename(info_dict, 'subtitle'), sub_lang, sub_format, info_dict.get('ext'))
2565 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(sub_filename)):
2566 self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
2567 sub_info['filepath'] = sub_filename
2568 files_to_move[sub_filename] = sub_filename_final
2569 else:
2570 self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
2571 if sub_info.get('data') is not None:
2572 try:
2573 # Use newline='' to prevent conversion of newline characters
2574 # See https://github.com/ytdl-org/youtube-dl/issues/10268
2575 with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
2576 subfile.write(sub_info['data'])
2577 sub_info['filepath'] = sub_filename
2578 files_to_move[sub_filename] = sub_filename_final
2579 except (OSError, IOError):
2580 self.report_error('Cannot write subtitles file ' + sub_filename)
2581 return
2582 else:
2583 try:
2584 self.dl(sub_filename, sub_info.copy(), subtitle=True)
2585 sub_info['filepath'] = sub_filename
2586 files_to_move[sub_filename] = sub_filename_final
2587 except (ExtractorError, IOError, OSError, ValueError) + network_exceptions as err:
2588 self.report_warning('Unable to download subtitle for "%s": %s' %
2589 (sub_lang, error_to_compat_str(err)))
2590 continue
2591
2592 if self.params.get('writeinfojson', False):
2593 infofn = self.prepare_filename(info_dict, 'infojson')
2594 if not self._ensure_dir_exists(encodeFilename(infofn)):
2595 return
2596 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(infofn)):
2597 self.to_screen('[info] Video metadata is already present')
2598 else:
2599 self.to_screen('[info] Writing video metadata as JSON to: ' + infofn)
2600 try:
2601 write_json_file(self.sanitize_info(info_dict, self.params.get('clean_infojson', True)), infofn)
2602 except (OSError, IOError):
2603 self.report_error('Cannot write video metadata to JSON file ' + infofn)
2604 return
2605 info_dict['__infojson_filename'] = infofn
2606
2607 for thumb_ext in self._write_thumbnails(info_dict, temp_filename):
2608 thumb_filename_temp = replace_extension(temp_filename, thumb_ext, info_dict.get('ext'))
2609 thumb_filename = replace_extension(
2610 self.prepare_filename(info_dict, 'thumbnail'), thumb_ext, info_dict.get('ext'))
2611 files_to_move[thumb_filename_temp] = thumb_filename
2612
2613 # Write internet shortcut files
2614 url_link = webloc_link = desktop_link = False
2615 if self.params.get('writelink', False):
2616 if sys.platform == "darwin": # macOS.
2617 webloc_link = True
2618 elif sys.platform.startswith("linux"):
2619 desktop_link = True
2620 else: # if sys.platform in ['win32', 'cygwin']:
2621 url_link = True
2622 if self.params.get('writeurllink', False):
2623 url_link = True
2624 if self.params.get('writewebloclink', False):
2625 webloc_link = True
2626 if self.params.get('writedesktoplink', False):
2627 desktop_link = True
2628
2629 if url_link or webloc_link or desktop_link:
2630 if 'webpage_url' not in info_dict:
2631 self.report_error('Cannot write internet shortcut file because the "webpage_url" field is missing in the media information')
2632 return
2633 ascii_url = iri_to_uri(info_dict['webpage_url'])
2634
2635 def _write_link_file(extension, template, newline, embed_filename):
2636 linkfn = replace_extension(full_filename, extension, info_dict.get('ext'))
2637 if self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)):
2638 self.to_screen('[info] Internet shortcut is already present')
2639 else:
2640 try:
2641 self.to_screen('[info] Writing internet shortcut to: ' + linkfn)
2642 with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', newline=newline) as linkfile:
2643 template_vars = {'url': ascii_url}
2644 if embed_filename:
2645 template_vars['filename'] = linkfn[:-(len(extension) + 1)]
2646 linkfile.write(template % template_vars)
2647 except (OSError, IOError):
2648 self.report_error('Cannot write internet shortcut ' + linkfn)
2649 return False
2650 return True
2651
2652 if url_link:
2653 if not _write_link_file('url', DOT_URL_LINK_TEMPLATE, '\r\n', embed_filename=False):
2654 return
2655 if webloc_link:
2656 if not _write_link_file('webloc', DOT_WEBLOC_LINK_TEMPLATE, '\n', embed_filename=False):
2657 return
2658 if desktop_link:
2659 if not _write_link_file('desktop', DOT_DESKTOP_LINK_TEMPLATE, '\n', embed_filename=True):
2660 return
2661
2662 try:
2663 info_dict, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
2664 except PostProcessingError as err:
2665 self.report_error('Preprocessing: %s' % str(err))
2666 return
2667
2668 must_record_download_archive = False
2669 if self.params.get('skip_download', False):
2670 info_dict['filepath'] = temp_filename
2671 info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
2672 info_dict['__files_to_move'] = files_to_move
2673 info_dict = self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict)
2674 else:
2675 # Download
2676 info_dict.setdefault('__postprocessors', [])
2677 try:
2678
2679 def existing_file(*filepaths):
2680 ext = info_dict.get('ext')
2681 final_ext = self.params.get('final_ext', ext)
2682 existing_files = []
2683 for file in orderedSet(filepaths):
2684 if final_ext != ext:
2685 converted = replace_extension(file, final_ext, ext)
2686 if os.path.exists(encodeFilename(converted)):
2687 existing_files.append(converted)
2688 if os.path.exists(encodeFilename(file)):
2689 existing_files.append(file)
2690
2691 if not existing_files or self.params.get('overwrites', False):
2692 for file in orderedSet(existing_files):
2693 self.report_file_delete(file)
2694 os.remove(encodeFilename(file))
2695 return None
2696
2697 info_dict['ext'] = os.path.splitext(existing_files[0])[1][1:]
2698 return existing_files[0]
2699
2700 success = True
2701 if info_dict.get('requested_formats') is not None:
2702
2703 def compatible_formats(formats):
2704 # TODO: some formats actually allow this (mkv, webm, ogg, mp4), but not all of them.
2705 video_formats = [format for format in formats if format.get('vcodec') != 'none']
2706 audio_formats = [format for format in formats if format.get('acodec') != 'none']
2707 if len(video_formats) > 2 or len(audio_formats) > 2:
2708 return False
2709
2710 # Check extension
2711 exts = set(format.get('ext') for format in formats)
2712 COMPATIBLE_EXTS = (
2713 set(('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma')),
2714 set(('webm',)),
2715 )
2716 for ext_sets in COMPATIBLE_EXTS:
2717 if ext_sets.issuperset(exts):
2718 return True
2719 # TODO: Check acodec/vcodec
2720 return False
2721
2722 requested_formats = info_dict['requested_formats']
2723 old_ext = info_dict['ext']
2724 if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
2725 info_dict['ext'] = 'mkv'
2726 self.report_warning(
2727 'Requested formats are incompatible for merge and will be merged into mkv.')
2728 new_ext = info_dict['ext']
2729
2730 def correct_ext(filename, ext=new_ext):
2731 if filename == '-':
2732 return filename
2733 filename_real_ext = os.path.splitext(filename)[1][1:]
2734 filename_wo_ext = (
2735 os.path.splitext(filename)[0]
2736 if filename_real_ext in (old_ext, new_ext)
2737 else filename)
2738 return '%s.%s' % (filename_wo_ext, ext)
2739
2740 # Ensure filename always has a correct extension for successful merge
2741 full_filename = correct_ext(full_filename)
2742 temp_filename = correct_ext(temp_filename)
2743 dl_filename = existing_file(full_filename, temp_filename)
2744 info_dict['__real_download'] = False
2745
2746 _protocols = set(determine_protocol(f) for f in requested_formats)
2747 if len(_protocols) == 1: # All requested formats have same protocol
2748 info_dict['protocol'] = _protocols.pop()
2749 directly_mergable = FFmpegFD.can_merge_formats(info_dict)
2750 if dl_filename is not None:
2751 self.report_file_already_downloaded(dl_filename)
2752 elif (directly_mergable and get_suitable_downloader(
2753 info_dict, self.params, to_stdout=(temp_filename == '-')) == FFmpegFD):
2754 info_dict['url'] = '\n'.join(f['url'] for f in requested_formats)
2755 success, real_download = self.dl(temp_filename, info_dict)
2756 info_dict['__real_download'] = real_download
2757 else:
2758 downloaded = []
2759 merger = FFmpegMergerPP(self)
2760 if self.params.get('allow_unplayable_formats'):
2761 self.report_warning(
2762 'You have requested merging of multiple formats '
2763 'while also allowing unplayable formats to be downloaded. '
2764 'The formats won\'t be merged to prevent data corruption.')
2765 elif not merger.available:
2766 self.report_warning(
2767 'You have requested merging of multiple formats but ffmpeg is not installed. '
2768 'The formats won\'t be merged.')
2769
2770 if temp_filename == '-':
2771 reason = ('using a downloader other than ffmpeg' if directly_mergable
2772 else 'but the formats are incompatible for simultaneous download' if merger.available
2773 else 'but ffmpeg is not installed')
2774 self.report_warning(
2775 f'You have requested downloading multiple formats to stdout {reason}. '
2776 'The formats will be streamed one after the other')
2777 fname = temp_filename
2778 for f in requested_formats:
2779 new_info = dict(info_dict)
2780 del new_info['requested_formats']
2781 new_info.update(f)
2782 if temp_filename != '-':
2783 fname = prepend_extension(
2784 correct_ext(temp_filename, new_info['ext']),
2785 'f%s' % f['format_id'], new_info['ext'])
2786 if not self._ensure_dir_exists(fname):
2787 return
2788 downloaded.append(fname)
2789 partial_success, real_download = self.dl(fname, new_info)
2790 info_dict['__real_download'] = info_dict['__real_download'] or real_download
2791 success = success and partial_success
2792 if merger.available and not self.params.get('allow_unplayable_formats'):
2793 info_dict['__postprocessors'].append(merger)
2794 info_dict['__files_to_merge'] = downloaded
2795 # Even if there were no downloads, it is being merged only now
2796 info_dict['__real_download'] = True
2797 else:
2798 for file in downloaded:
2799 files_to_move[file] = None
2800 else:
2801 # Just a single file
2802 dl_filename = existing_file(full_filename, temp_filename)
2803 if dl_filename is None or dl_filename == temp_filename:
2804 # dl_filename == temp_filename could mean that the file was partially downloaded with --no-part.
2805 # So we should try to resume the download
2806 success, real_download = self.dl(temp_filename, info_dict)
2807 info_dict['__real_download'] = real_download
2808 else:
2809 self.report_file_already_downloaded(dl_filename)
2810
2811 dl_filename = dl_filename or temp_filename
2812 info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
2813
2814 except network_exceptions as err:
2815 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
2816 return
2817 except (OSError, IOError) as err:
2818 raise UnavailableVideoError(err)
2819 except (ContentTooShortError, ) as err:
2820 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
2821 return
2822
2823 if success and full_filename != '-':
2824
2825 def fixup():
2826 do_fixup = True
2827 fixup_policy = self.params.get('fixup')
2828 vid = info_dict['id']
2829
2830 if fixup_policy in ('ignore', 'never'):
2831 return
2832 elif fixup_policy == 'warn':
2833 do_fixup = False
2834 elif fixup_policy != 'force':
2835 assert fixup_policy in ('detect_or_warn', None)
2836 if not info_dict.get('__real_download'):
2837 do_fixup = False
2838
2839 def ffmpeg_fixup(cndn, msg, cls):
2840 if not cndn:
2841 return
2842 if not do_fixup:
2843 self.report_warning(f'{vid}: {msg}')
2844 return
2845 pp = cls(self)
2846 if pp.available:
2847 info_dict['__postprocessors'].append(pp)
2848 else:
2849 self.report_warning(f'{vid}: {msg}. Install ffmpeg to fix this automatically')
2850
2851 stretched_ratio = info_dict.get('stretched_ratio')
2852 ffmpeg_fixup(
2853 stretched_ratio not in (1, None),
2854 f'Non-uniform pixel ratio {stretched_ratio}',
2855 FFmpegFixupStretchedPP)
2856
2857 ffmpeg_fixup(
2858 (info_dict.get('requested_formats') is None
2859 and info_dict.get('container') == 'm4a_dash'
2860 and info_dict.get('ext') == 'm4a'),
2861 'writing DASH m4a. Only some players support this container',
2862 FFmpegFixupM4aPP)
2863
2864 downloader = (get_suitable_downloader(info_dict, self.params).__name__
2865 if 'protocol' in info_dict else None)
2866 ffmpeg_fixup(downloader == 'HlsFD', 'malformed AAC bitstream detected', FFmpegFixupM3u8PP)
2867 ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed timestamps detected', FFmpegFixupTimestampPP)
2868 ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed duration detected', FFmpegFixupDurationPP)
2869
2870 fixup()
2871 try:
2872 info_dict = self.post_process(dl_filename, info_dict, files_to_move)
2873 except PostProcessingError as err:
2874 self.report_error('Postprocessing: %s' % str(err))
2875 return
2876 try:
2877 for ph in self._post_hooks:
2878 ph(info_dict['filepath'])
2879 except Exception as err:
2880 self.report_error('post hooks: %s' % str(err))
2881 return
2882 must_record_download_archive = True
2883
2884 if must_record_download_archive or self.params.get('force_write_download_archive', False):
2885 self.record_download_archive(info_dict)
2886 max_downloads = self.params.get('max_downloads')
2887 if max_downloads is not None and self._num_downloads >= int(max_downloads):
2888 raise MaxDownloadsReached()
2889
2890 def download(self, url_list):
2891 """Download a given list of URLs."""
2892 outtmpl = self.outtmpl_dict['default']
2893 if (len(url_list) > 1
2894 and outtmpl != '-'
2895 and '%' not in outtmpl
2896 and self.params.get('max_downloads') != 1):
2897 raise SameFileError(outtmpl)
2898
2899 for url in url_list:
2900 try:
2901 # It also downloads the videos
2902 res = self.extract_info(
2903 url, force_generic_extractor=self.params.get('force_generic_extractor', False))
2904 except UnavailableVideoError:
2905 self.report_error('unable to download video')
2906 except MaxDownloadsReached:
2907 self.to_screen('[info] Maximum number of downloads reached')
2908 raise
2909 except ExistingVideoReached:
2910 self.to_screen('[info] Encountered a video that is already in the archive, stopping due to --break-on-existing')
2911 raise
2912 except RejectedVideoReached:
2913 self.to_screen('[info] Encountered a video that did not match filter, stopping due to --break-on-reject')
2914 raise
2915 else:
2916 if self.params.get('dump_single_json', False):
2917 self.post_extract(res)
2918 self.to_stdout(json.dumps(self.sanitize_info(res)))
2919
2920 return self._download_retcode
2921
2922 def download_with_info_file(self, info_filename):
2923 with contextlib.closing(fileinput.FileInput(
2924 [info_filename], mode='r',
2925 openhook=fileinput.hook_encoded('utf-8'))) as f:
2926 # FileInput doesn't have a read method, we can't call json.load
2927 info = self.sanitize_info(json.loads('\n'.join(f)), self.params.get('clean_infojson', True))
2928 try:
2929 self.process_ie_result(info, download=True)
2930 except (DownloadError, EntryNotInPlaylist, ThrottledDownload):
2931 webpage_url = info.get('webpage_url')
2932 if webpage_url is not None:
2933 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
2934 return self.download([webpage_url])
2935 else:
2936 raise
2937 return self._download_retcode
2938
2939 @staticmethod
2940 def sanitize_info(info_dict, remove_private_keys=False):
2941 ''' Sanitize the infodict for converting to json '''
2942 if info_dict is None:
2943 return info_dict
2944 info_dict.setdefault('epoch', int(time.time()))
2945 remove_keys = {'__original_infodict'} # Always remove this since this may contain a copy of the entire dict
2946 keep_keys = ['_type'], # Always keep this to facilitate load-info-json
2947 if remove_private_keys:
2948 remove_keys |= {
2949 'requested_formats', 'requested_subtitles', 'requested_entries',
2950 'filepath', 'entries', 'original_url', 'playlist_autonumber',
2951 }
2952 empty_values = (None, {}, [], set(), tuple())
2953 reject = lambda k, v: k not in keep_keys and (
2954 k.startswith('_') or k in remove_keys or v in empty_values)
2955 else:
2956 reject = lambda k, v: k in remove_keys
2957 filter_fn = lambda obj: (
2958 list(map(filter_fn, obj)) if isinstance(obj, (LazyList, list, tuple, set))
2959 else obj if not isinstance(obj, dict)
2960 else dict((k, filter_fn(v)) for k, v in obj.items() if not reject(k, v)))
2961 return filter_fn(info_dict)
2962
2963 @staticmethod
2964 def filter_requested_info(info_dict, actually_filter=True):
2965 ''' Alias of sanitize_info for backward compatibility '''
2966 return YoutubeDL.sanitize_info(info_dict, actually_filter)
2967
2968 def run_pp(self, pp, infodict):
2969 files_to_delete = []
2970 if '__files_to_move' not in infodict:
2971 infodict['__files_to_move'] = {}
2972 files_to_delete, infodict = pp.run(infodict)
2973 if not files_to_delete:
2974 return infodict
2975
2976 if self.params.get('keepvideo', False):
2977 for f in files_to_delete:
2978 infodict['__files_to_move'].setdefault(f, '')
2979 else:
2980 for old_filename in set(files_to_delete):
2981 self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
2982 try:
2983 os.remove(encodeFilename(old_filename))
2984 except (IOError, OSError):
2985 self.report_warning('Unable to remove downloaded original file')
2986 if old_filename in infodict['__files_to_move']:
2987 del infodict['__files_to_move'][old_filename]
2988 return infodict
2989
2990 @staticmethod
2991 def post_extract(info_dict):
2992 def actual_post_extract(info_dict):
2993 if info_dict.get('_type') in ('playlist', 'multi_video'):
2994 for video_dict in info_dict.get('entries', {}):
2995 actual_post_extract(video_dict or {})
2996 return
2997
2998 post_extractor = info_dict.get('__post_extractor') or (lambda: {})
2999 extra = post_extractor().items()
3000 info_dict.update(extra)
3001 info_dict.pop('__post_extractor', None)
3002
3003 original_infodict = info_dict.get('__original_infodict') or {}
3004 original_infodict.update(extra)
3005 original_infodict.pop('__post_extractor', None)
3006
3007 actual_post_extract(info_dict or {})
3008
3009 def pre_process(self, ie_info, key='pre_process', files_to_move=None):
3010 info = dict(ie_info)
3011 info['__files_to_move'] = files_to_move or {}
3012 for pp in self._pps[key]:
3013 info = self.run_pp(pp, info)
3014 return info, info.pop('__files_to_move', None)
3015
3016 def post_process(self, filename, ie_info, files_to_move=None):
3017 """Run all the postprocessors on the given file."""
3018 info = dict(ie_info)
3019 info['filepath'] = filename
3020 info['__files_to_move'] = files_to_move or {}
3021
3022 for pp in ie_info.get('__postprocessors', []) + self._pps['post_process']:
3023 info = self.run_pp(pp, info)
3024 info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
3025 del info['__files_to_move']
3026 for pp in self._pps['after_move']:
3027 info = self.run_pp(pp, info)
3028 return info
3029
3030 def _make_archive_id(self, info_dict):
3031 video_id = info_dict.get('id')
3032 if not video_id:
3033 return
3034 # Future-proof against any change in case
3035 # and backwards compatibility with prior versions
3036 extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist
3037 if extractor is None:
3038 url = str_or_none(info_dict.get('url'))
3039 if not url:
3040 return
3041 # Try to find matching extractor for the URL and take its ie_key
3042 for ie_key, ie in self._ies.items():
3043 if ie.suitable(url):
3044 extractor = ie_key
3045 break
3046 else:
3047 return
3048 return '%s %s' % (extractor.lower(), video_id)
3049
3050 def in_download_archive(self, info_dict):
3051 fn = self.params.get('download_archive')
3052 if fn is None:
3053 return False
3054
3055 vid_id = self._make_archive_id(info_dict)
3056 if not vid_id:
3057 return False # Incomplete video information
3058
3059 return vid_id in self.archive
3060
3061 def record_download_archive(self, info_dict):
3062 fn = self.params.get('download_archive')
3063 if fn is None:
3064 return
3065 vid_id = self._make_archive_id(info_dict)
3066 assert vid_id
3067 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
3068 archive_file.write(vid_id + '\n')
3069 self.archive.add(vid_id)
3070
3071 @staticmethod
3072 def format_resolution(format, default='unknown'):
3073 if format.get('vcodec') == 'none':
3074 if format.get('acodec') == 'none':
3075 return 'images'
3076 return 'audio only'
3077 if format.get('resolution') is not None:
3078 return format['resolution']
3079 if format.get('width') and format.get('height'):
3080 res = '%dx%d' % (format['width'], format['height'])
3081 elif format.get('height'):
3082 res = '%sp' % format['height']
3083 elif format.get('width'):
3084 res = '%dx?' % format['width']
3085 else:
3086 res = default
3087 return res
3088
    def _format_note(self, fdict):
        """Build the free-form 'note' column text for the old-style format list.

        Pieces are appended in a fixed order (support flag, language, note,
        tbr, container, video codec/bitrate, fps, audio codec/bitrate, asr,
        filesize) with context-dependent separators, so the statement order
        below is significant.
        """
        res = ''
        if fdict.get('ext') in ['f4f', 'f4m']:
            res += '(unsupported) '
        if fdict.get('language'):
            if res:
                res += ' '
            res += '[%s] ' % fdict['language']
        if fdict.get('format_note') is not None:
            res += fdict['format_note'] + ' '
        if fdict.get('tbr') is not None:
            res += '%4dk ' % fdict['tbr']
        if fdict.get('container') is not None:
            if res:
                res += ', '
            res += '%s container' % fdict['container']
        if (fdict.get('vcodec') is not None
                and fdict.get('vcodec') != 'none'):
            if res:
                res += ', '
            res += fdict['vcodec']
            # '@' joins the codec with the bitrate appended below
            if fdict.get('vbr') is not None:
                res += '@'
        elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
            # Bitrates known but codec unknown: label the video bitrate explicitly
            res += 'video@'
        if fdict.get('vbr') is not None:
            res += '%4dk' % fdict['vbr']
        if fdict.get('fps') is not None:
            if res:
                res += ', '
            res += '%sfps' % fdict['fps']
        if fdict.get('acodec') is not None:
            if res:
                res += ', '
            if fdict['acodec'] == 'none':
                res += 'video only'
            else:
                res += '%-5s' % fdict['acodec']
        elif fdict.get('abr') is not None:
            if res:
                res += ', '
            res += 'audio'
        if fdict.get('abr') is not None:
            res += '@%3dk' % fdict['abr']
        if fdict.get('asr') is not None:
            res += ' (%5dHz)' % fdict['asr']
        if fdict.get('filesize') is not None:
            if res:
                res += ', '
            res += format_bytes(fdict['filesize'])
        elif fdict.get('filesize_approx') is not None:
            if res:
                res += ', '
            res += '~' + format_bytes(fdict['filesize_approx'])
        return res
3144
    def list_formats(self, info_dict):
        """Print the table of available formats for a video.

        Uses the newer multi-column table layout unless the 'list-formats'
        compat option or listformats_table=False requests the old
        youtube-dl style listing.
        """
        formats = info_dict.get('formats', [info_dict])
        new_format = (
            'list-formats' not in self.params.get('compat_opts', [])
            and self.params.get('listformats_table', True) is not False)
        if new_format:
            # One row per format; formats with very low preference are hidden
            table = [
                [
                    format_field(f, 'format_id'),
                    format_field(f, 'ext'),
                    self.format_resolution(f),
                    format_field(f, 'fps', '%d'),
                    '|',
                    format_field(f, 'filesize', ' %s', func=format_bytes) + format_field(f, 'filesize_approx', '~%s', func=format_bytes),
                    format_field(f, 'tbr', '%4dk'),
                    shorten_protocol_name(f.get('protocol', '').replace("native", "n")),
                    '|',
                    format_field(f, 'vcodec', default='unknown').replace('none', ''),
                    format_field(f, 'vbr', '%4dk'),
                    format_field(f, 'acodec', default='unknown').replace('none', ''),
                    format_field(f, 'abr', '%3dk'),
                    format_field(f, 'asr', '%5dHz'),
                    ', '.join(filter(None, (
                        'UNSUPPORTED' if f.get('ext') in ('f4f', 'f4m') else '',
                        format_field(f, 'language', '[%s]'),
                        format_field(f, 'format_note'),
                        format_field(f, 'container', ignore=(None, f.get('ext'))),
                    ))),
                ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
            header_line = ['ID', 'EXT', 'RESOLUTION', 'FPS', '|', ' FILESIZE', ' TBR', 'PROTO',
                           '|', 'VCODEC', ' VBR', 'ACODEC', ' ABR', ' ASR', 'MORE INFO']
        else:
            # Legacy youtube-dl style: four columns with a free-form note
            table = [
                [
                    format_field(f, 'format_id'),
                    format_field(f, 'ext'),
                    self.format_resolution(f),
                    self._format_note(f)]
                for f in formats
                if f.get('preference') is None or f['preference'] >= -1000]
            header_line = ['format code', 'extension', 'resolution', 'note']

        self.to_screen(
            '[info] Available formats for %s:' % info_dict['id'])
        self.to_stdout(render_table(
            header_line, table, delim=new_format, extraGap=(0 if new_format else 1), hideEmpty=new_format))
3191
3192 def list_thumbnails(self, info_dict):
3193 thumbnails = list(info_dict.get('thumbnails'))
3194 if not thumbnails:
3195 self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
3196 return
3197
3198 self.to_screen(
3199 '[info] Thumbnails for %s:' % info_dict['id'])
3200 self.to_stdout(render_table(
3201 ['ID', 'width', 'height', 'URL'],
3202 [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
3203
3204 def list_subtitles(self, video_id, subtitles, name='subtitles'):
3205 if not subtitles:
3206 self.to_screen('%s has no %s' % (video_id, name))
3207 return
3208 self.to_screen(
3209 'Available %s for %s:' % (name, video_id))
3210
3211 def _row(lang, formats):
3212 exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats)))
3213 if len(set(names)) == 1:
3214 names = [] if names[0] == 'unknown' else names[:1]
3215 return [lang, ', '.join(names), ', '.join(exts)]
3216
3217 self.to_stdout(render_table(
3218 ['Language', 'Name', 'Formats'],
3219 [_row(lang, formats) for lang, formats in subtitles.items()],
3220 hideEmpty=True))
3221
    def urlopen(self, req):
        """ Start an HTTP download """
        # Bare URL strings are wrapped into a sanitized Request object first;
        # Request-like objects are passed to the opener unchanged
        if isinstance(req, compat_basestring):
            req = sanitized_Request(req)
        return self._opener.open(req, timeout=self._socket_timeout)
3227
    def print_debug_header(self):
        """Write verbose debug information (encodings, versions, optional
        libraries, proxy map) to the screen when --verbose is enabled."""
        if not self.params.get('verbose'):
            return

        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        write_string(encoding_str, encoding=None)

        # Detect how yt-dlp is being run: frozen exe, zipimport bundle or plain source
        source = (
            '(exe)' if hasattr(sys, 'frozen')
            else '(zip)' if isinstance(globals().get('__loader__'), zipimporter)
            else '(source)' if os.path.basename(sys.argv[0]) == '__main__.py'
            else '')
        self._write_string('[debug] yt-dlp version %s %s\n' % (__version__, source))
        if _LAZY_LOADER:
            self._write_string('[debug] Lazy loading extractors enabled\n')
        if _PLUGIN_CLASSES:
            self._write_string(
                '[debug] Plugin Extractors: %s\n' % [ie.ie_key() for ie in _PLUGIN_CLASSES])
        if self.params.get('compat_opts'):
            self._write_string(
                '[debug] Compatibility options: %s\n' % ', '.join(self.params.get('compat_opts')))
        try:
            # Best-effort: report the git commit when running from a checkout
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = process_communicate_or_kill(sp)
            out = out.decode().strip()
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: %s\n' % out)
        except Exception:
            try:
                # sys.exc_clear only exists on Python 2; harmless no-op here
                sys.exc_clear()
            except Exception:
                pass

        def python_implementation():
            # Append the PyPy version triple when running under PyPy
            impl_name = platform.python_implementation()
            if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'):
                return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3]
            return impl_name

        self._write_string('[debug] Python version %s (%s %s) - %s\n' % (
            platform.python_version(),
            python_implementation(),
            platform.architecture()[0],
            platform_name()))

        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        exe_versions['phantomjs'] = PhantomJSwrapper._version()
        exe_str = ', '.join(
            f'{exe} {v}' for exe, v in sorted(exe_versions.items()) if v
        ) or 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        # Imported lazily to avoid the cost when not in verbose mode
        from .downloader.fragment import can_decrypt_frag
        from .downloader.websocket import has_websockets
        from .postprocessor.embedthumbnail import has_mutagen
        from .cookies import SQLITE_AVAILABLE, KEYRING_AVAILABLE

        lib_str = ', '.join(sorted(filter(None, (
            can_decrypt_frag and 'pycryptodome',
            has_websockets and 'websockets',
            has_mutagen and 'mutagen',
            SQLITE_AVAILABLE and 'sqlite',
            KEYRING_AVAILABLE and 'keyring',
        )))) or 'none'
        self._write_string('[debug] Optional libraries: %s\n' % lib_str)

        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        if self.params.get('call_home', False):
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            return
            # NOTE: the code below is unreachable (dead) due to the `return`
            # above — it is the disabled youtube-dl update check, kept as-is
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
3322
def _setup_opener(self):
    """Build and install the urllib opener used for all HTTP(S) requests.

    Sets the socket timeout, loads cookies, configures per-request proxies,
    wires up the custom yt-dlp handlers and disables the file:// scheme.
    """
    timeout = self.params.get('socket_timeout')
    self._socket_timeout = float(timeout) if timeout is not None else 600

    self.cookiejar = load_cookies(
        self.params.get('cookiefile'), self.params.get('cookiesfrombrowser'), self)
    cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)

    opts_proxy = self.params.get('proxy')
    if opts_proxy is None:
        proxies = compat_urllib_request.getproxies()
        # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805)
        if 'http' in proxies and 'https' not in proxies:
            proxies['https'] = proxies['http']
    else:
        # An explicit empty string disables proxying entirely
        proxies = {} if opts_proxy == '' else {'http': opts_proxy, 'https': opts_proxy}
    proxy_handler = PerRequestProxyHandler(proxies)

    debuglevel = 1 if self.params.get('debug_printtraffic') else 0
    https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
    ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
    redirect_handler = YoutubeDLRedirectHandler()
    data_handler = compat_urllib_request_DataHandler()

    # When passing our own FileHandler instance, build_opener won't add the
    # default FileHandler and allows us to disable the file protocol, which
    # can be used for malicious purposes (see
    # https://github.com/ytdl-org/youtube-dl/issues/8227)
    file_handler = compat_urllib_request.FileHandler()

    def file_open(*args, **kwargs):
        raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in yt-dlp for security reasons')
    file_handler.file_open = file_open

    opener = compat_urllib_request.build_opener(
        proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)

    # Delete the default user-agent header, which would otherwise apply in
    # cases where our custom HTTP handler doesn't come into play
    # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
    opener.addheaders = []
    self._opener = opener
3370
def encode(self, s):
    """Encode *s* to bytes with the configured encoding; bytes pass through unchanged.

    On failure, the UnicodeEncodeError is annotated with a hint about the
    --encoding option and re-raised.
    """
    if isinstance(s, bytes):
        # Nothing to do, caller already handed us encoded data
        return s
    try:
        return s.encode(self.get_encoding())
    except UnicodeEncodeError as err:
        # Augment the error message with a remediation hint before re-raising
        err.reason += '. Check your system encoding configuration or use the --encoding option.'
        raise
3380
def get_encoding(self):
    """Return the output encoding: the 'encoding' param when set, else the system's preferred encoding."""
    configured = self.params.get('encoding')
    return preferredencoding() if configured is None else configured
3386
def _write_thumbnails(self, info_dict, filename):  # return the extensions
    """Download the video thumbnail(s) next to *filename*.

    Writes one thumbnail (or all of them when 'write_all_thumbnails' is set),
    records each saved path in the thumbnail dict's 'filepath' key, and
    returns the list of written extensions (each prefixed with the thumbnail
    id when several are written).  Download failures are reported as warnings
    and do not abort the loop.
    """
    write_all = self.params.get('write_all_thumbnails', False)
    thumbnails = []
    # Only fetch the thumbnail list when some thumbnail option is enabled
    if write_all or self.params.get('writethumbnail', False):
        thumbnails = info_dict.get('thumbnails') or []
    # With several thumbnails to write, filenames and messages carry the id
    multiple = write_all and len(thumbnails) > 1

    ret = []
    # Iterate in reverse; presumably the list is ordered worst-to-best so the
    # preferred thumbnail is tried first — TODO confirm against extractor output
    for t in thumbnails[::-1]:
        thumb_ext = determine_ext(t['url'], 'jpg')  # fall back to jpg when the URL has no extension
        suffix = '%s.' % t['id'] if multiple else ''
        thumb_display_id = '%s ' % t['id'] if multiple else ''
        thumb_filename = replace_extension(filename, suffix + thumb_ext, info_dict.get('ext'))

        # Reuse an existing file unless overwriting is enabled (default: overwrite)
        if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(thumb_filename)):
            ret.append(suffix + thumb_ext)
            t['filepath'] = thumb_filename
            self.to_screen('[%s] %s: Thumbnail %sis already present' %
                           (info_dict['extractor'], info_dict['id'], thumb_display_id))
        else:
            self.to_screen('[%s] %s: Downloading thumbnail %s ...' %
                           (info_dict['extractor'], info_dict['id'], thumb_display_id))
            try:
                uf = self.urlopen(t['url'])
                with open(encodeFilename(thumb_filename), 'wb') as thumbf:
                    shutil.copyfileobj(uf, thumbf)
                ret.append(suffix + thumb_ext)
                self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
                               (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
                t['filepath'] = thumb_filename
            except network_exceptions as err:
                # Best-effort: a failed thumbnail download is a warning, not an error
                self.report_warning('Unable to download thumbnail "%s": %s' %
                                    (t['url'], error_to_compat_str(err)))
        # In single-thumbnail mode, stop as soon as one has been written/found
        if ret and not write_all:
            break
    return ret