#!/usr/bin/env python3
# coding: utf-8

from __future__ import absolute_import, unicode_literals

import collections
import contextlib
import copy
import datetime
import errno
import fileinput
import io
import itertools
import json
import locale
import operator
import os
import platform
import re
import shutil
import subprocess
import sys
import tempfile
import time
import tokenize
import traceback
import random
import unicodedata

from string import ascii_letters

from .compat import (
    compat_basestring,
    compat_get_terminal_size,
    compat_kwargs,
    compat_numeric_types,
    compat_os_name,
    compat_pycrypto_AES,
    compat_shlex_quote,
    compat_str,
    compat_tokenize_tokenize,
    compat_urllib_error,
    compat_urllib_request,
    compat_urllib_request_DataHandler,
)
from .cookies import load_cookies
from .utils import (
    age_restricted,
    args_to_str,
    ContentTooShortError,
    date_from_str,
    DateRange,
    DEFAULT_OUTTMPL,
    determine_ext,
    determine_protocol,
    DOT_DESKTOP_LINK_TEMPLATE,
    DOT_URL_LINK_TEMPLATE,
    DOT_WEBLOC_LINK_TEMPLATE,
    DownloadError,
    encode_compat_str,
    encodeFilename,
    EntryNotInPlaylist,
    error_to_compat_str,
    ExistingVideoReached,
    expand_path,
    ExtractorError,
    float_or_none,
    format_bytes,
    format_field,
    STR_FORMAT_RE_TMPL,
    STR_FORMAT_TYPES,
    formatSeconds,
    GeoRestrictedError,
    HEADRequest,
    int_or_none,
    iri_to_uri,
    ISO3166Utils,
    LazyList,
    locked_file,
    make_dir,
    make_HTTPS_handler,
    MaxDownloadsReached,
    network_exceptions,
    orderedSet,
    OUTTMPL_TYPES,
    PagedList,
    parse_filesize,
    PerRequestProxyHandler,
    platform_name,
    PostProcessingError,
    preferredencoding,
    prepend_extension,
    process_communicate_or_kill,
    register_socks_protocols,
    RejectedVideoReached,
    render_table,
    replace_extension,
    SameFileError,
    sanitize_filename,
    sanitize_path,
    sanitize_url,
    sanitized_Request,
    std_headers,
    str_or_none,
    strftime_or_none,
    subtitles_filename,
    ThrottledDownload,
    to_high_limit_path,
    traverse_obj,
    try_get,
    UnavailableVideoError,
    url_basename,
    variadic,
    version_tuple,
    write_json_file,
    write_string,
    YoutubeDLCookieProcessor,
    YoutubeDLHandler,
    YoutubeDLRedirectHandler,
)
from .cache import Cache
from .extractor import (
    gen_extractor_classes,
    get_info_extractor,
    _LAZY_LOADER,
    _PLUGIN_CLASSES
)
from .extractor.openload import PhantomJSwrapper
from .downloader import (
    FFmpegFD,
    get_suitable_downloader,
    shorten_protocol_name
)
from .downloader.rtmp import rtmpdump_version
from .postprocessor import (
    get_postprocessor,
    FFmpegFixupDurationPP,
    FFmpegFixupM3u8PP,
    FFmpegFixupM4aPP,
    FFmpegFixupStretchedPP,
    FFmpegFixupTimestampPP,
    FFmpegMergerPP,
    FFmpegPostProcessor,
    MoveFilesAfterDownloadPP,
)
from .update import detect_variant
from .version import __version__

if compat_os_name == 'nt':
    import ctypes

class YoutubeDL(object):
    """YoutubeDL class.

    YoutubeDL objects are responsible for downloading the actual video file
    and writing it to disk if the user has requested it, among some other
    tasks. In most cases there should be one per program. Given a video URL,
    the downloader does not itself know how to extract all the needed
    information (that is the job of the InfoExtractors), so it has to pass
    the URL to one of them.

    For this, YoutubeDL objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    YoutubeDL processes the extracted information, possibly using a File
    Downloader to download the video.

    YoutubeDL objects accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The YoutubeDL also
    registers itself as the downloader in charge of the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:          Username for authentication purposes.
    password:          Password for authentication purposes.
    videopassword:     Password for accessing a video.
    ap_mso:            Adobe Pass multiple-system operator identifier.
    ap_username:       Multiple-system operator account username.
    ap_password:       Multiple-system operator account password.
    usenetrc:          Use netrc for authentication instead.
    verbose:           Print additional info to stdout.
    quiet:             Do not print messages to stdout.
    no_warnings:       Do not print out anything for warnings.
    forceprint:        A list of templates to force print
    forceurl:          Force printing final URL. (Deprecated)
    forcetitle:        Force printing title. (Deprecated)
    forceid:           Force printing ID. (Deprecated)
    forcethumbnail:    Force printing thumbnail URL. (Deprecated)
    forcedescription:  Force printing description. (Deprecated)
    forcefilename:     Force printing final filename. (Deprecated)
    forceduration:     Force printing duration. (Deprecated)
    forcejson:         Force printing info_dict as JSON.
    dump_single_json:  Force printing the info_dict of the whole playlist
                       (or video) as a single JSON line.
    force_write_download_archive: Force writing download archive regardless
                       of 'skip_download' or 'simulate'.
    simulate:          Do not download the video files. If unset (or None),
                       simulate only if listsubtitles, listformats or
                       list_thumbnails is used
    format:            Video format code. See "FORMAT SELECTION" for more details.
    allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded.
    ignore_no_formats_error: Ignore "No video formats" error. Useful for
                       extracting metadata even if the video is not actually
                       available for download (experimental)
    format_sort:       How to sort the video formats. See "Sorting Formats"
                       for more details.
    format_sort_force: Force the given format_sort. See "Sorting Formats"
                       for more details.
    allow_multiple_video_streams:  Allow multiple video streams to be merged
                       into a single file
    allow_multiple_audio_streams:  Allow multiple audio streams to be merged
                       into a single file
    check_formats:     Whether to test if the formats are downloadable.
                       Can be True (check all), False (check none)
                       or None (check only if requested by extractor)
    paths:             Dictionary of output paths. The allowed keys are 'home',
                       'temp' and the keys of OUTTMPL_TYPES (in utils.py)
    outtmpl:           Dictionary of templates for output names. Allowed keys
                       are 'default' and the keys of OUTTMPL_TYPES (in utils.py).
                       For compatibility with youtube-dl, a single string can also be used
    outtmpl_na_placeholder: Placeholder for unavailable meta fields.
    restrictfilenames: Do not allow "&" and spaces in file names
    trim_file_name:    Limit length of filename (extension excluded)
    windowsfilenames:  Force the filenames to be windows compatible
    ignoreerrors:      Do not stop on download/postprocessing errors.
                       Can be 'only_download' to ignore only download errors.
                       Default is 'only_download' for CLI, but False for API
    skip_playlist_after_errors: Number of allowed failures until the rest of
                       the playlist is skipped
    force_generic_extractor: Force downloader to use the generic extractor
    overwrites:        Overwrite all video and metadata files if True,
                       overwrite only non-video files if None
                       and don't overwrite any file if False
                       For compatibility with youtube-dl,
                       "nooverwrites" may also be used instead
    playliststart:     Playlist item to start at.
    playlistend:       Playlist item to end at.
    playlist_items:    Specific indices of playlist to download.
    playlistreverse:   Download playlist items in reverse order.
    playlistrandom:    Download playlist items in random order.
    matchtitle:        Download only matching titles.
    rejecttitle:       Reject downloads for matching titles.
    logger:            Log messages to a logging.Logger instance.
    logtostderr:       Log messages to stderr instead of stdout.
    writedescription:  Write the video description to a .description file
    writeinfojson:     Write the video metadata to a .info.json file
    clean_infojson:    Remove private fields from the infojson
    getcomments:       Extract video comments. This will not be written to disk
                       unless writeinfojson is also given
    writeannotations:  Write the video annotations to a .annotations.xml file
    writethumbnail:    Write the thumbnail image to a file
    allow_playlist_files: Whether to write playlists' description, infojson etc
                       also to disk when using the 'write*' options
    write_all_thumbnails: Write all thumbnail formats to files
    writelink:         Write an internet shortcut file, depending on the
                       current platform (.url/.webloc/.desktop)
    writeurllink:      Write a Windows internet shortcut file (.url)
    writewebloclink:   Write a macOS internet shortcut file (.webloc)
    writedesktoplink:  Write a Linux internet shortcut file (.desktop)
    writesubtitles:    Write the video subtitles to a file
    writeautomaticsub: Write the automatically generated subtitles to a file
    allsubtitles:      Deprecated - Use subtitleslangs = ['all']
                       Downloads all the subtitles of the video
                       (requires writesubtitles or writeautomaticsub)
    listsubtitles:     Lists all available subtitles for the video
    subtitlesformat:   The format code for subtitles
    subtitleslangs:    List of languages of the subtitles to download (can be regex).
                       The list may contain "all" to refer to all the available
                       subtitles. The language can be prefixed with a "-" to
                       exclude it from the requested languages. Eg: ['all', '-live_chat']
    keepvideo:         Keep the video file after post-processing
    daterange:         A DateRange object, download only if the upload_date is in the range.
    skip_download:     Skip the actual download of the video file
    cachedir:          Location of the cache files in the filesystem.
                       False to disable filesystem cache.
    noplaylist:        Download single video instead of a playlist if in doubt.
    age_limit:         An integer representing the user's age in years.
                       Unsuitable videos for the given age are skipped.
    min_views:         An integer representing the minimum view count the video
                       must have in order to not be skipped.
                       Videos without view count information are always
                       downloaded. None for no limit.
    max_views:         An integer representing the maximum view count.
                       Videos that are more popular than that are not
                       downloaded.
                       Videos without view count information are always
                       downloaded. None for no limit.
    download_archive:  File name of a file where all downloads are recorded.
                       Videos already present in the file are not downloaded
                       again.
    break_on_existing: Stop the download process after attempting to download a
                       file that is in the archive.
    break_on_reject:   Stop the download process when encountering a video that
                       has been filtered out.
    cookiefile:        File name where cookies should be read from and dumped to
    cookiesfrombrowser: A tuple containing the name of the browser and the profile
                       name/path from where cookies are loaded.
                       Eg: ('chrome', ) or ('vivaldi', 'default')
    nocheckcertificate: Do not verify SSL certificates
    prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
                       At the moment, this is only supported by YouTube.
    proxy:             URL of the proxy server to use
    geo_verification_proxy: URL of the proxy to use for IP address verification
                       on geo-restricted sites.
    socket_timeout:    Time to wait for unresponsive hosts, in seconds
    bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
    debug_printtraffic: Print out sent and received HTTP traffic
    include_ads:       Download ads as well
    default_search:    Prepend this string if an input URL is not valid.
                       'auto' for elaborate guessing
    encoding:          Use this encoding instead of the system-specified.
    extract_flat:      Do not resolve URLs, return the immediate result.
                       Pass in 'in_playlist' to only show this behavior for
                       playlist items.
    postprocessors:    A list of dictionaries, each with an entry
                       * key:  The name of the postprocessor. See
                               yt_dlp/postprocessor/__init__.py for a list.
                       * when: When to run the postprocessor. Can be one of
                               pre_process|before_dl|post_process|after_move.
                               Assumed to be 'post_process' if not given
    post_hooks:        A list of functions that get called as the final step
                       for each video file, after all postprocessors have been
                       called. The filename will be passed as the only argument.
    progress_hooks:    A list of functions that get called on download
                       progress, with a dictionary with the entries
                       * status: One of "downloading", "error", or "finished".
                                 Check this first and ignore unknown values.
                       * info_dict: The extracted info_dict

                       If status is one of "downloading" or "finished", the
                       following properties may also be present:
                       * filename: The final filename (always present)
                       * tmpfilename: The filename we're currently writing to
                       * downloaded_bytes: Bytes on disk
                       * total_bytes: Size of the whole file, None if unknown
                       * total_bytes_estimate: Guess of the eventual file size,
                                               None if unavailable.
                       * elapsed: The number of seconds since download started.
                       * eta: The estimated time in seconds, None if unknown
                       * speed: The download speed in bytes/second, None if
                                unknown
                       * fragment_index: The counter of the currently
                                         downloaded video fragment.
                       * fragment_count: The number of fragments (= individual
                                         files that will be merged)

                       Progress hooks are guaranteed to be called at least once
                       (with status "finished") if the download is successful.
    merge_output_format: Extension to use when merging formats.
    final_ext:         Expected final extension; used to detect when the file was
                       already downloaded and converted. "merge_output_format" is
                       replaced by this extension when given
    fixup:             Automatically correct known faults of the file.
                       One of:
                       - "never": do nothing
                       - "warn": only emit a warning
                       - "detect_or_warn": check whether we can do anything
                                           about it, warn otherwise (default)
    source_address:    Client-side IP address to bind to.
    call_home:         Boolean, true iff we are allowed to contact the
                       yt-dlp servers for debugging. (BROKEN)
    sleep_interval_requests: Number of seconds to sleep between requests
                       during extraction
    sleep_interval:    Number of seconds to sleep before each download when
                       used alone or a lower bound of a range for randomized
                       sleep before each download (minimum possible number
                       of seconds to sleep) when used along with
                       max_sleep_interval.
    max_sleep_interval: Upper bound of a range for randomized sleep before each
                       download (maximum possible number of seconds to sleep).
                       Must only be used along with sleep_interval.
                       Actual sleep time will be a random float from range
                       [sleep_interval; max_sleep_interval].
    sleep_interval_subtitles: Number of seconds to sleep before each subtitle download
    listformats:       Print an overview of available video formats and exit.
    list_thumbnails:   Print a table of all thumbnails and exit.
    match_filter:      A function that gets called with the info_dict of
                       every video.
                       If it returns a message, the video is ignored.
                       If it returns None, the video is downloaded.
                       match_filter_func in utils.py is one example for this.
    no_color:          Do not emit color codes in output.
    geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
                       HTTP header
    geo_bypass_country:
                       Two-letter ISO 3166-1 alpha-2 country code that will be
                       used for explicit geographic restriction bypassing via
                       faking X-Forwarded-For HTTP header
    geo_bypass_ip_block:
                       IP range in CIDR notation that will be used similarly to
                       geo_bypass_country

    The following options determine which downloader is picked:
    external_downloader: A dictionary of protocol keys and the executable of the
                       external downloader to use for it. The allowed protocols
                       are default|http|ftp|m3u8|dash|rtsp|rtmp|mms.
                       Set the value to 'native' to use the native downloader
    hls_prefer_native: Deprecated - Use external_downloader = {'m3u8': 'native'}
                       or {'m3u8': 'ffmpeg'} instead.
                       Use the native HLS downloader instead of ffmpeg/avconv
                       if True, otherwise use ffmpeg/avconv if False, otherwise
                       use downloader suggested by extractor if None.
    compat_opts:       Compatibility options. See "Differences in default behavior".
                       The following options do not work when used through the API:
                       filename, abort-on-error, multistreams, no-live-chat,
                       no-clean-infojson, no-playlist-metafiles, no-keep-subs.
                       Refer __init__.py for their implementation

    The following parameters are not used by YoutubeDL itself, they are used by
    the downloader (see yt_dlp/downloader/common.py):
    nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize,
    max_filesize, test, noresizebuffer, retries, continuedl, noprogress, consoletitle,
    xattr_set_filesize, external_downloader_args, hls_use_mpegts, http_chunk_size.

    The following options are used by the post processors:
    prefer_ffmpeg:     If False, use avconv instead of ffmpeg if both are available,
                       otherwise prefer ffmpeg. (avconv support is deprecated)
    ffmpeg_location:   Location of the ffmpeg/avconv binary; either the path
                       to the binary or its containing directory.
    postprocessor_args: A dictionary of postprocessor/executable keys (in lower case)
                       and a list of additional command-line arguments for the
                       postprocessor/executable. The dict can also have "PP+EXE" keys
                       which are used when the given exe is used by the given PP.
                       Use 'default' as the name for arguments to be passed to all PP
                       For compatibility with youtube-dl, a single list of args
                       can also be used

    The following options are used by the extractors:
    extractor_retries: Number of times to retry for known errors
    dynamic_mpd:       Whether to process dynamic DASH manifests (default: True)
    hls_split_discontinuity: Split HLS playlists to different formats at
                       discontinuities such as ad breaks (default: False)
    extractor_args:    A dictionary of arguments to be passed to the extractors.
                       See "EXTRACTOR ARGUMENTS" for details.
                       Eg: {'youtube': {'skip': ['dash', 'hls']}}
    youtube_include_dash_manifest: Deprecated - Use extractor_args instead.
                       If True (default), DASH manifests and related
                       data will be downloaded and processed by extractor.
                       You can reduce network I/O by disabling it if you don't
                       care about DASH. (only for youtube)
    youtube_include_hls_manifest: Deprecated - Use extractor_args instead.
                       If True (default), HLS manifests and related
                       data will be downloaded and processed by extractor.
                       You can reduce network I/O by disabling it if you don't
                       care about HLS. (only for youtube)
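
    Example (an illustrative sketch, not an exhaustive reference; the URL is a
    placeholder, and 'FFmpegMetadata' is just one valid postprocessor key):

        ydl_opts = {
            'format': 'bestvideo+bestaudio/best',
            'outtmpl': {'default': '%(title)s [%(id)s].%(ext)s'},
            'postprocessors': [{'key': 'FFmpegMetadata', 'when': 'post_process'}],
            'progress_hooks': [lambda d: print(d['status'])],
        }
        with YoutubeDL(ydl_opts) as ydl:
            ydl.extract_info('https://example.com/watch?v=XXXXXXXXXXX', download=True)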
452 """
453
454 _NUMERIC_FIELDS = set((
455 'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
456 'timestamp', 'upload_year', 'upload_month', 'upload_day',
457 'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
458 'average_rating', 'comment_count', 'age_limit',
459 'start_time', 'end_time',
460 'chapter_number', 'season_number', 'episode_number',
461 'track_number', 'disc_number', 'release_year',
462 'playlist_index',
463 ))
464
465 params = None
466 _ies = {}
467 _pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
468 _printed_messages = set()
469 _first_webpage_request = True
470 _download_retcode = None
471 _num_downloads = None
472 _playlist_level = 0
473 _playlist_urls = set()
474 _screen_file = None
475
    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options."""
        if params is None:
            params = {}
        self._ies = {}
        self._ies_instances = {}
        self._pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
        self._printed_messages = set()
        self._first_webpage_request = True
        self._post_hooks = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = {
            # Default parameters
            'nocheckcertificate': False,
        }
        self.params.update(params)
        self.cache = Cache(self)

        if sys.version_info < (3, 6):
            self.report_warning(
                'Python version %d.%d is not supported! Please update to Python 3.6 or above' % sys.version_info[:2])

        if self.params.get('allow_unplayable_formats'):
            self.report_warning(
                'You have asked for unplayable formats to be listed/downloaded. '
                'This is a developer option intended for debugging. '
                'If you experience any issues while using this option, DO NOT open a bug report')

        def check_deprecated(param, option, suggestion):
            if self.params.get(param) is not None:
                self.report_warning('%s is deprecated. Use %s instead' % (option, suggestion))
                return True
            return False

        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
        check_deprecated('useid', '--id', '-o "%(id)s.%(ext)s"')

        for msg in self.params.get('warnings', []):
            self.report_warning(msg)

        if self.params.get('overwrites') is None:
            self.params.pop('overwrites', None)
        elif self.params.get('nooverwrites') is not None:
            # nooverwrites was unnecessarily changed to overwrites
            # in 0c3d0f51778b153f65c21906031c2e091fcfb641
            # This ensures compatibility with both keys
            self.params['overwrites'] = not self.params['nooverwrites']
        else:
            self.params['nooverwrites'] = not self.params['overwrites']

        if params.get('bidi_workaround', False):
            try:
                import pty
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround. Make sure that fribidi is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.platform != 'win32'
                and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
                and not params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        self.outtmpl_dict = self.parse_outtmpl()

        # Creating format selector here allows us to catch syntax errors before the extraction
        self.format_selector = (
            None if self.params.get('format') is None
            else self.build_format_selector(self.params['format']))

        self._setup_opener()

        """Preload the archive, if any is specified"""
        def preload_download_archive(fn):
            if fn is None:
                return False
            self.write_debug('Loading archive file %r\n' % fn)
            try:
                with locked_file(fn, 'r', encoding='utf-8') as archive_file:
                    for line in archive_file:
                        self.archive.add(line.strip())
            except IOError as ioe:
                if ioe.errno != errno.ENOENT:
                    raise
                return False
            return True

        self.archive = set()
        preload_download_archive(self.params.get('download_archive'))

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        for pp_def_raw in self.params.get('postprocessors', []):
            pp_def = dict(pp_def_raw)
            when = pp_def.pop('when', 'post_process')
            pp_class = get_postprocessor(pp_def.pop('key'))
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp, when=when)

        for ph in self.params.get('post_hooks', []):
            self.add_post_hook(ph)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)

        register_socks_protocols()

    def warn_if_short_id(self, argv):
        # short YouTube ID starting with dash?
        idxs = [
            i for i, a in enumerate(argv)
            if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
        if idxs:
            correct_argv = (
                ['yt-dlp']
                + [a for i, a in enumerate(argv) if i not in idxs]
                + ['--'] + [argv[i] for i in idxs]
            )
            self.report_warning(
                'Long argument string detected. '
                'Use -- to separate parameters and URLs, like this:\n%s\n' %
                args_to_str(correct_argv))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        ie_key = ie.ie_key()
        self._ies[ie_key] = ie
        if not isinstance(ie, type):
            self._ies_instances[ie_key] = ie
            ie.set_downloader(self)

    def _get_info_extractor_class(self, ie_key):
        ie = self._ies.get(ie_key)
        if ie is None:
            ie = get_info_extractor(ie_key)
            self.add_info_extractor(ie)
        return ie

    def get_info_extractor(self, ie_key):
        """
        Get an instance of an IE with name ie_key; it will try to get one from
        the _ies list, and if there is no instance it will create a new one
        and add it to the extractor list.
        """
        ie = self._ies_instances.get(ie_key)
        if ie is None:
            ie = get_info_extractor(ie_key)()
            self.add_info_extractor(ie)
        return ie

    def add_default_info_extractors(self):
        """
        Add the InfoExtractors returned by gen_extractors to the end of the list
        """
        for ie in gen_extractor_classes():
            self.add_info_extractor(ie)

    def add_post_processor(self, pp, when='post_process'):
        """Add a PostProcessor object to the end of the chain."""
        self._pps[when].append(pp)
        pp.set_downloader(self)

    def add_post_hook(self, ph):
        """Add the post hook"""
        self._post_hooks.append(ph)

    def add_progress_hook(self, ph):
        """Add the progress hook (currently only for the file downloader)"""
        self._progress_hooks.append(ph)

    def _bidi_workaround(self, message):
        if not hasattr(self, '_output_channel'):
            return message

        assert hasattr(self, '_output_process')
        assert isinstance(message, compat_str)
        line_count = message.count('\n') + 1
        self._output_process.stdin.write((message + '\n').encode('utf-8'))
        self._output_process.stdin.flush()
        res = ''.join(self._output_channel.readline().decode('utf-8')
                      for _ in range(line_count))
        return res[:-len('\n')]

    def _write_string(self, message, out=None, only_once=False):
        if only_once:
            if message in self._printed_messages:
                return
            self._printed_messages.add(message)
        write_string(message, out=out, encoding=self.params.get('encoding'))

    def to_stdout(self, message, skip_eol=False, quiet=False):
        """Print message to stdout"""
        if self.params.get('logger'):
            self.params['logger'].debug(message)
        elif not quiet or self.params.get('verbose'):
            self._write_string(
                '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')),
                self._err_file if quiet else self._screen_file)

    def to_stderr(self, message, only_once=False):
        """Print message to stderr"""
        assert isinstance(message, compat_str)
        if self.params.get('logger'):
            self.params['logger'].error(message)
        else:
            self._write_string('%s\n' % self._bidi_workaround(message), self._err_file, only_once=only_once)

    def to_console_title(self, message):
        if not self.params.get('consoletitle', False):
            return
        if compat_os_name == 'nt':
            if ctypes.windll.kernel32.GetConsoleWindow():
                # c_wchar_p() might not be necessary if `message` is
                # already of type unicode()
                ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            self._write_string('\033]0;%s\007' % message, self._screen_file)

    def save_console_title(self):
        if not self.params.get('consoletitle', False):
            return
        if self.params.get('simulate'):
            return
        if compat_os_name != 'nt' and 'TERM' in os.environ:
            # Save the title on stack
            self._write_string('\033[22;0t', self._screen_file)

    def restore_console_title(self):
        if not self.params.get('consoletitle', False):
            return
        if self.params.get('simulate'):
            return
        if compat_os_name != 'nt' and 'TERM' in os.environ:
            # Restore the title from stack
            self._write_string('\033[23;0t', self._screen_file)

    def __enter__(self):
        self.save_console_title()
        return self

    def __exit__(self, *args):
        self.restore_console_title()

        if self.params.get('cookiefile') is not None:
            self.cookiejar.save(ignore_discard=True, ignore_expires=True)

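    # Illustrative sketch (not part of upstream): using YoutubeDL as a context
    # manager ensures save_console_title/restore_console_title run and that,
    # when 'cookiefile' is set, cookies are flushed on exit. The URL is a
    # placeholder:
    #
    #     with YoutubeDL({'quiet': True}) as ydl:
    #         info = ydl.extract_info('https://example.com/video', download=False)
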
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on whether the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            if tb:
                self.to_stderr(tb)
        if not self.params.get('ignoreerrors'):
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        self._download_retcode = 1

    def to_screen(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode"""
        self.to_stdout(
            message, skip_eol, quiet=self.params.get('quiet', False))

    def report_warning(self, message, only_once=False):
        '''
        Print the message to stderr; it will be prefixed with 'WARNING:'.
        If stderr is a tty file the 'WARNING:' will be colored.
        '''
        if self.params.get('logger') is not None:
            self.params['logger'].warning(message)
        else:
            if self.params.get('no_warnings'):
                return
            if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
                _msg_header = '\033[0;33mWARNING:\033[0m'
            else:
                _msg_header = 'WARNING:'
            warning_message = '%s %s' % (_msg_header, message)
            self.to_stderr(warning_message, only_once)

    def report_error(self, message, tb=None):
        '''
        Do the same as trouble, but prefixes the message with 'ERROR:', colored
        in red if stderr is a tty file.
        '''
        if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
            _msg_header = '\033[0;31mERROR:\033[0m'
        else:
            _msg_header = 'ERROR:'
        error_message = '%s %s' % (_msg_header, message)
        self.trouble(error_message, tb)

    def write_debug(self, message, only_once=False):
        '''Log debug message to the logger, or print it to stderr'''
        if not self.params.get('verbose', False):
            return
        message = '[debug] %s' % message
        if self.params.get('logger'):
            self.params['logger'].debug(message)
        else:
            self.to_stderr(message, only_once)

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen('[download] %s has already been downloaded' % file_name)
        except UnicodeEncodeError:
            self.to_screen('[download] The file has already been downloaded')

    def report_file_delete(self, file_name):
        """Report that existing file will be deleted."""
        try:
            self.to_screen('Deleting existing file %s' % file_name)
        except UnicodeEncodeError:
            self.to_screen('Deleting existing file')

    def raise_no_formats(self, info, forced=False):
        has_drm = info.get('__has_drm')
        msg = 'This video is DRM protected' if has_drm else 'No video formats found!'
        expected = self.params.get('ignore_no_formats_error')
        if forced or not expected:
            raise ExtractorError(msg, video_id=info['id'], ie=info['extractor'],
                                 expected=has_drm or expected)
        else:
            self.report_warning(msg)

    def parse_outtmpl(self):
        outtmpl_dict = self.params.get('outtmpl', {})
        if not isinstance(outtmpl_dict, dict):
            outtmpl_dict = {'default': outtmpl_dict}
        outtmpl_dict.update({
            k: v for k, v in DEFAULT_OUTTMPL.items()
            if outtmpl_dict.get(k) is None})
        for key, val in outtmpl_dict.items():
            if isinstance(val, bytes):
                self.report_warning(
                    'Parameter outtmpl is bytes, but should be a unicode string. '
                    'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.')
        return outtmpl_dict

    def get_output_path(self, dir_type='', filename=None):
        paths = self.params.get('paths', {})
        assert isinstance(paths, dict)
        path = os.path.join(
            expand_path(paths.get('home', '').strip()),
            expand_path(paths.get(dir_type, '').strip()) if dir_type else '',
            filename or '')

        # Temporary fix for #4787
        # 'Treat' all problem characters by passing filename through preferredencoding
        # to workaround encoding issues with subprocess on python2 @ Windows
        if sys.version_info < (3, 0) and sys.platform == 'win32':
            path = encodeFilename(path, True).decode(preferredencoding())
        return sanitize_path(path, force=self.params.get('windowsfilenames'))

    @staticmethod
    def _outtmpl_expandpath(outtmpl):
        # expand_path translates '%%' into '%' and '$$' into '$'
        # correspondingly that is not what we want since we need to keep
        # '%%' intact for template dict substitution step. Working around
        # with boundary-alike separator hack.
        sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
        outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))

        # outtmpl should be expand_path'ed before template dict substitution
        # because meta fields may contain env variables we don't want to
        # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
        # title "Hello $PATH", we don't want `$PATH` to be expanded.
        return expand_path(outtmpl).replace(sep, '')

    @staticmethod
    def escape_outtmpl(outtmpl):
        ''' Escape any remaining strings like %s, %abc% etc. '''
        return re.sub(
            STR_FORMAT_RE_TMPL.format('', '(?![%(\0])'),
            lambda mobj: ('' if mobj.group('has_key') else '%') + mobj.group(0),
            outtmpl)

    @classmethod
    def validate_outtmpl(cls, outtmpl):
        ''' @return None or Exception object '''
        outtmpl = re.sub(
            STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBU]'),
            lambda mobj: f'{mobj.group(0)[:-1]}s',
            cls._outtmpl_expandpath(outtmpl))
        try:
            cls.escape_outtmpl(outtmpl) % collections.defaultdict(int)
            return None
        except ValueError as err:
            return err

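    # Illustrative sketch (not part of upstream): validate_outtmpl should
    # return None for a well-formed template and the offending ValueError
    # otherwise, letting callers surface template mistakes before extraction:
    #
    #     assert YoutubeDL.validate_outtmpl('%(title)s-%(id)s.%(ext)s') is None
    #     assert isinstance(YoutubeDL.validate_outtmpl('%(title'), ValueError)
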
    def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None):
        """ Make the template and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict """
        info_dict.setdefault('epoch', int(time.time()))  # keep epoch consistent once set

        info_dict = dict(info_dict)  # Do not sanitize so as not to consume LazyList
        for key in ('__original_infodict', '__postprocessors'):
            info_dict.pop(key, None)
        info_dict['duration_string'] = (  # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
            formatSeconds(info_dict['duration'], '-' if sanitize else ':')
            if info_dict.get('duration', None) is not None
            else None)
        info_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
        if info_dict.get('resolution') is None:
            info_dict['resolution'] = self.format_resolution(info_dict, default=None)

        # For fields playlist_index and autonumber convert all occurrences
        # of %(field)s to %(field)0Nd for backward compatibility
        field_size_compat_map = {
            'playlist_index': len(str(info_dict.get('_last_playlist_index') or '')),
            'autonumber': self.params.get('autonumber_size') or 5,
        }

        TMPL_DICT = {}
        EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBU]'))
        MATH_FUNCTIONS = {
            '+': float.__add__,
            '-': float.__sub__,
        }
        # Field is of the form key1.key2...
        # where keys (except first) can be string, int or slice
        FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)')
        MATH_FIELD_RE = r'''{field}|{num}'''.format(field=FIELD_RE, num=r'-?\d+(?:\.\d+)?')
        MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys()))
        INTERNAL_FORMAT_RE = re.compile(r'''(?x)
            (?P<negate>-)?
            (?P<fields>{field})
            (?P<maths>(?:{math_op}{math_field})*)
            (?:>(?P<strf_format>.+?))?
            (?P<alternate>(?<!\\),[^|)]+)?
            (?:\|(?P<default>.*?))?
            $'''.format(field=FIELD_RE, math_op=MATH_OPERATORS_RE, math_field=MATH_FIELD_RE))

        def _traverse_infodict(k):
            k = k.split('.')
            if k[0] == '':
                k.pop(0)
            return traverse_obj(info_dict, k, is_user_input=True, traverse_string=True)

        def get_value(mdict):
            # Object traversal
            value = _traverse_infodict(mdict['fields'])
            # Negative
            if mdict['negate']:
                value = float_or_none(value)
                if value is not None:
                    value *= -1
            # Do maths
            offset_key = mdict['maths']
            if offset_key:
                value = float_or_none(value)
                operator = None
                while offset_key:
                    item = re.match(
                        MATH_FIELD_RE if operator else MATH_OPERATORS_RE,
                        offset_key).group(0)
                    offset_key = offset_key[len(item):]
                    if operator is None:
                        operator = MATH_FUNCTIONS[item]
                        continue
                    item, multiplier = (item[1:], -1) if item[0] == '-' else (item, 1)
                    offset = float_or_none(item)
                    if offset is None:
                        offset = float_or_none(_traverse_infodict(item))
                    try:
                        value = operator(value, multiplier * offset)
                    except (TypeError, ZeroDivisionError):
                        return None
                    operator = None
            # Datetime formatting
            if mdict['strf_format']:
                value = strftime_or_none(value, mdict['strf_format'].replace('\\,', ','))

            return value

        na = self.params.get('outtmpl_na_placeholder', 'NA')

        def _dumpjson_default(obj):
            if isinstance(obj, (set, LazyList)):
                return list(obj)
            raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')

        def create_key(outer_mobj):
            if not outer_mobj.group('has_key'):
                return f'%{outer_mobj.group(0)}'
            key = outer_mobj.group('key')
            mobj = re.match(INTERNAL_FORMAT_RE, key)
            initial_field = mobj.group('fields').split('.')[-1] if mobj else ''
            value, default = None, na
            while mobj:
                mobj = mobj.groupdict()
                default = mobj['default'] if mobj['default'] is not None else default
                value = get_value(mobj)
                if value is None and mobj['alternate']:
                    mobj = re.match(INTERNAL_FORMAT_RE, mobj['alternate'][1:])
                else:
                    break

            fmt = outer_mobj.group('format')
            if fmt == 's' and value is not None and key in field_size_compat_map.keys():
                fmt = '0{:d}d'.format(field_size_compat_map[key])

            value = default if value is None else value

            str_fmt = f'{fmt[:-1]}s'
            if fmt[-1] == 'l':  # list
                delim = '\n' if '#' in (outer_mobj.group('conversion') or '') else ', '
                value, fmt = delim.join(variadic(value)), str_fmt
            elif fmt[-1] == 'j':  # json
                value, fmt = json.dumps(value, default=_dumpjson_default), str_fmt
            elif fmt[-1] == 'q':  # quoted
                value, fmt = compat_shlex_quote(str(value)), str_fmt
            elif fmt[-1] == 'B':  # bytes
                value = f'%{str_fmt}'.encode('utf-8') % str(value).encode('utf-8')
                value, fmt = value.decode('utf-8', 'ignore'), 's'
            elif fmt[-1] == 'U':  # unicode normalized
                opts = outer_mobj.group('conversion') or ''
                value, fmt = unicodedata.normalize(
                    # "+" = compatibility equivalence, "#" = NFD
                    'NF%s%s' % ('K' if '+' in opts else '', 'D' if '#' in opts else 'C'),
                    value), str_fmt
            elif fmt[-1] == 'c':
                if value:
                    value = str(value)[0]
                else:
                    fmt = str_fmt
            elif fmt[-1] not in 'rs':  # numeric
                value = float_or_none(value)
                if value is None:
                    value, fmt = default, 's'

            if sanitize:
                if fmt[-1] == 'r':
                    # If value is an object, sanitize might convert it to a string
                    # So we convert it to repr first
                    value, fmt = repr(value), str_fmt
                if fmt[-1] in 'csr':
                    value = sanitize(initial_field, value)

            key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format'))
            TMPL_DICT[key] = value
            return '{prefix}%({key}){fmt}'.format(key=key, fmt=fmt, prefix=outer_mobj.group('prefix'))

        return EXTERNAL_FORMAT_RE.sub(create_key, outtmpl), TMPL_DICT

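    # Illustrative sketch of the template mini-language matched by
    # INTERNAL_FORMAT_RE in prepare_outtmpl (the field names below are
    # placeholders; the syntax is what the regex above accepts):
    #
    #     %(title)s                    plain field lookup
    #     %(playlist_index+10)d        arithmetic via MATH_FUNCTIONS
    #     %(upload_date>%Y-%m-%d)s     strftime-style formatting after '>'
    #     %(artist,creator|unknown)s   alternate fields after ',' and a
    #                                  literal default after '|'
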
    def _prepare_filename(self, info_dict, tmpl_type='default'):
        try:
            sanitize = lambda k, v: sanitize_filename(
                compat_str(v),
                restricted=self.params.get('restrictfilenames'),
                is_id=(k == 'id' or k.endswith('_id')))
            outtmpl = self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default'])
            outtmpl, template_dict = self.prepare_outtmpl(outtmpl, info_dict, sanitize)
            outtmpl = self.escape_outtmpl(self._outtmpl_expandpath(outtmpl))
            filename = outtmpl % template_dict

            force_ext = OUTTMPL_TYPES.get(tmpl_type)
            if filename and force_ext is not None:
                filename = replace_extension(filename, force_ext, info_dict.get('ext'))

            # https://github.com/blackjack4494/youtube-dlc/issues/85
            trim_file_name = self.params.get('trim_file_name', False)
            if trim_file_name:
                fn_groups = filename.rsplit('.')
                ext = fn_groups[-1]
                sub_ext = ''
                if len(fn_groups) > 2:
                    sub_ext = fn_groups[-2]
                filename = '.'.join(filter(None, [fn_groups[0][:trim_file_name], sub_ext, ext]))

            return filename
        except ValueError as err:
            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
            return None

    def prepare_filename(self, info_dict, dir_type='', warn=False):
        """Generate the output filename."""

        filename = self._prepare_filename(info_dict, dir_type or 'default')
        if not filename and dir_type not in ('', 'temp'):
            return ''

        if warn:
            if not self.params.get('paths'):
                pass
            elif filename == '-':
                self.report_warning('--paths is ignored when outputting to stdout', only_once=True)
            elif os.path.isabs(filename):
                self.report_warning('--paths is ignored since an absolute path is given in output template', only_once=True)
        if filename == '-' or not filename:
            return filename

        return self.get_output_path(dir_type, filename)

    def _match_entry(self, info_dict, incomplete=False, silent=False):
        """ Returns None if the file should be downloaded """

        video_title = info_dict.get('title', info_dict.get('id', 'video'))

        def check_filter():
            if 'title' in info_dict:
                # This can happen when we're just evaluating the playlist
                title = info_dict['title']
                matchtitle = self.params.get('matchtitle', False)
                if matchtitle:
                    if not re.search(matchtitle, title, re.IGNORECASE):
                        return '"' + title + '" title did not match pattern "' + matchtitle + '"'
                rejecttitle = self.params.get('rejecttitle', False)
                if rejecttitle:
                    if re.search(rejecttitle, title, re.IGNORECASE):
                        return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
            date = info_dict.get('upload_date')
            if date is not None:
                dateRange = self.params.get('daterange', DateRange())
                if date not in dateRange:
                    return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
            view_count = info_dict.get('view_count')
            if view_count is not None:
                min_views = self.params.get('min_views')
                if min_views is not None and view_count < min_views:
                    return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
                max_views = self.params.get('max_views')
                if max_views is not None and view_count > max_views:
                    return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
            if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
                return 'Skipping "%s" because it is age restricted' % video_title

            match_filter = self.params.get('match_filter')
            if match_filter is not None:
                try:
                    ret = match_filter(info_dict, incomplete=incomplete)
                except TypeError:
                    # For backward compatibility
                    ret = None if incomplete else match_filter(info_dict)
                if ret is not None:
                    return ret
            return None

        if self.in_download_archive(info_dict):
            reason = '%s has already been recorded in the archive' % video_title
            break_opt, break_err = 'break_on_existing', ExistingVideoReached
        else:
            reason = check_filter()
            break_opt, break_err = 'break_on_reject', RejectedVideoReached
        if reason is not None:
            if not silent:
                self.to_screen('[download] ' + reason)
            if self.params.get(break_opt, False):
                raise break_err()
        return reason

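    # Illustrative sketch of a custom 'match_filter' callable as consumed by
    # check_filter above (the 600-second threshold is arbitrary): return None
    # to accept a video, or a message string explaining why it is skipped.
    #
    #     def skip_long_videos(info_dict, incomplete=False):
    #         duration = info_dict.get('duration')
    #         if duration and duration > 600:
    #             return 'Skipping %s: longer than 10 minutes' % info_dict['id']
    #         return None  # None means "download it"
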
1181 @staticmethod
1182 def add_extra_info(info_dict, extra_info):
1183 '''Set the keys from extra_info in info dict if they are missing'''
1184 for key, value in extra_info.items():
1185 info_dict.setdefault(key, value)
1186
1187 def extract_info(self, url, download=True, ie_key=None, extra_info=None,
1188 process=True, force_generic_extractor=False):
1189 """
1190 Return a list with a dictionary for each video extracted.
1191
1192 Arguments:
1193 url -- URL to extract
1194
1195 Keyword arguments:
1196 download -- whether to download videos during extraction
1197 ie_key -- extractor key hint
1198 extra_info -- dictionary containing the extra values to add to each result
1199 process -- whether to resolve all unresolved references (URLs, playlist items),
1200 must be True for download to work.
1201 force_generic_extractor -- force using the generic extractor
1202 """
1203
1204 if extra_info is None:
1205 extra_info = {}
1206
1207 if not ie_key and force_generic_extractor:
1208 ie_key = 'Generic'
1209
1210 if ie_key:
1211 ies = {ie_key: self._get_info_extractor_class(ie_key)}
1212 else:
1213 ies = self._ies
1214
1215 for ie_key, ie in ies.items():
1216 if not ie.suitable(url):
1217 continue
1218
1219 if not ie.working():
1220 self.report_warning('The program functionality for this site has been marked as broken, '
1221 'and will probably not work.')
1222
1223 temp_id = ie.get_temp_id(url)
1224 if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
1225 self.to_screen("[%s] %s: has already been recorded in archive" % (
1226 ie_key, temp_id))
1227 break
1228 return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process)
1229 else:
1230 self.report_error('no suitable InfoExtractor for URL %s' % url)
1231
1232 def __handle_extraction_exceptions(func):
1233
1234 def wrapper(self, *args, **kwargs):
1235 try:
1236 return func(self, *args, **kwargs)
1237 except GeoRestrictedError as e:
1238 msg = e.msg
1239 if e.countries:
1240 msg += '\nThis video is available in %s.' % ', '.join(
1241 map(ISO3166Utils.short2full, e.countries))
1242 msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
1243 self.report_error(msg)
1244 except ExtractorError as e: # An error we somewhat expected
1245 self.report_error(compat_str(e), e.format_traceback())
1246 except ThrottledDownload:
1247 self.to_stderr('\r')
1248 self.report_warning('The download speed is below throttle limit. Re-extracting data')
1249 return wrapper(self, *args, **kwargs)
1250 except (MaxDownloadsReached, ExistingVideoReached, RejectedVideoReached, LazyList.IndexError):
1251 raise
1252 except Exception as e:
1253 if self.params.get('ignoreerrors'):
1254 self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
1255 else:
1256 raise
1257 return wrapper
1258
1259 @__handle_extraction_exceptions
1260 def __extract_info(self, url, ie, download, extra_info, process):
1261 ie_result = ie.extract(url)
1262 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
1263 return
1264 if isinstance(ie_result, list):
1265 # Backwards compatibility: old IE result format
1266 ie_result = {
1267 '_type': 'compat_list',
1268 'entries': ie_result,
1269 }
1270 if extra_info.get('original_url'):
1271 ie_result.setdefault('original_url', extra_info['original_url'])
1272 self.add_default_extra_info(ie_result, ie, url)
1273 if process:
1274 return self.process_ie_result(ie_result, download, extra_info)
1275 else:
1276 return ie_result
1277
1278 def add_default_extra_info(self, ie_result, ie, url):
1279 if url is not None:
1280 self.add_extra_info(ie_result, {
1281 'webpage_url': url,
1282 'original_url': url,
1283 'webpage_url_basename': url_basename(url),
1284 })
1285 if ie is not None:
1286 self.add_extra_info(ie_result, {
1287 'extractor': ie.IE_NAME,
1288 'extractor_key': ie.ie_key(),
1289 })
1290
1291 def process_ie_result(self, ie_result, download=True, extra_info=None):
1292 """
1293 Take the result of the ie(may be modified) and resolve all unresolved
1294 references (URLs, playlist items).
1295
1296 It will also download the videos if 'download'.
1297 Returns the resolved ie_result.
1298 """
1299 if extra_info is None:
1300 extra_info = {}
1301 result_type = ie_result.get('_type', 'video')
1302
1303 if result_type in ('url', 'url_transparent'):
1304 ie_result['url'] = sanitize_url(ie_result['url'])
1305 if ie_result.get('original_url'):
1306 extra_info.setdefault('original_url', ie_result['original_url'])
1307
1308 extract_flat = self.params.get('extract_flat', False)
1309 if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
1310 or extract_flat is True):
1311 info_copy = ie_result.copy()
1312 ie = try_get(ie_result.get('ie_key'), self.get_info_extractor)
1313 if ie and not ie_result.get('id'):
1314 info_copy['id'] = ie.get_temp_id(ie_result['url'])
1315 self.add_default_extra_info(info_copy, ie, ie_result['url'])
1316 self.add_extra_info(info_copy, extra_info)
1317 self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True)
1318 if self.params.get('force_write_download_archive', False):
1319 self.record_download_archive(info_copy)
1320 return ie_result
1321
1322 if result_type == 'video':
1323 self.add_extra_info(ie_result, extra_info)
1324 ie_result = self.process_video_result(ie_result, download=download)
1325 additional_urls = (ie_result or {}).get('additional_urls')
1326 if additional_urls:
1327 # TODO: Improve MetadataParserPP to allow setting a list
1328 if isinstance(additional_urls, compat_str):
1329 additional_urls = [additional_urls]
1330 self.to_screen(
1331 '[info] %s: %d additional URL(s) requested' % (ie_result['id'], len(additional_urls)))
1332 self.write_debug('Additional URLs: "%s"' % '", "'.join(additional_urls))
1333 ie_result['additional_entries'] = [
1334 self.extract_info(
1335 url, download, extra_info,
1336 force_generic_extractor=self.params.get('force_generic_extractor'))
1337 for url in additional_urls
1338 ]
1339 return ie_result
1340 elif result_type == 'url':
1341 # We have to add extra_info to the results because it may be
1342 # contained in a playlist
1343 return self.extract_info(
1344 ie_result['url'], download,
1345 ie_key=ie_result.get('ie_key'),
1346 extra_info=extra_info)
1347 elif result_type == 'url_transparent':
1348 # Use the information from the embedding page
1349 info = self.extract_info(
1350 ie_result['url'], ie_key=ie_result.get('ie_key'),
1351 extra_info=extra_info, download=False, process=False)
1352
1353 # extract_info may return None when ignoreerrors is enabled and
1354 # extraction failed with an error, don't crash and return early
1355 # in this case
1356 if not info:
1357 return info
1358
1359 force_properties = dict(
1360 (k, v) for k, v in ie_result.items() if v is not None)
1361 for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
1362 if f in force_properties:
1363 del force_properties[f]
1364 new_result = info.copy()
1365 new_result.update(force_properties)
1366
1367 # Extracted info may not be a video result (i.e.
1368 # info.get('_type', 'video') != video) but rather an url or
1369 # url_transparent. In such cases outer metadata (from ie_result)
1370 # should be propagated to inner one (info). For this to happen
1371 # _type of info should be overridden with url_transparent. This
1372 # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
1373 if new_result.get('_type') == 'url':
1374 new_result['_type'] = 'url_transparent'
1375
1376 return self.process_ie_result(
1377 new_result, download=download, extra_info=extra_info)
1378 elif result_type in ('playlist', 'multi_video'):
1379 # Protect from infinite recursion due to recursively nested playlists
1380 # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
1381 webpage_url = ie_result['webpage_url']
1382 if webpage_url in self._playlist_urls:
1383 self.to_screen(
1384 '[download] Skipping already downloaded playlist: %s'
1385 % ie_result.get('title') or ie_result.get('id'))
1386 return
1387
1388 self._playlist_level += 1
1389 self._playlist_urls.add(webpage_url)
1390 self._sanitize_thumbnails(ie_result)
1391 try:
1392 return self.__process_playlist(ie_result, download)
1393 finally:
1394 self._playlist_level -= 1
1395 if not self._playlist_level:
1396 self._playlist_urls.clear()
1397 elif result_type == 'compat_list':
1398 self.report_warning(
1399 'Extractor %s returned a compat_list result. '
1400 'It needs to be updated.' % ie_result.get('extractor'))
1401
1402 def _fixup(r):
1403 self.add_extra_info(r, {
1404 'extractor': ie_result['extractor'],
1405 'webpage_url': ie_result['webpage_url'],
1406 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1407 'extractor_key': ie_result['extractor_key'],
1408 })
1409 return r
1410 ie_result['entries'] = [
1411 self.process_ie_result(_fixup(r), download, extra_info)
1412 for r in ie_result['entries']
1413 ]
1414 return ie_result
1415 else:
1416 raise Exception('Invalid result type: %s' % result_type)
1417
1418 def _ensure_dir_exists(self, path):
1419 return make_dir(path, self.report_error)
1420
1421 def __process_playlist(self, ie_result, download):
1422 # We process each entry in the playlist
1423 playlist = ie_result.get('title') or ie_result.get('id')
1424 self.to_screen('[download] Downloading playlist: %s' % playlist)
1425
1426 if 'entries' not in ie_result:
1427 raise EntryNotInPlaylist()
1428 incomplete_entries = bool(ie_result.get('requested_entries'))
1429 if incomplete_entries:
1430 def fill_missing_entries(entries, indexes):
1431 ret = [None] * max(indexes) # not max(*indexes), which fails for single-element lists
1432 for i, entry in zip(indexes, entries):
1433 ret[i - 1] = entry
1434 return ret
1435 ie_result['entries'] = fill_missing_entries(ie_result['entries'], ie_result['requested_entries'])
1436
1437 playlist_results = []
1438
1439 playliststart = self.params.get('playliststart', 1)
1440 playlistend = self.params.get('playlistend')
1441 # For backwards compatibility, interpret -1 as whole list
1442 if playlistend == -1:
1443 playlistend = None
1444
1445 playlistitems_str = self.params.get('playlist_items')
1446 playlistitems = None
1447 if playlistitems_str is not None:
1448 def iter_playlistitems(spec):
1449 for string_segment in spec.split(','):
1450 if '-' in string_segment:
1451 start, end = string_segment.split('-')
1452 for item in range(int(start), int(end) + 1):
1453 yield item
1454 else:
1455 yield int(string_segment)
1456 playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
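# Illustration (hypothetical input): with --playlist-items "1-3,7",
# iter_playlistitems yields 1, 2, 3, 7, and orderedSet then drops any
# duplicates while preserving the given order.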
1457
1458 ie_entries = ie_result['entries']
1459 msg = (
1460 'Downloading %d videos' if not isinstance(ie_entries, list)
1461 else 'Collected %d videos; downloading %%d of them' % len(ie_entries))
1462
1463 if isinstance(ie_entries, list):
1464 def get_entry(i):
1465 return ie_entries[i - 1]
1466 else:
1467 if not isinstance(ie_entries, PagedList):
1468 ie_entries = LazyList(ie_entries)
1469
1470 def get_entry(i):
1471 return YoutubeDL.__handle_extraction_exceptions(
1472 lambda self, i: ie_entries[i - 1]
1473 )(self, i)
1474
1475 entries = []
1476 items = playlistitems if playlistitems is not None else itertools.count(playliststart)
1477 for i in items:
1478 if i == 0:
1479 continue
1480 if playlistitems is None and playlistend is not None and playlistend < i:
1481 break
1482 entry = None
1483 try:
1484 entry = get_entry(i)
1485 if entry is None:
1486 raise EntryNotInPlaylist()
1487 except (IndexError, EntryNotInPlaylist):
1488 if incomplete_entries:
1489 raise EntryNotInPlaylist()
1490 elif not playlistitems:
1491 break
1492 entries.append(entry)
1493 try:
1494 if entry is not None:
1495 self._match_entry(entry, incomplete=True, silent=True)
1496 except (ExistingVideoReached, RejectedVideoReached):
1497 break
1498 ie_result['entries'] = entries
1499
1500 # Save playlist_index before re-ordering
1501 entries = [
1502 ((playlistitems[i - 1] if playlistitems else i + playliststart - 1), entry)
1503 for i, entry in enumerate(entries, 1)
1504 if entry is not None]
1505 n_entries = len(entries)
1506
1507 if not playlistitems and (playliststart or playlistend):
1508 playlistitems = list(range(playliststart, playliststart + n_entries))
1509 ie_result['requested_entries'] = playlistitems
1510
1511 if self.params.get('allow_playlist_files', True):
1512 ie_copy = {
1513 'playlist': playlist,
1514 'playlist_id': ie_result.get('id'),
1515 'playlist_title': ie_result.get('title'),
1516 'playlist_uploader': ie_result.get('uploader'),
1517 'playlist_uploader_id': ie_result.get('uploader_id'),
1518 'playlist_index': 0,
1519 }
1520 ie_copy.update(dict(ie_result))
1521
1522 if self._write_info_json('playlist', ie_result,
1523 self.prepare_filename(ie_copy, 'pl_infojson')) is None:
1524 return
1525 if self._write_description('playlist', ie_result,
1526 self.prepare_filename(ie_copy, 'pl_description')) is None:
1527 return
1528 # TODO: This should be passed to ThumbnailsConvertor if necessary
1529 self._write_thumbnails('playlist', ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail'))
1530
1531 if self.params.get('playlistreverse', False):
1532 entries = entries[::-1]
1533 if self.params.get('playlistrandom', False):
1534 random.shuffle(entries)
1535
1536 x_forwarded_for = ie_result.get('__x_forwarded_for_ip')
1537
1538 self.to_screen('[%s] playlist %s: %s' % (ie_result['extractor'], playlist, msg % n_entries))
1539 failures = 0
1540 max_failures = self.params.get('skip_playlist_after_errors') or float('inf')
1541 for i, entry_tuple in enumerate(entries, 1):
1542 playlist_index, entry = entry_tuple
1543 if 'playlist-index' in self.params.get('compat_opts', []):
1544 playlist_index = playlistitems[i - 1] if playlistitems else i + playliststart - 1
1545 self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
1546 # This __x_forwarded_for_ip thing is a bit ugly but requires
1547 # minimal changes
1548 if x_forwarded_for:
1549 entry['__x_forwarded_for_ip'] = x_forwarded_for
1550 extra = {
1551 'n_entries': n_entries,
1552 '_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries),
1553 'playlist_index': playlist_index,
1554 'playlist_autonumber': i,
1555 'playlist': playlist,
1556 'playlist_id': ie_result.get('id'),
1557 'playlist_title': ie_result.get('title'),
1558 'playlist_uploader': ie_result.get('uploader'),
1559 'playlist_uploader_id': ie_result.get('uploader_id'),
1560 'extractor': ie_result['extractor'],
1561 'webpage_url': ie_result['webpage_url'],
1562 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1563 'extractor_key': ie_result['extractor_key'],
1564 }
1565
1566 if self._match_entry(entry, incomplete=True) is not None:
1567 continue
1568
1569 entry_result = self.__process_iterable_entry(entry, download, extra)
1570 if not entry_result:
1571 failures += 1
1572 if failures >= max_failures:
1573 self.report_error(
1574 'Skipping the remaining entries in playlist "%s" since %d items failed extraction' % (playlist, failures))
1575 break
1576 # TODO: skip failed (empty) entries?
1577 playlist_results.append(entry_result)
1578 ie_result['entries'] = playlist_results
1579 self.to_screen('[download] Finished downloading playlist: %s' % playlist)
1580 return ie_result
1581
1582 @__handle_extraction_exceptions
1583 def __process_iterable_entry(self, entry, download, extra_info):
1584 return self.process_ie_result(
1585 entry, download=download, extra_info=extra_info)
1586
1587 def _build_format_filter(self, filter_spec):
1588 " Returns a function to filter the formats according to the filter_spec "
1589
1590 OPERATORS = {
1591 '<': operator.lt,
1592 '<=': operator.le,
1593 '>': operator.gt,
1594 '>=': operator.ge,
1595 '=': operator.eq,
1596 '!=': operator.ne,
1597 }
1598 operator_rex = re.compile(r'''(?x)\s*
1599 (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)\s*
1600 (?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1601 (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s*
1602 ''' % '|'.join(map(re.escape, OPERATORS.keys())))
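# Illustration (hypothetical specs): this grammar accepts e.g.
# "filesize>100M" or "height<=720?"; the trailing "?" (none_inclusive)
# makes the filter also accept formats where the field is missing.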
1603 m = operator_rex.fullmatch(filter_spec)
1604 if m:
1605 try:
1606 comparison_value = int(m.group('value'))
1607 except ValueError:
1608 comparison_value = parse_filesize(m.group('value'))
1609 if comparison_value is None:
1610 comparison_value = parse_filesize(m.group('value') + 'B')
1611 if comparison_value is None:
1612 raise ValueError(
1613 'Invalid value %r in format specification %r' % (
1614 m.group('value'), filter_spec))
1615 op = OPERATORS[m.group('op')]
1616
1617 if not m:
1618 STR_OPERATORS = {
1619 '=': operator.eq,
1620 '^=': lambda attr, value: attr.startswith(value),
1621 '$=': lambda attr, value: attr.endswith(value),
1622 '*=': lambda attr, value: value in attr,
1623 }
1624 str_operator_rex = re.compile(r'''(?x)\s*
1625 (?P<key>[a-zA-Z0-9._-]+)\s*
1626 (?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1627 (?P<value>[a-zA-Z0-9._-]+)\s*
1628 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1629 m = str_operator_rex.fullmatch(filter_spec)
1630 if m:
1631 comparison_value = m.group('value')
1632 str_op = STR_OPERATORS[m.group('op')]
1633 if m.group('negation'):
1634 op = lambda attr, value: not str_op(attr, value)
1635 else:
1636 op = str_op
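# Illustration (hypothetical specs): "ext=mp4" is a plain string match,
# while "format_id!*=dash" negates a substring match via the optional
# "!" before the operator.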
1637
1638 if not m:
1639 raise SyntaxError('Invalid filter specification %r' % filter_spec)
1640
1641 def _filter(f):
1642 actual_value = f.get(m.group('key'))
1643 if actual_value is None:
1644 return m.group('none_inclusive')
1645 return op(actual_value, comparison_value)
1646 return _filter
1647
1648 def _default_format_spec(self, info_dict, download=True):
1649
1650 def can_merge():
1651 merger = FFmpegMergerPP(self)
1652 return merger.available and merger.can_merge()
1653
1654 prefer_best = (
1655 not self.params.get('simulate')
1656 and download
1657 and (
1658 not can_merge()
1659 or info_dict.get('is_live', False)
1660 or self.outtmpl_dict['default'] == '-'))
1661 compat = (
1662 prefer_best
1663 or self.params.get('allow_multiple_audio_streams', False)
1664 or 'format-spec' in self.params.get('compat_opts', []))
1665
1666 return (
1667 'best/bestvideo+bestaudio' if prefer_best
1668 else 'bestvideo*+bestaudio/best' if not compat
1669 else 'bestvideo+bestaudio/best')
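# Sketch of the outcome, assuming no compat options are set: with a
# working ffmpeg this returns 'bestvideo*+bestaudio/best'; when actually
# downloading without ffmpeg, for live streams, or when writing to
# stdout, it falls back to 'best/bestvideo+bestaudio'.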
1670
1671 def build_format_selector(self, format_spec):
1672 def syntax_error(note, start):
1673 message = (
1674 'Invalid format specification: '
1675 '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
1676 return SyntaxError(message)
1677
1678 PICKFIRST = 'PICKFIRST'
1679 MERGE = 'MERGE'
1680 SINGLE = 'SINGLE'
1681 GROUP = 'GROUP'
1682 FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
1683
1684 allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False),
1685 'video': self.params.get('allow_multiple_video_streams', False)}
1686
1687 check_formats = self.params.get('check_formats')
1688
1689 def _parse_filter(tokens):
1690 filter_parts = []
1691 for type, string, start, _, _ in tokens:
1692 if type == tokenize.OP and string == ']':
1693 return ''.join(filter_parts)
1694 else:
1695 filter_parts.append(string)
1696
1697 def _remove_unused_ops(tokens):
1698 # Remove operators that we don't use and join them with the surrounding strings
1699 # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1700 ALLOWED_OPS = ('/', '+', ',', '(', ')')
1701 last_string, last_start, last_end, last_line = None, None, None, None
1702 for type, string, start, end, line in tokens:
1703 if type == tokenize.OP and string == '[':
1704 if last_string:
1705 yield tokenize.NAME, last_string, last_start, last_end, last_line
1706 last_string = None
1707 yield type, string, start, end, line
1708 # everything inside brackets will be handled by _parse_filter
1709 for type, string, start, end, line in tokens:
1710 yield type, string, start, end, line
1711 if type == tokenize.OP and string == ']':
1712 break
1713 elif type == tokenize.OP and string in ALLOWED_OPS:
1714 if last_string:
1715 yield tokenize.NAME, last_string, last_start, last_end, last_line
1716 last_string = None
1717 yield type, string, start, end, line
1718 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1719 if not last_string:
1720 last_string = string
1721 last_start = start
1722 last_end = end
1723 else:
1724 last_string += string
1725 if last_string:
1726 yield tokenize.NAME, last_string, last_start, last_end, last_line
1727
1728 def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1729 selectors = []
1730 current_selector = None
1731 for type, string, start, _, _ in tokens:
1732 # ENCODING is only defined in python 3.x
1733 if type == getattr(tokenize, 'ENCODING', None):
1734 continue
1735 elif type in [tokenize.NAME, tokenize.NUMBER]:
1736 current_selector = FormatSelector(SINGLE, string, [])
1737 elif type == tokenize.OP:
1738 if string == ')':
1739 if not inside_group:
1740 # ')' will be handled by the parentheses group
1741 tokens.restore_last_token()
1742 break
1743 elif inside_merge and string in ['/', ',']:
1744 tokens.restore_last_token()
1745 break
1746 elif inside_choice and string == ',':
1747 tokens.restore_last_token()
1748 break
1749 elif string == ',':
1750 if not current_selector:
1751 raise syntax_error('"," must follow a format selector', start)
1752 selectors.append(current_selector)
1753 current_selector = None
1754 elif string == '/':
1755 if not current_selector:
1756 raise syntax_error('"/" must follow a format selector', start)
1757 first_choice = current_selector
1758 second_choice = _parse_format_selection(tokens, inside_choice=True)
1759 current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1760 elif string == '[':
1761 if not current_selector:
1762 current_selector = FormatSelector(SINGLE, 'best', [])
1763 format_filter = _parse_filter(tokens)
1764 current_selector.filters.append(format_filter)
1765 elif string == '(':
1766 if current_selector:
1767 raise syntax_error('Unexpected "("', start)
1768 group = _parse_format_selection(tokens, inside_group=True)
1769 current_selector = FormatSelector(GROUP, group, [])
1770 elif string == '+':
1771 if not current_selector:
1772 raise syntax_error('Unexpected "+"', start)
1773 selector_1 = current_selector
1774 selector_2 = _parse_format_selection(tokens, inside_merge=True)
1775 if not selector_2:
1776 raise syntax_error('Expected a selector', start)
1777 current_selector = FormatSelector(MERGE, (selector_1, selector_2), [])
1778 else:
1779 raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
1780 elif type == tokenize.ENDMARKER:
1781 break
1782 if current_selector:
1783 selectors.append(current_selector)
1784 return selectors
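# Rough illustration: a spec like 'bv*+ba/b' parses into a single
# PICKFIRST selector whose first choice is a MERGE of the atoms 'bv*'
# and 'ba' and whose fallback is the atom 'b'.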
1785
1786 def _merge(formats_pair):
1787 format_1, format_2 = formats_pair
1788
1789 formats_info = []
1790 formats_info.extend(format_1.get('requested_formats', (format_1,)))
1791 formats_info.extend(format_2.get('requested_formats', (format_2,)))
1792
1793 if not allow_multiple_streams['video'] or not allow_multiple_streams['audio']:
1794 get_no_more = {'video': False, 'audio': False}
1795 for (i, fmt_info) in enumerate(formats_info):
1796 if fmt_info.get('acodec') == fmt_info.get('vcodec') == 'none':
1797 formats_info.pop(i)
1798 continue
1799 for aud_vid in ['audio', 'video']:
1800 if not allow_multiple_streams[aud_vid] and fmt_info.get(aud_vid[0] + 'codec') != 'none':
1801 if get_no_more[aud_vid]:
1802 formats_info.pop(i)
1803 break
1804 get_no_more[aud_vid] = True
1805
1806 if len(formats_info) == 1:
1807 return formats_info[0]
1808
1809 video_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('vcodec') != 'none']
1810 audio_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('acodec') != 'none']
1811
1812 the_only_video = video_fmts[0] if len(video_fmts) == 1 else None
1813 the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None
1814
1815 output_ext = self.params.get('merge_output_format')
1816 if not output_ext:
1817 if the_only_video:
1818 output_ext = the_only_video['ext']
1819 elif the_only_audio and not video_fmts:
1820 output_ext = the_only_audio['ext']
1821 else:
1822 output_ext = 'mkv'
1823
1824 new_dict = {
1825 'requested_formats': formats_info,
1826 'format': '+'.join(fmt_info.get('format') for fmt_info in formats_info),
1827 'format_id': '+'.join(fmt_info.get('format_id') for fmt_info in formats_info),
1828 'ext': output_ext,
1829 }
1830
1831 if the_only_video:
1832 new_dict.update({
1833 'width': the_only_video.get('width'),
1834 'height': the_only_video.get('height'),
1835 'resolution': the_only_video.get('resolution') or self.format_resolution(the_only_video),
1836 'fps': the_only_video.get('fps'),
1837 'vcodec': the_only_video.get('vcodec'),
1838 'vbr': the_only_video.get('vbr'),
1839 'stretched_ratio': the_only_video.get('stretched_ratio'),
1840 })
1841
1842 if the_only_audio:
1843 new_dict.update({
1844 'acodec': the_only_audio.get('acodec'),
1845 'abr': the_only_audio.get('abr'),
1846 })
1847
1848 return new_dict
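# Illustration (hypothetical formats): merging a video-only format with
# an audio-only one yields requested_formats=[video, audio], a combined
# format_id such as '137+140', and ext taken from the video format when
# no merge_output_format is configured.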
1849
1850 def _check_formats(formats):
1851 if not check_formats:
1852 yield from formats
1853 return
1854 for f in formats:
1855 self.to_screen('[info] Testing format %s' % f['format_id'])
1856 temp_file = tempfile.NamedTemporaryFile(
1857 suffix='.tmp', delete=False,
1858 dir=self.get_output_path('temp') or None)
1859 temp_file.close()
1860 try:
1861 success, _ = self.dl(temp_file.name, f, test=True)
1862 except (DownloadError, IOError, OSError, ValueError) + network_exceptions:
1863 success = False
1864 finally:
1865 if os.path.exists(temp_file.name):
1866 try:
1867 os.remove(temp_file.name)
1868 except OSError:
1869 self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
1870 if success:
1871 yield f
1872 else:
1873 self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])
1874
1875 def _build_selector_function(selector):
1876 if isinstance(selector, list): # ,
1877 fs = [_build_selector_function(s) for s in selector]
1878
1879 def selector_function(ctx):
1880 for f in fs:
1881 yield from f(ctx)
1882 return selector_function
1883
1884 elif selector.type == GROUP: # ()
1885 selector_function = _build_selector_function(selector.selector)
1886
1887 elif selector.type == PICKFIRST: # /
1888 fs = [_build_selector_function(s) for s in selector.selector]
1889
1890 def selector_function(ctx):
1891 for f in fs:
1892 picked_formats = list(f(ctx))
1893 if picked_formats:
1894 return picked_formats
1895 return []
1896
1897 elif selector.type == MERGE: # +
1898 selector_1, selector_2 = map(_build_selector_function, selector.selector)
1899
1900 def selector_function(ctx):
1901 for pair in itertools.product(
1902 selector_1(copy.deepcopy(ctx)), selector_2(copy.deepcopy(ctx))):
1903 yield _merge(pair)
1904
1905 elif selector.type == SINGLE: # atom
1906 format_spec = selector.selector or 'best'
1907
1908 # TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector
1909 if format_spec == 'all':
1910 def selector_function(ctx):
1911 yield from _check_formats(ctx['formats'])
1912 elif format_spec == 'mergeall':
1913 def selector_function(ctx):
1914 formats = list(_check_formats(ctx['formats']))
1915 if not formats:
1916 return
1917 merged_format = formats[-1]
1918 for f in formats[-2::-1]:
1919 merged_format = _merge((merged_format, f))
1920 yield merged_format
1921
1922 else:
1923 format_fallback, format_reverse, format_idx = False, True, 1
1924 mobj = re.match(
1925 r'(?P<bw>best|worst|b|w)(?P<type>video|audio|v|a)?(?P<mod>\*)?(?:\.(?P<n>[1-9]\d*))?$',
1926 format_spec)
1927 if mobj is not None:
1928 format_idx = int_or_none(mobj.group('n'), default=1)
1929 format_reverse = mobj.group('bw')[0] == 'b'
1930 format_type = (mobj.group('type') or [None])[0]
1931 not_format_type = {'v': 'a', 'a': 'v'}.get(format_type)
1932 format_modified = mobj.group('mod') is not None
1933
1934 format_fallback = not format_type and not format_modified # for b, w
1935 _filter_f = (
1936 (lambda f: f.get('%scodec' % format_type) != 'none')
1937 if format_type and format_modified # bv*, ba*, wv*, wa*
1938 else (lambda f: f.get('%scodec' % not_format_type) == 'none')
1939 if format_type # bv, ba, wv, wa
1940 else (lambda f: f.get('vcodec') != 'none' and f.get('acodec') != 'none')
1941 if not format_modified # b, w
1942 else lambda f: True) # b*, w*
1943 filter_f = lambda f: _filter_f(f) and (
1944 f.get('vcodec') != 'none' or f.get('acodec') != 'none')
1945 else:
1946 filter_f = ((lambda f: f.get('ext') == format_spec)
1947 if format_spec in ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] # extension
1948 else (lambda f: f.get('format_id') == format_spec)) # id
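# Illustrations: 'best.2' picks the second-best combined format, 'bv*'
# the best format containing video, 'ba' the best audio-only format;
# 'mp4' matches by extension and e.g. '22' (a hypothetical id) by
# format_id.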
1949
1950 def selector_function(ctx):
1951 formats = list(ctx['formats'])
1952 matches = list(filter(filter_f, formats)) if filter_f is not None else formats
1953 if format_fallback and ctx['incomplete_formats'] and not matches:
1954 # for extractors with incomplete formats (audio only (soundcloud)
1955 # or video only (imgur)) best/worst will fallback to
1956 # best/worst {video,audio}-only format
1957 matches = formats
1958 matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1]))
1959 try:
1960 yield matches[format_idx - 1]
1961 except IndexError:
1962 return
1963
1964 filters = [self._build_format_filter(f) for f in selector.filters]
1965
1966 def final_selector(ctx):
1967 ctx_copy = copy.deepcopy(ctx)
1968 for _filter in filters:
1969 ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
1970 return selector_function(ctx_copy)
1971 return final_selector
1972
1973 stream = io.BytesIO(format_spec.encode('utf-8'))
1974 try:
1975 tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
1976 except tokenize.TokenError:
1977 raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
1978
1979 class TokenIterator(object):
1980 def __init__(self, tokens):
1981 self.tokens = tokens
1982 self.counter = 0
1983
1984 def __iter__(self):
1985 return self
1986
1987 def __next__(self):
1988 if self.counter >= len(self.tokens):
1989 raise StopIteration()
1990 value = self.tokens[self.counter]
1991 self.counter += 1
1992 return value
1993
1994 next = __next__
1995
1996 def restore_last_token(self):
1997 self.counter -= 1
1998
1999 parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
2000 return _build_selector_function(parsed_selector)
2001
2002 def _calc_headers(self, info_dict):
2003 res = std_headers.copy()
2004
2005 add_headers = info_dict.get('http_headers')
2006 if add_headers:
2007 res.update(add_headers)
2008
2009 cookies = self._calc_cookies(info_dict)
2010 if cookies:
2011 res['Cookie'] = cookies
2012
2013 if 'X-Forwarded-For' not in res:
2014 x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
2015 if x_forwarded_for_ip:
2016 res['X-Forwarded-For'] = x_forwarded_for_ip
2017
2018 return res
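# Resulting precedence (sketch): extractor-provided http_headers
# override std_headers, the Cookie header is then recomputed from the
# cookiejar, and X-Forwarded-For is only filled in when absent.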
2019
2020 def _calc_cookies(self, info_dict):
2021 pr = sanitized_Request(info_dict['url'])
2022 self.cookiejar.add_cookie_header(pr)
2023 return pr.get_header('Cookie')
2024
2025 def _sanitize_thumbnails(self, info_dict):
2026 thumbnails = info_dict.get('thumbnails')
2027 if thumbnails is None:
2028 thumbnail = info_dict.get('thumbnail')
2029 if thumbnail:
2030 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
2031 if thumbnails:
2032 thumbnails.sort(key=lambda t: (
2033 t.get('preference') if t.get('preference') is not None else -1,
2034 t.get('width') if t.get('width') is not None else -1,
2035 t.get('height') if t.get('height') is not None else -1,
2036 t.get('id') if t.get('id') is not None else '',
2037 t.get('url')))
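# Since the sort is ascending, the highest-preference/largest thumbnail
# ends up last; this is why process_video_result below picks
# thumbnails[-1]['url'] as the representative thumbnail.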
2038
2039 def thumbnail_tester():
2040 if self.params.get('check_formats'):
2041 test_all = True
2042 to_screen = lambda msg: self.to_screen(f'[info] {msg}')
2043 else:
2044 test_all = False
2045 to_screen = self.write_debug
2046
2047 def test_thumbnail(t):
2048 if not test_all and not t.get('_test_url'):
2049 return True
2050 to_screen('Testing thumbnail %s' % t['id'])
2051 try:
2052 self.urlopen(HEADRequest(t['url']))
2053 except network_exceptions as err:
2054 to_screen('Unable to connect to thumbnail %s URL "%s" - %s. Skipping...' % (
2055 t['id'], t['url'], error_to_compat_str(err)))
2056 return False
2057 return True
2058
2059 return test_thumbnail
2060
2061 for i, t in enumerate(thumbnails):
2062 if t.get('id') is None:
2063 t['id'] = '%d' % i
2064 if t.get('width') and t.get('height'):
2065 t['resolution'] = '%dx%d' % (t['width'], t['height'])
2066 t['url'] = sanitize_url(t['url'])
2067
2068 if self.params.get('check_formats') is not False:
2069 info_dict['thumbnails'] = LazyList(filter(thumbnail_tester(), thumbnails[::-1])).reverse()
2070 else:
2071 info_dict['thumbnails'] = thumbnails
2072
2073 def process_video_result(self, info_dict, download=True):
2074 assert info_dict.get('_type', 'video') == 'video'
2075
2076 if 'id' not in info_dict:
2077 raise ExtractorError('Missing "id" field in extractor result')
2078 if 'title' not in info_dict:
2079 raise ExtractorError('Missing "title" field in extractor result',
2080 video_id=info_dict['id'], ie=info_dict['extractor'])
2081
2082 def report_force_conversion(field, field_not, conversion):
2083 self.report_warning(
2084 '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
2085 % (field, field_not, conversion))
2086
2087 def sanitize_string_field(info, string_field):
2088 field = info.get(string_field)
2089 if field is None or isinstance(field, compat_str):
2090 return
2091 report_force_conversion(string_field, 'a string', 'string')
2092 info[string_field] = compat_str(field)
2093
2094 def sanitize_numeric_fields(info):
2095 for numeric_field in self._NUMERIC_FIELDS:
2096 field = info.get(numeric_field)
2097 if field is None or isinstance(field, compat_numeric_types):
2098 continue
2099 report_force_conversion(numeric_field, 'numeric', 'int')
2100 info[numeric_field] = int_or_none(field)
2101
2102 sanitize_string_field(info_dict, 'id')
2103 sanitize_numeric_fields(info_dict)
2104
2105 if 'playlist' not in info_dict:
2106 # It isn't part of a playlist
2107 info_dict['playlist'] = None
2108 info_dict['playlist_index'] = None
2109
2110 self._sanitize_thumbnails(info_dict)
2111
2112 thumbnail = info_dict.get('thumbnail')
2113 thumbnails = info_dict.get('thumbnails')
2114 if thumbnail:
2115 info_dict['thumbnail'] = sanitize_url(thumbnail)
2116 elif thumbnails:
2117 info_dict['thumbnail'] = thumbnails[-1]['url']
2118
2119 if info_dict.get('display_id') is None and 'id' in info_dict:
2120 info_dict['display_id'] = info_dict['id']
2121
2122 for ts_key, date_key in (
2123 ('timestamp', 'upload_date'),
2124 ('release_timestamp', 'release_date'),
2125 ):
2126 if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
2127 # Working around out-of-range timestamp values (e.g. negative ones on Windows,
2128 # see http://bugs.python.org/issue1646728)
2129 try:
2130 upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
2131 info_dict[date_key] = upload_date.strftime('%Y%m%d')
2132 except (ValueError, OverflowError, OSError):
2133 pass
2134
2135 live_keys = ('is_live', 'was_live')
2136 live_status = info_dict.get('live_status')
2137 if live_status is None:
2138 for key in live_keys:
2139 if info_dict.get(key) is False:
2140 continue
2141 if info_dict.get(key):
2142 live_status = key
2143 break
2144 if all(info_dict.get(key) is False for key in live_keys):
2145 live_status = 'not_live'
2146 if live_status:
2147 info_dict['live_status'] = live_status
2148 for key in live_keys:
2149 if info_dict.get(key) is None:
2150 info_dict[key] = (live_status == key)
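# Illustration: is_live=True yields live_status='is_live', while
# explicit is_live=False and was_live=False yield 'not_live'.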
2151
2152 # Auto generate title fields corresponding to the *_number fields when missing
2153 # in order to always have clean titles. This is very common for TV series.
2154 for field in ('chapter', 'season', 'episode'):
2155 if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
2156 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
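# e.g. episode_number=3 with no episode title yields
# info_dict['episode'] = 'Episode 3'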
2157
2158 for cc_kind in ('subtitles', 'automatic_captions'):
2159 cc = info_dict.get(cc_kind)
2160 if cc:
2161 for _, subtitle in cc.items():
2162 for subtitle_format in subtitle:
2163 if subtitle_format.get('url'):
2164 subtitle_format['url'] = sanitize_url(subtitle_format['url'])
2165 if subtitle_format.get('ext') is None:
2166 subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
2167
2168 automatic_captions = info_dict.get('automatic_captions')
2169 subtitles = info_dict.get('subtitles')
2170
2171 info_dict['requested_subtitles'] = self.process_subtitles(
2172 info_dict['id'], subtitles, automatic_captions)
2173
2174 # We now pick which formats have to be downloaded
2175 if info_dict.get('formats') is None:
2176 # There's only one format available
2177 formats = [info_dict]
2178 else:
2179 formats = info_dict['formats']
2180
2181 info_dict['__has_drm'] = any(f.get('has_drm') for f in formats)
2182 if not self.params.get('allow_unplayable_formats'):
2183 formats = [f for f in formats if not f.get('has_drm')]
2184
2185 if not formats:
2186 self.raise_no_formats(info_dict)
2187
2188 def is_wellformed(f):
2189 url = f.get('url')
2190 if not url:
2191 self.report_warning(
2192 '"url" field is missing or empty - skipping format, '
2193 'there is an error in extractor')
2194 return False
2195 if isinstance(url, bytes):
2196 sanitize_string_field(f, 'url')
2197 return True
2198
2199 # Filter out malformed formats for better extraction robustness
2200 formats = list(filter(is_wellformed, formats))
2201
2202 formats_dict = {}
2203
2204 # We check that all the formats have the format and format_id fields
2205 for i, format in enumerate(formats):
2206 sanitize_string_field(format, 'format_id')
2207 sanitize_numeric_fields(format)
2208 format['url'] = sanitize_url(format['url'])
2209 if not format.get('format_id'):
2210 format['format_id'] = compat_str(i)
2211 else:
2212 # Sanitize format_id from characters used in format selector expression
2213 format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
2214 format_id = format['format_id']
2215 if format_id not in formats_dict:
2216 formats_dict[format_id] = []
2217 formats_dict[format_id].append(format)
2218
2219 # Make sure all formats have unique format_id
2220 for format_id, ambiguous_formats in formats_dict.items():
2221 if len(ambiguous_formats) > 1:
2222 for i, format in enumerate(ambiguous_formats):
2223 format['format_id'] = '%s-%d' % (format_id, i)
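# e.g. two formats that both report format_id 'hls' are renamed to
# 'hls-0' and 'hls-1'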
2224
2225 for i, format in enumerate(formats):
2226 if format.get('format') is None:
2227 format['format'] = '{id} - {res}{note}'.format(
2228 id=format['format_id'],
2229 res=self.format_resolution(format),
2230 note=format_field(format, 'format_note', ' (%s)'),
2231 )
2232 # Automatically determine file extension if missing
2233 if format.get('ext') is None:
2234 format['ext'] = determine_ext(format['url']).lower()
2235 # Automatically determine protocol if missing (useful for format
2236 # selection purposes)
2237 if format.get('protocol') is None:
2238 format['protocol'] = determine_protocol(format)
2239 # Add HTTP headers, so that external programs can use them from the
2240 # json output
2241 full_format_info = info_dict.copy()
2242 full_format_info.update(format)
2243 format['http_headers'] = self._calc_headers(full_format_info)
2244 # Remove private housekeeping stuff
2245 if '__x_forwarded_for_ip' in info_dict:
2246 del info_dict['__x_forwarded_for_ip']
2247
2248 # TODO Central sorting goes here
2249
2250 if not formats or formats[0] is not info_dict:
2251 # Only set the 'formats' field if the original info_dict lists them;
2252 # otherwise we end up with a circular reference: the first (and only)
2253 # element in the 'formats' field in info_dict is info_dict itself,
2254 # which can't be exported to json
2255 info_dict['formats'] = formats
2256
2257 info_dict, _ = self.pre_process(info_dict)
2258
2259 if self.params.get('list_thumbnails'):
2260 self.list_thumbnails(info_dict)
2261 if self.params.get('listformats'):
2262 if not info_dict.get('formats') and not info_dict.get('url'):
2263 self.to_screen('%s has no formats' % info_dict['id'])
2264 else:
2265 self.list_formats(info_dict)
2266 if self.params.get('listsubtitles'):
2267 if 'automatic_captions' in info_dict:
2268 self.list_subtitles(
2269 info_dict['id'], automatic_captions, 'automatic captions')
2270 self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
2271 list_only = self.params.get('simulate') is None and (
2272 self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles'))
2273 if list_only:
2274 # Without this printing, -F --print-json will not work
2275 self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True)
2276 return
2277
2278 format_selector = self.format_selector
2279 if format_selector is None:
2280 req_format = self._default_format_spec(info_dict, download=download)
2281 self.write_debug('Default format spec: %s' % req_format)
2282 format_selector = self.build_format_selector(req_format)
2283
2284 # While in format selection, we may need access to the original
2285 # format set in order to calculate some metrics or do some processing.
2286 # For now we need to be able to guess whether the original formats provided
2287 # by the extractor are incomplete (i.e. whether the extractor provides only
2288 # video-only or audio-only formats) for proper format selection for
2289 # extractors with such incomplete formats (see
2290 # https://github.com/ytdl-org/youtube-dl/pull/5556).
2291 # Since formats may be filtered during format selection and may not match
2292 # the original formats, the results may be incorrect. Thus, the original
2293 # formats or pre-calculated metrics should be passed to the format
2294 # selection routines as well.
2295 # We will pass a context object containing all necessary additional data
2296 # instead of just formats.
2297 # This fixes incorrect format selection issue (see
2298 # https://github.com/ytdl-org/youtube-dl/issues/10083).
2299 incomplete_formats = (
2300 # All formats are video-only or
2301 all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
2302 # all formats are audio-only
2303 or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))
2304
2305 ctx = {
2306 'formats': formats,
2307 'incomplete_formats': incomplete_formats,
2308 }
2309
2310 formats_to_download = list(format_selector(ctx))
2311 if not formats_to_download:
2312 if not self.params.get('ignore_no_formats_error'):
2313 raise ExtractorError('Requested format is not available', expected=True,
2314 video_id=info_dict['id'], ie=info_dict['extractor'])
2315 else:
2316 self.report_warning('Requested format is not available')
2317 # Process what we can, even without any available formats.
2318 self.process_info(dict(info_dict))
2319 elif download:
2320 self.to_screen(
2321 '[info] %s: Downloading %d format(s): %s' % (
2322 info_dict['id'], len(formats_to_download),
2323 ", ".join([f['format_id'] for f in formats_to_download])))
2324 for fmt in formats_to_download:
2325 new_info = dict(info_dict)
2326 # Save a reference to the original info_dict so that it can be modified in process_info if needed
2327 new_info['__original_infodict'] = info_dict
2328 new_info.update(fmt)
2329 self.process_info(new_info)
2330 # We update the info dict with the best quality format (backwards compatibility)
2331 if formats_to_download:
2332 info_dict.update(formats_to_download[-1])
2333 return info_dict
2334
2335 def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
2336 """Select the requested subtitles and their format"""
2337 available_subs = {}
2338 if normal_subtitles and self.params.get('writesubtitles'):
2339 available_subs.update(normal_subtitles)
2340 if automatic_captions and self.params.get('writeautomaticsub'):
2341 for lang, cap_info in automatic_captions.items():
2342 if lang not in available_subs:
2343 available_subs[lang] = cap_info
2344
2345 if (not self.params.get('writesubtitles')
2346 and not self.params.get('writeautomaticsub')
2347 or not available_subs):
2348 return None
2349
2350 all_sub_langs = available_subs.keys()
2351 if self.params.get('allsubtitles', False):
2352 requested_langs = all_sub_langs
2353 elif self.params.get('subtitleslangs', False):
2354 # A list is used so that the order of languages will be the same as
2355 # given in subtitleslangs. See https://github.com/yt-dlp/yt-dlp/issues/1041
2356 requested_langs = []
2357 for lang_re in self.params.get('subtitleslangs'):
2358 if lang_re == 'all':
2359 requested_langs.extend(all_sub_langs)
2360 continue
2361 discard = lang_re[0] == '-'
2362 if discard:
2363 lang_re = lang_re[1:]
2364 current_langs = filter(re.compile(lang_re + '$').match, all_sub_langs)
2365 if discard:
2366 for lang in current_langs:
2367 while lang in requested_langs:
2368 requested_langs.remove(lang)
2369 else:
2370 requested_langs.extend(current_langs)
2371 requested_langs = orderedSet(requested_langs)
2372 elif 'en' in available_subs:
2373 requested_langs = ['en']
2374 else:
2375 requested_langs = [list(all_sub_langs)[0]]
2376 if requested_langs:
2377 self.write_debug('Downloading subtitles: %s' % ', '.join(requested_langs))
2378
2379 formats_query = self.params.get('subtitlesformat', 'best')
2380 formats_preference = formats_query.split('/') if formats_query else []
2381 subs = {}
2382 for lang in requested_langs:
2383 formats = available_subs.get(lang)
2384 if formats is None:
2385 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
2386 continue
2387 for ext in formats_preference:
2388 if ext == 'best':
2389 f = formats[-1]
2390 break
2391 matches = list(filter(lambda f: f['ext'] == ext, formats))
2392 if matches:
2393 f = matches[-1]
2394 break
2395 else:
2396 f = formats[-1]
2397 self.report_warning(
2398 'No subtitle format found matching "%s" for language %s, '
2399 'using %s' % (formats_query, lang, f['ext']))
2400 subs[lang] = f
2401 return subs
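# Illustration (hypothetical params): subtitleslangs=['en.*', '-en-US']
# requests every language matching en.* except en-US, and for each the
# first entry of subtitlesformat (e.g. 'srt/best') whose ext exists is
# chosen, falling back to the last listed format.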
2402
2403 def __forced_printings(self, info_dict, filename, incomplete):
2404 def print_mandatory(field, actual_field=None):
2405 if actual_field is None:
2406 actual_field = field
2407 if (self.params.get('force%s' % field, False)
2408 and (not incomplete or info_dict.get(actual_field) is not None)):
2409 self.to_stdout(info_dict[actual_field])
2410
2411 def print_optional(field):
2412 if (self.params.get('force%s' % field, False)
2413 and info_dict.get(field) is not None):
2414 self.to_stdout(info_dict[field])
2415
2416 info_dict = info_dict.copy()
2417 if filename is not None:
2418 info_dict['filename'] = filename
2419 if info_dict.get('requested_formats') is not None:
2420 # For RTMP URLs, also include the playpath
2421 info_dict['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats'])
2422 elif 'url' in info_dict:
2423 info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '')
2424
2425 if self.params.get('forceprint') or self.params.get('forcejson'):
2426 self.post_extract(info_dict)
2427 for tmpl in self.params.get('forceprint', []):
2428 if re.match(r'\w+$', tmpl):
2429 tmpl = '%({})s'.format(tmpl)
2430 tmpl, info_copy = self.prepare_outtmpl(tmpl, info_dict)
2431 self.to_stdout(self.escape_outtmpl(tmpl) % info_copy)
2432
2433 print_mandatory('title')
2434 print_mandatory('id')
2435 print_mandatory('url', 'urls')
2436 print_optional('thumbnail')
2437 print_optional('description')
2438 print_optional('filename')
2439 if self.params.get('forceduration') and info_dict.get('duration') is not None:
2440 self.to_stdout(formatSeconds(info_dict['duration']))
2441 print_mandatory('format')
2442
2443 if self.params.get('forcejson'):
2444 self.to_stdout(json.dumps(self.sanitize_info(info_dict)))
2445
2446 def dl(self, name, info, subtitle=False, test=False):
2447 if not info.get('url'):
2448 self.raise_no_formats(info, True)
2449
2450 if test:
2451 verbose = self.params.get('verbose')
2452 params = {
2453 'test': True,
2454 'quiet': not verbose,
2455 'verbose': verbose,
2456 'noprogress': not verbose,
2457 'nopart': True,
2458 'skip_unavailable_fragments': False,
2459 'keep_fragments': False,
2460 'overwrites': True,
2461 '_no_ytdl_file': True,
2462 }
2463 else:
2464 params = self.params
2465 fd = get_suitable_downloader(info, params, to_stdout=(name == '-'))(self, params)
2466 if not test:
2467 for ph in self._progress_hooks:
2468 fd.add_progress_hook(ph)
2469 urls = '", "'.join([f['url'] for f in info.get('requested_formats', [])] or [info['url']])
2470 self.write_debug('Invoking downloader on "%s"' % urls)
2471 new_info = dict(info)
2472 if new_info.get('http_headers') is None:
2473 new_info['http_headers'] = self._calc_headers(new_info)
2474 return fd.download(name, new_info, subtitle)
2475
2476 def process_info(self, info_dict):
2477 """Process a single resolved IE result."""
2478
2479 assert info_dict.get('_type', 'video') == 'video'
2480
2481 max_downloads = self.params.get('max_downloads')
2482 if max_downloads is not None:
2483 if self._num_downloads >= int(max_downloads):
2484 raise MaxDownloadsReached()
2485
2486 # TODO: backward compatibility, to be removed
2487 info_dict['fulltitle'] = info_dict['title']
2488
2489 if 'format' not in info_dict and 'ext' in info_dict:
2490 info_dict['format'] = info_dict['ext']
2491
2492 if self._match_entry(info_dict) is not None:
2493 return
2494
2495 self.post_extract(info_dict)
2496 self._num_downloads += 1
2497
2498 # info_dict['_filename'] needs to be set for backward compatibility
2499 info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
2500 temp_filename = self.prepare_filename(info_dict, 'temp')
2501 files_to_move = {}
2502
2503 # Forced printings
2504 self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))
2505
2506 if self.params.get('simulate'):
2507 if self.params.get('force_write_download_archive', False):
2508 self.record_download_archive(info_dict)
2509 # Do nothing else if in simulate mode
2510 return
2511
2512 if full_filename is None:
2513 return
2514 if not self._ensure_dir_exists(encodeFilename(full_filename)):
2515 return
2516 if not self._ensure_dir_exists(encodeFilename(temp_filename)):
2517 return
2518
2519 if self._write_description('video', info_dict,
2520 self.prepare_filename(info_dict, 'description')) is None:
2521 return
2522
2523 sub_files = self._write_subtitles(info_dict, temp_filename)
2524 if sub_files is None:
2525 return
2526 files_to_move.update(dict(sub_files))
2527
2528 thumb_files = self._write_thumbnails(
2529 'video', info_dict, temp_filename, self.prepare_filename(info_dict, 'thumbnail'))
2530 if thumb_files is None:
2531 return
2532 files_to_move.update(dict(thumb_files))
2533
2534 infofn = self.prepare_filename(info_dict, 'infojson')
2535 _infojson_written = self._write_info_json('video', info_dict, infofn)
2536 if _infojson_written:
2537 info_dict['__infojson_filename'] = infofn
2538 elif _infojson_written is None:
2539 return
2540
2541 # Note: Annotations are deprecated
2542 annofn = None
2543 if self.params.get('writeannotations', False):
2544 annofn = self.prepare_filename(info_dict, 'annotation')
2545 if annofn:
2546 if not self._ensure_dir_exists(encodeFilename(annofn)):
2547 return
2548 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(annofn)):
2549 self.to_screen('[info] Video annotations are already present')
2550 elif not info_dict.get('annotations'):
2551 self.report_warning('There are no annotations to write.')
2552 else:
2553 try:
2554 self.to_screen('[info] Writing video annotations to: ' + annofn)
2555 with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
2556 annofile.write(info_dict['annotations'])
2557 except (KeyError, TypeError):
2558 self.report_warning('There are no annotations to write.')
2559 except (OSError, IOError):
2560 self.report_error('Cannot write annotations file: ' + annofn)
2561 return
2562
2563 # Write internet shortcut files
2564 url_link = webloc_link = desktop_link = False
2565 if self.params.get('writelink', False):
2566 if sys.platform == "darwin": # macOS.
2567 webloc_link = True
2568 elif sys.platform.startswith("linux"):
2569 desktop_link = True
2570 else: # if sys.platform in ['win32', 'cygwin']:
2571 url_link = True
2572 if self.params.get('writeurllink', False):
2573 url_link = True
2574 if self.params.get('writewebloclink', False):
2575 webloc_link = True
2576 if self.params.get('writedesktoplink', False):
2577 desktop_link = True
2578
2579 if url_link or webloc_link or desktop_link:
2580 if 'webpage_url' not in info_dict:
2581 self.report_error('Cannot write internet shortcut file because the "webpage_url" field is missing in the media information')
2582 return
2583 ascii_url = iri_to_uri(info_dict['webpage_url'])
2584
2585 def _write_link_file(extension, template, newline, embed_filename):
2586 linkfn = replace_extension(full_filename, extension, info_dict.get('ext'))
2587 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)):
2588 self.to_screen('[info] Internet shortcut is already present')
2589 else:
2590 try:
2591 self.to_screen('[info] Writing internet shortcut to: ' + linkfn)
2592 with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', newline=newline) as linkfile:
2593 template_vars = {'url': ascii_url}
2594 if embed_filename:
2595 template_vars['filename'] = linkfn[:-(len(extension) + 1)]
2596 linkfile.write(template % template_vars)
2597 except (OSError, IOError):
2598 self.report_error('Cannot write internet shortcut ' + linkfn)
2599 return False
2600 return True
2601
2602 if url_link:
2603 if not _write_link_file('url', DOT_URL_LINK_TEMPLATE, '\r\n', embed_filename=False):
2604 return
2605 if webloc_link:
2606 if not _write_link_file('webloc', DOT_WEBLOC_LINK_TEMPLATE, '\n', embed_filename=False):
2607 return
2608 if desktop_link:
2609 if not _write_link_file('desktop', DOT_DESKTOP_LINK_TEMPLATE, '\n', embed_filename=True):
2610 return
2611
2612 try:
2613 info_dict, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
2614 except PostProcessingError as err:
2615 self.report_error('Preprocessing: %s' % str(err))
2616 return
2617
2618 must_record_download_archive = False
2619 if self.params.get('skip_download', False):
2620 info_dict['filepath'] = temp_filename
2621 info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
2622 info_dict['__files_to_move'] = files_to_move
2623 info_dict = self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict)
2624 else:
2625 # Download
2626 info_dict.setdefault('__postprocessors', [])
2627 try:
2628
2629 def existing_file(*filepaths):
2630 ext = info_dict.get('ext')
2631 final_ext = self.params.get('final_ext', ext)
2632 existing_files = []
2633 for file in orderedSet(filepaths):
2634 if final_ext != ext:
2635 converted = replace_extension(file, final_ext, ext)
2636 if os.path.exists(encodeFilename(converted)):
2637 existing_files.append(converted)
2638 if os.path.exists(encodeFilename(file)):
2639 existing_files.append(file)
2640
2641 if not existing_files or self.params.get('overwrites', False):
2642 for file in orderedSet(existing_files):
2643 self.report_file_delete(file)
2644 os.remove(encodeFilename(file))
2645 return None
2646
2647 info_dict['ext'] = os.path.splitext(existing_files[0])[1][1:]
2648 return existing_files[0]
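# Illustration (assuming final_ext was set by a conversion
# postprocessor): with ext='webm' and final_ext='mp3', an existing
# 'song.mp3' is returned and the new download is skipped.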
2649
2650 success = True
2651 if info_dict.get('requested_formats') is not None:
2652
2653 def compatible_formats(formats):
2654 # TODO: some formats actually allow this (mkv, webm, ogg, mp4), but not all of them.
2655 video_formats = [format for format in formats if format.get('vcodec') != 'none']
2656 audio_formats = [format for format in formats if format.get('acodec') != 'none']
2657 if len(video_formats) > 2 or len(audio_formats) > 2:
2658 return False
2659
2660 # Check extension
2661 exts = set(format.get('ext') for format in formats)
2662 COMPATIBLE_EXTS = (
2663 set(('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma')),
2664 set(('webm',)),
2665 )
2666 for ext_sets in COMPATIBLE_EXTS:
2667 if ext_sets.issuperset(exts):
2668 return True
2669 # TODO: Check acodec/vcodec
2670 return False
2671
2672 requested_formats = info_dict['requested_formats']
2673 old_ext = info_dict['ext']
2674 if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
2675 info_dict['ext'] = 'mkv'
2676 self.report_warning(
2677 'Requested formats are incompatible for merge and will be merged into mkv.')
2678 new_ext = info_dict['ext']
2679
2680 def correct_ext(filename, ext=new_ext):
2681 if filename == '-':
2682 return filename
2683 filename_real_ext = os.path.splitext(filename)[1][1:]
2684 filename_wo_ext = (
2685 os.path.splitext(filename)[0]
2686 if filename_real_ext in (old_ext, new_ext)
2687 else filename)
2688 return '%s.%s' % (filename_wo_ext, ext)
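# e.g. with old_ext='webm' and new_ext='mkv', 'clip.webm' becomes
# 'clip.mkv', while a name whose real extension is neither simply gets
# '.mkv' appended.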
2689
2690 # Ensure filename always has a correct extension for successful merge
2691 full_filename = correct_ext(full_filename)
2692 temp_filename = correct_ext(temp_filename)
2693 dl_filename = existing_file(full_filename, temp_filename)
2694 info_dict['__real_download'] = False
2695
2696 _protocols = set(determine_protocol(f) for f in requested_formats)
2697 if len(_protocols) == 1: # All requested formats have same protocol
2698 info_dict['protocol'] = _protocols.pop()
2699 directly_mergable = FFmpegFD.can_merge_formats(info_dict, self.params)
2700 if dl_filename is not None:
2701 self.report_file_already_downloaded(dl_filename)
2702 elif (directly_mergable and get_suitable_downloader(
2703 info_dict, self.params, to_stdout=(temp_filename == '-')) == FFmpegFD):
2704 info_dict['url'] = '\n'.join(f['url'] for f in requested_formats)
2705 success, real_download = self.dl(temp_filename, info_dict)
2706 info_dict['__real_download'] = real_download
2707 else:
2708 downloaded = []
2709 merger = FFmpegMergerPP(self)
2710 if self.params.get('allow_unplayable_formats'):
2711 self.report_warning(
2712 'You have requested merging of multiple formats '
2713 'while also allowing unplayable formats to be downloaded. '
2714 'The formats won\'t be merged to prevent data corruption.')
2715 elif not merger.available:
2716 self.report_warning(
2717 'You have requested merging of multiple formats but ffmpeg is not installed. '
2718 'The formats won\'t be merged.')
2719
2720 if temp_filename == '-':
2721 reason = ('using a downloader other than ffmpeg' if directly_mergable
2722 else 'but the formats are incompatible for simultaneous download' if merger.available
2723 else 'but ffmpeg is not installed')
2724 self.report_warning(
2725 f'You have requested downloading multiple formats to stdout {reason}. '
2726 'The formats will be streamed one after the other')
2727 fname = temp_filename
2728 for f in requested_formats:
2729 new_info = dict(info_dict)
2730 del new_info['requested_formats']
2731 new_info.update(f)
2732 if temp_filename != '-':
2733 fname = prepend_extension(
2734 correct_ext(temp_filename, new_info['ext']),
2735 'f%s' % f['format_id'], new_info['ext'])
2736 if not self._ensure_dir_exists(fname):
2737 return
2738 f['filepath'] = fname
2739 downloaded.append(fname)
2740 partial_success, real_download = self.dl(fname, new_info)
2741 info_dict['__real_download'] = info_dict['__real_download'] or real_download
2742 success = success and partial_success
2743 if merger.available and not self.params.get('allow_unplayable_formats'):
2744 info_dict['__postprocessors'].append(merger)
2745 info_dict['__files_to_merge'] = downloaded
2746 # Even if there were no downloads, the merge itself only happens now
2747 info_dict['__real_download'] = True
2748 else:
2749 for file in downloaded:
2750 files_to_move[file] = None
2751 else:
2752 # Just a single file
2753 dl_filename = existing_file(full_filename, temp_filename)
2754 if dl_filename is None or dl_filename == temp_filename:
2755 # dl_filename == temp_filename could mean that the file was partially downloaded with --no-part.
2756 # So we should try to resume the download
2757 success, real_download = self.dl(temp_filename, info_dict)
2758 info_dict['__real_download'] = real_download
2759 else:
2760 self.report_file_already_downloaded(dl_filename)
2761
2762 dl_filename = dl_filename or temp_filename
2763 info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
2764
2765 except network_exceptions as err:
2766 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
2767 return
2768 except (OSError, IOError) as err:
2769 raise UnavailableVideoError(err)
2770 except (ContentTooShortError, ) as err:
2771 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
2772 return
2773
2774 if success and full_filename != '-':
2775
2776 def fixup():
2777 do_fixup = True
2778 fixup_policy = self.params.get('fixup')
2779 vid = info_dict['id']
2780
2781 if fixup_policy in ('ignore', 'never'):
2782 return
2783 elif fixup_policy == 'warn':
2784 do_fixup = False
2785 elif fixup_policy != 'force':
2786 assert fixup_policy in ('detect_or_warn', None)
2787 if not info_dict.get('__real_download'):
2788 do_fixup = False
2789
2790 def ffmpeg_fixup(cndn, msg, cls):
2791 if not cndn:
2792 return
2793 if not do_fixup:
2794 self.report_warning(f'{vid}: {msg}')
2795 return
2796 pp = cls(self)
2797 if pp.available:
2798 info_dict['__postprocessors'].append(pp)
2799 else:
2800 self.report_warning(f'{vid}: {msg}. Install ffmpeg to fix this automatically')
2801
2802 stretched_ratio = info_dict.get('stretched_ratio')
2803 ffmpeg_fixup(
2804 stretched_ratio not in (1, None),
2805 f'Non-uniform pixel ratio {stretched_ratio}',
2806 FFmpegFixupStretchedPP)
2807
2808 ffmpeg_fixup(
2809 (info_dict.get('requested_formats') is None
2810 and info_dict.get('container') == 'm4a_dash'
2811 and info_dict.get('ext') == 'm4a'),
2812 'writing DASH m4a. Only some players support this container',
2813 FFmpegFixupM4aPP)
2814
2815 downloader = (get_suitable_downloader(info_dict, self.params).__name__
2816 if 'protocol' in info_dict else None)
2817 ffmpeg_fixup(downloader == 'HlsFD', 'malformed AAC bitstream detected', FFmpegFixupM3u8PP)
2818 ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed timestamps detected', FFmpegFixupTimestampPP)
2819 ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed duration detected', FFmpegFixupDurationPP)
2820
2821 fixup()
2822 try:
2823 info_dict = self.post_process(dl_filename, info_dict, files_to_move)
2824 except PostProcessingError as err:
2825 self.report_error('Postprocessing: %s' % str(err))
2826 return
2827 try:
2828 for ph in self._post_hooks:
2829 ph(info_dict['filepath'])
2830 except Exception as err:
2831 self.report_error('post hooks: %s' % str(err))
2832 return
2833 must_record_download_archive = True
2834
2835 if must_record_download_archive or self.params.get('force_write_download_archive', False):
2836 self.record_download_archive(info_dict)
2837 max_downloads = self.params.get('max_downloads')
2838 if max_downloads is not None and self._num_downloads >= int(max_downloads):
2839 raise MaxDownloadsReached()
2840
2841 def download(self, url_list):
2842 """Download a given list of URLs."""
2843 outtmpl = self.outtmpl_dict['default']
2844 if (len(url_list) > 1
2845 and outtmpl != '-'
2846 and '%' not in outtmpl
2847 and self.params.get('max_downloads') != 1):
2848 raise SameFileError(outtmpl)
2849
2850 for url in url_list:
2851 try:
2852 # It also downloads the videos
2853 res = self.extract_info(
2854 url, force_generic_extractor=self.params.get('force_generic_extractor', False))
2855 except UnavailableVideoError:
2856 self.report_error('unable to download video')
2857 except MaxDownloadsReached:
2858 self.to_screen('[info] Maximum number of downloads reached')
2859 raise
2860 except ExistingVideoReached:
2861 self.to_screen('[info] Encountered a video that is already in the archive, stopping due to --break-on-existing')
2862 raise
2863 except RejectedVideoReached:
2864 self.to_screen('[info] Encountered a video that did not match filter, stopping due to --break-on-reject')
2865 raise
2866 else:
2867 if self.params.get('dump_single_json', False):
2868 self.post_extract(res)
2869 self.to_stdout(json.dumps(self.sanitize_info(res)))
2870
2871 return self._download_retcode
2872
2873 def download_with_info_file(self, info_filename):
2874 with contextlib.closing(fileinput.FileInput(
2875 [info_filename], mode='r',
2876 openhook=fileinput.hook_encoded('utf-8'))) as f:
2877 # FileInput doesn't have a read method, so we can't call json.load
2878 info = self.sanitize_info(json.loads('\n'.join(f)), self.params.get('clean_infojson', True))
2879 try:
2880 self.process_ie_result(info, download=True)
2881 except (DownloadError, EntryNotInPlaylist, ThrottledDownload):
2882 webpage_url = info.get('webpage_url')
2883 if webpage_url is not None:
2884 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
2885 return self.download([webpage_url])
2886 else:
2887 raise
2888 return self._download_retcode
2889
2890 @staticmethod
2891 def sanitize_info(info_dict, remove_private_keys=False):
2892 ''' Sanitize the infodict for converting to json '''
2893 if info_dict is None:
2894 return info_dict
2895 info_dict.setdefault('epoch', int(time.time()))
2896 remove_keys = {'__original_infodict'} # Always remove this since this may contain a copy of the entire dict
2897 keep_keys = {'_type'} # Always keep this to facilitate load-info-json
2898 if remove_private_keys:
2899 remove_keys |= {
2900 'requested_formats', 'requested_subtitles', 'requested_entries',
2901 'filepath', 'entries', 'original_url', 'playlist_autonumber',
2902 }
2903 empty_values = (None, {}, [], set(), tuple())
2904 reject = lambda k, v: k not in keep_keys and (
2905 k.startswith('_') or k in remove_keys or v in empty_values)
2906 else:
2907 reject = lambda k, v: k in remove_keys
2908 filter_fn = lambda obj: (
2909 list(map(filter_fn, obj)) if isinstance(obj, (LazyList, list, tuple, set))
2910 else obj if not isinstance(obj, dict)
2911 else dict((k, filter_fn(v)) for k, v in obj.items() if not reject(k, v)))
2912 return filter_fn(info_dict)
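# Sketch: with remove_private_keys=True, an input like {'id': 'x',
# '_type': 'video', 'filepath': '/tmp/x.mp4', 'height': None} filters
# down to {'id': 'x', '_type': 'video'} plus the 'epoch' added above;
# private and empty fields are dropped but '_type' is explicitly kept.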
2913
2914 @staticmethod
2915 def filter_requested_info(info_dict, actually_filter=True):
2916 ''' Alias of sanitize_info for backward compatibility '''
2917 return YoutubeDL.sanitize_info(info_dict, actually_filter)
2918
2919 def run_pp(self, pp, infodict):
2920 files_to_delete = []
2921 if '__files_to_move' not in infodict:
2922 infodict['__files_to_move'] = {}
2923 try:
2924 files_to_delete, infodict = pp.run(infodict)
2925 except PostProcessingError as e:
2926 # 'ignoreerrors' must be literally True here, not the 'only_download' value
2927 if self.params.get('ignoreerrors') is True:
2928 self.report_error(e)
2929 return infodict
2930 raise
2931
2932 if not files_to_delete:
2933 return infodict
2934 if self.params.get('keepvideo', False):
2935 for f in files_to_delete:
2936 infodict['__files_to_move'].setdefault(f, '')
2937 else:
2938 for old_filename in set(files_to_delete):
2939 self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
2940 try:
2941 os.remove(encodeFilename(old_filename))
2942 except (IOError, OSError):
2943 self.report_warning('Unable to remove downloaded original file')
2944 if old_filename in infodict['__files_to_move']:
2945 del infodict['__files_to_move'][old_filename]
2946 return infodict
2947
2948 @staticmethod
2949 def post_extract(info_dict):
2950 def actual_post_extract(info_dict):
2951 if info_dict.get('_type') in ('playlist', 'multi_video'):
2952 for video_dict in info_dict.get('entries') or []:
2953 actual_post_extract(video_dict or {})
2954 return
2955
2956 post_extractor = info_dict.get('__post_extractor') or (lambda: {})
2957 extra = post_extractor().items()
2958 info_dict.update(extra)
2959 info_dict.pop('__post_extractor', None)
2960
2961 original_infodict = info_dict.get('__original_infodict') or {}
2962 original_infodict.update(extra)
2963 original_infodict.pop('__post_extractor', None)
2964
2965 actual_post_extract(info_dict or {})
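# The deferred-field mechanism consumed above looks roughly like this on the
# extractor side (fetch_comments is a hypothetical helper):
#
#   info['__post_extractor'] = lambda: {'comments': fetch_comments()}
#
# post_extract() calls the stored callable once, merges the returned dict into
# the info dict (and its __original_infodict), then drops the key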
2966
2967 def pre_process(self, ie_info, key='pre_process', files_to_move=None):
2968 info = dict(ie_info)
2969 info['__files_to_move'] = files_to_move or {}
2970 for pp in self._pps[key]:
2971 info = self.run_pp(pp, info)
2972 return info, info.pop('__files_to_move', None)
2973
2974 def post_process(self, filename, ie_info, files_to_move=None):
2975 """Run all the postprocessors on the given file."""
2976 info = dict(ie_info)
2977 info['filepath'] = filename
2978 info['__files_to_move'] = files_to_move or {}
2979
2980 for pp in ie_info.get('__postprocessors', []) + self._pps['post_process']:
2981 info = self.run_pp(pp, info)
2982 info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
2983 del info['__files_to_move']
2984 for pp in self._pps['after_move']:
2985 info = self.run_pp(pp, info)
2986 return info
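# Rough order of the whole pipeline: 'pre_process' PPs run before download;
# afterwards the per-video '__postprocessors' and the 'post_process' PPs run,
# MoveFilesAfterDownloadPP relocates temporary files, and the 'after_move' PPs
# run last. A sketch of registering one (MyPP is a hypothetical class):
#
#   ydl.add_post_processor(MyPP(), when='post_process')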
2987
2988 def _make_archive_id(self, info_dict):
2989 video_id = info_dict.get('id')
2990 if not video_id:
2991 return
2992 # Future-proof against any change in case
2993 # and for backwards compatibility with prior versions
2994 extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist
2995 if extractor is None:
2996 url = str_or_none(info_dict.get('url'))
2997 if not url:
2998 return
2999 # Try to find matching extractor for the URL and take its ie_key
3000 for ie_key, ie in self._ies.items():
3001 if ie.suitable(url):
3002 extractor = ie_key
3003 break
3004 else:
3005 return
3006 return '%s %s' % (extractor.lower(), video_id)
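# e.g. a video with extractor_key 'Youtube' and id 'BaW_jenozKc' yields the
# archive id 'youtube BaW_jenozKc' (lowercased extractor key, space, video id)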
3007
3008 def in_download_archive(self, info_dict):
3009 fn = self.params.get('download_archive')
3010 if fn is None:
3011 return False
3012
3013 vid_id = self._make_archive_id(info_dict)
3014 if not vid_id:
3015 return False # Incomplete video information
3016
3017 return vid_id in self.archive
3018
3019 def record_download_archive(self, info_dict):
3020 fn = self.params.get('download_archive')
3021 if fn is None:
3022 return
3023 vid_id = self._make_archive_id(info_dict)
3024 assert vid_id
3025 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
3026 archive_file.write(vid_id + '\n')
3027 self.archive.add(vid_id)
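# The download archive is a plain-text file holding one archive id per line,
# e.g. (hypothetical contents):
#
#   youtube BaW_jenozKc
#   vimeo 56015672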
3028
3029 @staticmethod
3030 def format_resolution(format, default='unknown'):
3031 if format.get('vcodec') == 'none':
3032 if format.get('acodec') == 'none':
3033 return 'images'
3034 return 'audio only'
3035 if format.get('resolution') is not None:
3036 return format['resolution']
3037 if format.get('width') and format.get('height'):
3038 res = '%dx%d' % (format['width'], format['height'])
3039 elif format.get('height'):
3040 res = '%sp' % format['height']
3041 elif format.get('width'):
3042 res = '%dx?' % format['width']
3043 else:
3044 res = default
3045 return res
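# A sketch of the mapping (hypothetical format dicts):
#
#   {'vcodec': 'none', 'acodec': 'none'} -> 'images'
#   {'vcodec': 'none', 'acodec': 'mp4a'} -> 'audio only'
#   {'width': 1920, 'height': 1080}      -> '1920x1080'
#   {'height': 720}                      -> '720p'
#   {'width': 640}                       -> '640x?'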
3046
3047 def _format_note(self, fdict):
3048 res = ''
3049 if fdict.get('ext') in ['f4f', 'f4m']:
3050 res += '(unsupported) '
3051 if fdict.get('language'):
3052 if res:
3053 res += ' '
3054 res += '[%s] ' % fdict['language']
3055 if fdict.get('format_note') is not None:
3056 res += fdict['format_note'] + ' '
3057 if fdict.get('tbr') is not None:
3058 res += '%4dk ' % fdict['tbr']
3059 if fdict.get('container') is not None:
3060 if res:
3061 res += ', '
3062 res += '%s container' % fdict['container']
3063 if (fdict.get('vcodec') is not None
3064 and fdict.get('vcodec') != 'none'):
3065 if res:
3066 res += ', '
3067 res += fdict['vcodec']
3068 if fdict.get('vbr') is not None:
3069 res += '@'
3070 elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
3071 res += 'video@'
3072 if fdict.get('vbr') is not None:
3073 res += '%4dk' % fdict['vbr']
3074 if fdict.get('fps') is not None:
3075 if res:
3076 res += ', '
3077 res += '%sfps' % fdict['fps']
3078 if fdict.get('acodec') is not None:
3079 if res:
3080 res += ', '
3081 if fdict['acodec'] == 'none':
3082 res += 'video only'
3083 else:
3084 res += '%-5s' % fdict['acodec']
3085 elif fdict.get('abr') is not None:
3086 if res:
3087 res += ', '
3088 res += 'audio'
3089 if fdict.get('abr') is not None:
3090 res += '@%3dk' % fdict['abr']
3091 if fdict.get('asr') is not None:
3092 res += ' (%5dHz)' % fdict['asr']
3093 if fdict.get('filesize') is not None:
3094 if res:
3095 res += ', '
3096 res += format_bytes(fdict['filesize'])
3097 elif fdict.get('filesize_approx') is not None:
3098 if res:
3099 res += ', '
3100 res += '~' + format_bytes(fdict['filesize_approx'])
3101 return res
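# e.g. the hypothetical fdict {'tbr': 565, 'container': 'mp4'} renders as
# ' 565k , mp4 container'; each present field is appended in this fixed
# order, comma-separated once the string is non-empty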
3102
3103 def list_formats(self, info_dict):
3104 formats = info_dict.get('formats', [info_dict])
3105 new_format = (
3106 'list-formats' not in self.params.get('compat_opts', [])
3107 and self.params.get('listformats_table', True) is not False)
3108 if new_format:
3109 table = [
3110 [
3111 format_field(f, 'format_id'),
3112 format_field(f, 'ext'),
3113 self.format_resolution(f),
3114 format_field(f, 'fps', '%d'),
3115 '|',
3116 format_field(f, 'filesize', ' %s', func=format_bytes) + format_field(f, 'filesize_approx', '~%s', func=format_bytes),
3117 format_field(f, 'tbr', '%4dk'),
3118 shorten_protocol_name(f.get('protocol', '').replace('native', 'n')),
3119 '|',
3120 format_field(f, 'vcodec', default='unknown').replace('none', ''),
3121 format_field(f, 'vbr', '%4dk'),
3122 format_field(f, 'acodec', default='unknown').replace('none', ''),
3123 format_field(f, 'abr', '%3dk'),
3124 format_field(f, 'asr', '%5dHz'),
3125 ', '.join(filter(None, (
3126 'UNSUPPORTED' if f.get('ext') in ('f4f', 'f4m') else '',
3127 format_field(f, 'language', '[%s]'),
3128 format_field(f, 'format_note'),
3129 format_field(f, 'container', ignore=(None, f.get('ext'))),
3130 ))),
3131 ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
3132 header_line = ['ID', 'EXT', 'RESOLUTION', 'FPS', '|', ' FILESIZE', ' TBR', 'PROTO',
3133 '|', 'VCODEC', ' VBR', 'ACODEC', ' ABR', ' ASR', 'MORE INFO']
3134 else:
3135 table = [
3136 [
3137 format_field(f, 'format_id'),
3138 format_field(f, 'ext'),
3139 self.format_resolution(f),
3140 self._format_note(f)]
3141 for f in formats
3142 if f.get('preference') is None or f['preference'] >= -1000]
3143 header_line = ['format code', 'extension', 'resolution', 'note']
3144
3145 self.to_screen(
3146 '[info] Available formats for %s:' % info_dict['id'])
3147 self.to_stdout(render_table(
3148 header_line, table, delim=new_format, extraGap=(0 if new_format else 1), hideEmpty=new_format))
3149
3150 def list_thumbnails(self, info_dict):
3151 thumbnails = list(info_dict.get('thumbnails') or [])
3152 if not thumbnails:
3153 self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
3154 return
3155
3156 self.to_screen(
3157 '[info] Thumbnails for %s:' % info_dict['id'])
3158 self.to_stdout(render_table(
3159 ['ID', 'width', 'height', 'URL'],
3160 [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
3161
3162 def list_subtitles(self, video_id, subtitles, name='subtitles'):
3163 if not subtitles:
3164 self.to_screen('%s has no %s' % (video_id, name))
3165 return
3166 self.to_screen(
3167 'Available %s for %s:' % (name, video_id))
3168
3169 def _row(lang, formats):
3170 exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats)))
3171 if len(set(names)) == 1:
3172 names = [] if names[0] == 'unknown' else names[:1]
3173 return [lang, ', '.join(names), ', '.join(exts)]
3174
3175 self.to_stdout(render_table(
3176 ['Language', 'Name', 'Formats'],
3177 [_row(lang, formats) for lang, formats in subtitles.items()],
3178 hideEmpty=True))
3179
3180 def urlopen(self, req):
3181 """ Start an HTTP download """
3182 if isinstance(req, compat_basestring):
3183 req = sanitized_Request(req)
3184 return self._opener.open(req, timeout=self._socket_timeout)
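# Usage sketch (hypothetical URL, `ydl` a configured instance); plain strings
# are wrapped in a sanitized Request and sent through the opener built by
# _setup_opener():
#
#   data = ydl.urlopen('https://example.com/thumbnail.jpg').read()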
3185
3186 def print_debug_header(self):
3187 if not self.params.get('verbose'):
3188 return
3189
3190 stdout_encoding = getattr(
3191 sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
3192 encoding_str = (
3193 '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
3194 locale.getpreferredencoding(),
3195 sys.getfilesystemencoding(),
3196 stdout_encoding,
3197 self.get_encoding()))
3198 write_string(encoding_str, encoding=None)
3199
3200 source = detect_variant()
3201 self._write_string('[debug] yt-dlp version %s%s\n' % (__version__, '' if source == 'unknown' else f' ({source})'))
3202 if _LAZY_LOADER:
3203 self._write_string('[debug] Lazy loading extractors enabled\n')
3204 if _PLUGIN_CLASSES:
3205 self._write_string(
3206 '[debug] Plugin Extractors: %s\n' % [ie.ie_key() for ie in _PLUGIN_CLASSES])
3207 if self.params.get('compat_opts'):
3208 self._write_string(
3209 '[debug] Compatibility options: %s\n' % ', '.join(self.params.get('compat_opts')))
3210 try:
3211 sp = subprocess.Popen(
3212 ['git', 'rev-parse', '--short', 'HEAD'],
3213 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
3214 cwd=os.path.dirname(os.path.abspath(__file__)))
3215 out, err = process_communicate_or_kill(sp)
3216 out = out.decode().strip()
3217 if re.fullmatch('[0-9a-f]+', out):
3218 self._write_string('[debug] Git HEAD: %s\n' % out)
3219 except Exception:
3220 try:
3221 sys.exc_clear()  # Python 2 only; raises on Python 3, hence the surrounding try/except
3222 except Exception:
3223 pass
3224
3225 def python_implementation():
3226 impl_name = platform.python_implementation()
3227 if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'):
3228 return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3]
3229 return impl_name
3230
3231 self._write_string('[debug] Python version %s (%s %s) - %s\n' % (
3232 platform.python_version(),
3233 python_implementation(),
3234 platform.architecture()[0],
3235 platform_name()))
3236
3237 exe_versions = FFmpegPostProcessor.get_versions(self)
3238 exe_versions['rtmpdump'] = rtmpdump_version()
3239 exe_versions['phantomjs'] = PhantomJSwrapper._version()
3240 exe_str = ', '.join(
3241 f'{exe} {v}' for exe, v in sorted(exe_versions.items()) if v
3242 ) or 'none'
3243 self._write_string('[debug] exe versions: %s\n' % exe_str)
3244
3245 from .downloader.websocket import has_websockets
3246 from .postprocessor.embedthumbnail import has_mutagen
3247 from .cookies import SQLITE_AVAILABLE, KEYRING_AVAILABLE
3248
3249 lib_str = ', '.join(sorted(filter(None, (
3250 compat_pycrypto_AES and compat_pycrypto_AES.__name__.split('.')[0],
3251 has_websockets and 'websockets',
3252 has_mutagen and 'mutagen',
3253 SQLITE_AVAILABLE and 'sqlite',
3254 KEYRING_AVAILABLE and 'keyring',
3255 )))) or 'none'
3256 self._write_string('[debug] Optional libraries: %s\n' % lib_str)
3257
3258 proxy_map = {}
3259 for handler in self._opener.handlers:
3260 if hasattr(handler, 'proxies'):
3261 proxy_map.update(handler.proxies)
3262 self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
3263
3264 if self.params.get('call_home', False):
3265 ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
3266 self._write_string('[debug] Public IP address: %s\n' % ipaddr)
3267 return  # Note: the version check below is unreachable; yt-dlp does not use yt-dl.org for updates
3268 latest_version = self.urlopen(
3269 'https://yt-dl.org/latest/version').read().decode('utf-8')
3270 if version_tuple(latest_version) > version_tuple(__version__):
3271 self.report_warning(
3272 'You are using an outdated version (newest version: %s)! '
3273 'See https://yt-dl.org/update if you need help updating.' %
3274 latest_version)
3275
3276 def _setup_opener(self):
3277 timeout_val = self.params.get('socket_timeout')
3278 self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
3279
3280 opts_cookiesfrombrowser = self.params.get('cookiesfrombrowser')
3281 opts_cookiefile = self.params.get('cookiefile')
3282 opts_proxy = self.params.get('proxy')
3283
3284 self.cookiejar = load_cookies(opts_cookiefile, opts_cookiesfrombrowser, self)
3285
3286 cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
3287 if opts_proxy is not None:
3288 if opts_proxy == '':
3289 proxies = {}
3290 else:
3291 proxies = {'http': opts_proxy, 'https': opts_proxy}
3292 else:
3293 proxies = compat_urllib_request.getproxies()
3294 # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805)
3295 if 'http' in proxies and 'https' not in proxies:
3296 proxies['https'] = proxies['http']
3297 proxy_handler = PerRequestProxyHandler(proxies)
3298
3299 debuglevel = 1 if self.params.get('debug_printtraffic') else 0
3300 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
3301 ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
3302 redirect_handler = YoutubeDLRedirectHandler()
3303 data_handler = compat_urllib_request_DataHandler()
3304
3305 # When passing our own FileHandler instance, build_opener won't add the
3306 # default FileHandler; this lets us disable the file:// protocol, which
3307 # could otherwise be abused for malicious purposes (see
3308 # https://github.com/ytdl-org/youtube-dl/issues/8227)
3309 file_handler = compat_urllib_request.FileHandler()
3310
3311 def file_open(*args, **kwargs):
3312 raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in yt-dlp for security reasons')
3313 file_handler.file_open = file_open
3314
3315 opener = compat_urllib_request.build_opener(
3316 proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)
3317
3318 # Delete the default user-agent header, which would otherwise apply in
3319 # cases where our custom HTTP handler doesn't come into play
3320 # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
3321 opener.addheaders = []
3322 self._opener = opener
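# e.g. params {'proxy': 'socks5://127.0.0.1:1080'} routes every request
# through that proxy, while {'proxy': ''} disables proxies altogether
# (an explicit empty mapping overrides any environment proxy settings)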
3323
3324 def encode(self, s):
3325 if isinstance(s, bytes):
3326 return s # Already encoded
3327
3328 try:
3329 return s.encode(self.get_encoding())
3330 except UnicodeEncodeError as err:
3331 err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
3332 raise
3333
3334 def get_encoding(self):
3335 encoding = self.params.get('encoding')
3336 if encoding is None:
3337 encoding = preferredencoding()
3338 return encoding
3339
3340 def _write_info_json(self, label, ie_result, infofn):
3341 ''' Write infojson and return True = written, False = skipped, None = error '''
3342 if not self.params.get('writeinfojson'):
3343 return False
3344 elif not infofn:
3345 self.write_debug(f'Skipping writing {label} infojson')
3346 return False
3347 elif not self._ensure_dir_exists(infofn):
3348 return None
3349 elif not self.params.get('overwrites', True) and os.path.exists(infofn):
3350 self.to_screen(f'[info] {label.title()} metadata is already present')
3351 else:
3352 self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}')
3353 try:
3354 write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn)
3355 except (OSError, IOError):
3356 self.report_error(f'Cannot write {label} metadata to JSON file {infofn}')
3357 return None
3358 return True
3359
3360 def _write_description(self, label, ie_result, descfn):
3361 ''' Write description and return True = written, False = skipped, None = error '''
3362 if not self.params.get('writedescription'):
3363 return False
3364 elif not descfn:
3365 self.write_debug(f'Skipping writing {label} description')
3366 return False
3367 elif not self._ensure_dir_exists(descfn):
3368 return None
3369 elif not self.params.get('overwrites', True) and os.path.exists(descfn):
3370 self.to_screen(f'[info] {label.title()} description is already present')
3371 elif ie_result.get('description') is None:
3372 self.report_warning(f'There\'s no {label} description to write')
3373 return False
3374 else:
3375 try:
3376 self.to_screen(f'[info] Writing {label} description to: {descfn}')
3377 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
3378 descfile.write(ie_result['description'])
3379 except (OSError, IOError):
3380 self.report_error(f'Cannot write {label} description file {descfn}')
3381 return None
3382 return True
3383
3384 def _write_subtitles(self, info_dict, filename):
3385 ''' Write subtitles to file and return list of (sub_filename, final_sub_filename); or None if error '''
3386 ret = []
3387 subtitles = info_dict.get('requested_subtitles')
3388 if not subtitles or not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')):
3389 # Subtitle download errors are already handled in the relevant IE,
3390 # so processing silently continues for IEs without subtitle support
3391 return ret
3392
3393 sub_filename_base = self.prepare_filename(info_dict, 'subtitle')
3394 if not sub_filename_base:
3395 self.to_screen('[info] Skipping writing video subtitles')
3396 return ret
3397 for sub_lang, sub_info in subtitles.items():
3398 sub_format = sub_info['ext']
3399 sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
3400 sub_filename_final = subtitles_filename(sub_filename_base, sub_lang, sub_format, info_dict.get('ext'))
3401 if not self.params.get('overwrites', True) and os.path.exists(sub_filename):
3402 self.to_screen(f'[info] Video subtitle {sub_lang}.{sub_format} is already present')
3403 sub_info['filepath'] = sub_filename
3404 ret.append((sub_filename, sub_filename_final))
3405 continue
3406
3407 self.to_screen(f'[info] Writing video subtitles to: {sub_filename}')
3408 if sub_info.get('data') is not None:
3409 try:
3410 # Use newline='' to prevent conversion of newline characters
3411 # See https://github.com/ytdl-org/youtube-dl/issues/10268
3412 with io.open(sub_filename, 'w', encoding='utf-8', newline='') as subfile:
3413 subfile.write(sub_info['data'])
3414 sub_info['filepath'] = sub_filename
3415 ret.append((sub_filename, sub_filename_final))
3416 continue
3417 except (OSError, IOError):
3418 self.report_error(f'Cannot write video subtitles file {sub_filename}')
3419 return None
3420
3421 try:
3422 sub_copy = sub_info.copy()
3423 sub_copy.setdefault('http_headers', info_dict.get('http_headers'))
3424 self.dl(sub_filename, sub_copy, subtitle=True)
3425 sub_info['filepath'] = sub_filename
3426 ret.append((sub_filename, sub_filename_final))
3427 except (ExtractorError, IOError, OSError, ValueError) + network_exceptions as err:
3428 self.report_warning(f'Unable to download video subtitles for {sub_lang!r}: {err}')
3429 continue
3430 return ret
3431
3432 def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None):
3433 ''' Write thumbnails to file and return list of (thumb_filename, final_thumb_filename) '''
3434 write_all = self.params.get('write_all_thumbnails', False)
3435 thumbnails, ret = [], []
3436 if write_all or self.params.get('writethumbnail', False):
3437 thumbnails = info_dict.get('thumbnails') or []
3438 multiple = write_all and len(thumbnails) > 1
3439
3440 if thumb_filename_base is None:
3441 thumb_filename_base = filename
3442 if thumbnails and not thumb_filename_base:
3443 self.write_debug(f'Skipping writing {label} thumbnail')
3444 return ret
3445
3446 for t in thumbnails[::-1]:
3447 thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg')
3448 thumb_display_id = f'{label} thumbnail' + (f' {t["id"]}' if multiple else '')
3449 thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext'))
3450 thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext'))
3451
3452 if not self.params.get('overwrites', True) and os.path.exists(thumb_filename):
3453 ret.append((thumb_filename, thumb_filename_final))
3454 t['filepath'] = thumb_filename
3455 self.to_screen(f'[info] {thumb_display_id.title()} is already present')
3456 else:
3457 self.to_screen(f'[info] Downloading {thumb_display_id} ...')
3458 try:
3459 uf = self.urlopen(t['url'])
3460 self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}')
3461 with open(encodeFilename(thumb_filename), 'wb') as thumbf:
3462 shutil.copyfileobj(uf, thumbf)
3463 ret.append((thumb_filename, thumb_filename_final))
3464 t['filepath'] = thumb_filename
3465 except network_exceptions as err:
3466 self.report_warning(f'Unable to download {thumb_display_id}: {err}')
3467 if ret and not write_all:
3468 break
3469 return ret
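# Note: thumbnails are tried starting from the highest-preference entry
# (hence the [::-1]); unless 'write_all_thumbnails' is set, iteration stops
# after the first thumbnail that is already present or downloads successfully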