]> jfr.im git - yt-dlp.git/blob - yt_dlp/YoutubeDL.py
[docs,cleanup] Some minor refactoring and improve docs
[yt-dlp.git] / yt_dlp / YoutubeDL.py
1 #!/usr/bin/env python3
2 # coding: utf-8
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import copy
9 import datetime
10 import errno
11 import fileinput
12 import io
13 import itertools
14 import json
15 import locale
16 import operator
17 import os
18 import platform
19 import re
20 import shutil
21 import subprocess
22 import sys
23 import tempfile
24 import time
25 import tokenize
26 import traceback
27 import random
28 import unicodedata
29
30 from string import ascii_letters
31
32 from .compat import (
33 compat_basestring,
34 compat_get_terminal_size,
35 compat_kwargs,
36 compat_numeric_types,
37 compat_os_name,
38 compat_pycrypto_AES,
39 compat_shlex_quote,
40 compat_str,
41 compat_tokenize_tokenize,
42 compat_urllib_error,
43 compat_urllib_request,
44 compat_urllib_request_DataHandler,
45 )
46 from .cookies import load_cookies
47 from .utils import (
48 age_restricted,
49 args_to_str,
50 ContentTooShortError,
51 date_from_str,
52 DateRange,
53 DEFAULT_OUTTMPL,
54 determine_ext,
55 determine_protocol,
56 DOT_DESKTOP_LINK_TEMPLATE,
57 DOT_URL_LINK_TEMPLATE,
58 DOT_WEBLOC_LINK_TEMPLATE,
59 DownloadError,
60 encode_compat_str,
61 encodeFilename,
62 EntryNotInPlaylist,
63 error_to_compat_str,
64 ExistingVideoReached,
65 expand_path,
66 ExtractorError,
67 float_or_none,
68 format_bytes,
69 format_field,
70 STR_FORMAT_RE_TMPL,
71 STR_FORMAT_TYPES,
72 formatSeconds,
73 GeoRestrictedError,
74 HEADRequest,
75 int_or_none,
76 iri_to_uri,
77 ISO3166Utils,
78 LazyList,
79 locked_file,
80 make_dir,
81 make_HTTPS_handler,
82 MaxDownloadsReached,
83 network_exceptions,
84 orderedSet,
85 OUTTMPL_TYPES,
86 PagedList,
87 parse_filesize,
88 PerRequestProxyHandler,
89 platform_name,
90 PostProcessingError,
91 preferredencoding,
92 prepend_extension,
93 process_communicate_or_kill,
94 register_socks_protocols,
95 RejectedVideoReached,
96 render_table,
97 replace_extension,
98 SameFileError,
99 sanitize_filename,
100 sanitize_path,
101 sanitize_url,
102 sanitized_Request,
103 std_headers,
104 str_or_none,
105 strftime_or_none,
106 subtitles_filename,
107 ThrottledDownload,
108 to_high_limit_path,
109 traverse_obj,
110 try_get,
111 UnavailableVideoError,
112 url_basename,
113 variadic,
114 version_tuple,
115 write_json_file,
116 write_string,
117 YoutubeDLCookieProcessor,
118 YoutubeDLHandler,
119 YoutubeDLRedirectHandler,
120 )
121 from .cache import Cache
122 from .extractor import (
123 gen_extractor_classes,
124 get_info_extractor,
125 _LAZY_LOADER,
126 _PLUGIN_CLASSES as plugin_extractors
127 )
128 from .extractor.openload import PhantomJSwrapper
129 from .downloader import (
130 FFmpegFD,
131 get_suitable_downloader,
132 shorten_protocol_name
133 )
134 from .downloader.rtmp import rtmpdump_version
135 from .postprocessor import (
136 get_postprocessor,
137 FFmpegFixupDurationPP,
138 FFmpegFixupM3u8PP,
139 FFmpegFixupM4aPP,
140 FFmpegFixupStretchedPP,
141 FFmpegFixupTimestampPP,
142 FFmpegMergerPP,
143 FFmpegPostProcessor,
144 MoveFilesAfterDownloadPP,
145 _PLUGIN_CLASSES as plugin_postprocessors
146 )
147 from .update import detect_variant
148 from .version import __version__
149
150 if compat_os_name == 'nt':
151 import ctypes
152
153
154 class YoutubeDL(object):
155 """YoutubeDL class.
156
157 YoutubeDL objects are the ones responsible of downloading the
158 actual video file and writing it to disk if the user has requested
159 it, among some other tasks. In most cases there should be one per
160 program. As, given a video URL, the downloader doesn't know how to
161 extract all the needed information, task that InfoExtractors do, it
162 has to pass the URL to one of them.
163
164 For this, YoutubeDL objects have a method that allows
165 InfoExtractors to be registered in a given order. When it is passed
166 a URL, the YoutubeDL object handles it to the first InfoExtractor it
167 finds that reports being able to handle it. The InfoExtractor extracts
168 all the information about the video or videos the URL refers to, and
169 YoutubeDL process the extracted information, possibly using a File
170 Downloader to download the video.
171
172 YoutubeDL objects accept a lot of parameters. In order not to saturate
173 the object constructor with arguments, it receives a dictionary of
174 options instead. These options are available through the params
175 attribute for the InfoExtractors to use. The YoutubeDL also
176 registers itself as the downloader in charge for the InfoExtractors
177 that are added to it, so this is a "mutual registration".
178
179 Available options:
180
181 username: Username for authentication purposes.
182 password: Password for authentication purposes.
183 videopassword: Password for accessing a video.
184 ap_mso: Adobe Pass multiple-system operator identifier.
185 ap_username: Multiple-system operator account username.
186 ap_password: Multiple-system operator account password.
187 usenetrc: Use netrc for authentication instead.
188 verbose: Print additional info to stdout.
189 quiet: Do not print messages to stdout.
190 no_warnings: Do not print out anything for warnings.
191 forceprint: A list of templates to force print
192 forceurl: Force printing final URL. (Deprecated)
193 forcetitle: Force printing title. (Deprecated)
194 forceid: Force printing ID. (Deprecated)
195 forcethumbnail: Force printing thumbnail URL. (Deprecated)
196 forcedescription: Force printing description. (Deprecated)
197 forcefilename: Force printing final filename. (Deprecated)
198 forceduration: Force printing duration. (Deprecated)
199 forcejson: Force printing info_dict as JSON.
200 dump_single_json: Force printing the info_dict of the whole playlist
201 (or video) as a single JSON line.
202 force_write_download_archive: Force writing download archive regardless
203 of 'skip_download' or 'simulate'.
204 simulate: Do not download the video files. If unset (or None),
205 simulate only if listsubtitles, listformats or list_thumbnails is used
206 format: Video format code. see "FORMAT SELECTION" for more details.
207 allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded.
208 ignore_no_formats_error: Ignore "No video formats" error. Useful for
209 extracting metadata even if the video is not actually
210 available for download (experimental)
211 format_sort: How to sort the video formats. see "Sorting Formats"
212 for more details.
213 format_sort_force: Force the given format_sort. see "Sorting Formats"
214 for more details.
215 allow_multiple_video_streams: Allow multiple video streams to be merged
216 into a single file
217 allow_multiple_audio_streams: Allow multiple audio streams to be merged
218 into a single file
219 check_formats Whether to test if the formats are downloadable.
220 Can be True (check all), False (check none)
221 or None (check only if requested by extractor)
222 paths: Dictionary of output paths. The allowed keys are 'home'
223 'temp' and the keys of OUTTMPL_TYPES (in utils.py)
224 outtmpl: Dictionary of templates for output names. Allowed keys
225 are 'default' and the keys of OUTTMPL_TYPES (in utils.py).
226 For compatibility with youtube-dl, a single string can also be used
227 outtmpl_na_placeholder: Placeholder for unavailable meta fields.
228 restrictfilenames: Do not allow "&" and spaces in file names
229 trim_file_name: Limit length of filename (extension excluded)
230 windowsfilenames: Force the filenames to be windows compatible
231 ignoreerrors: Do not stop on download/postprocessing errors.
232 Can be 'only_download' to ignore only download errors.
233 Default is 'only_download' for CLI, but False for API
234 skip_playlist_after_errors: Number of allowed failures until the rest of
235 the playlist is skipped
236 force_generic_extractor: Force downloader to use the generic extractor
237 overwrites: Overwrite all video and metadata files if True,
238 overwrite only non-video files if None
239 and don't overwrite any file if False
240 For compatibility with youtube-dl,
241 "nooverwrites" may also be used instead
242 playliststart: Playlist item to start at.
243 playlistend: Playlist item to end at.
244 playlist_items: Specific indices of playlist to download.
245 playlistreverse: Download playlist items in reverse order.
246 playlistrandom: Download playlist items in random order.
247 matchtitle: Download only matching titles.
248 rejecttitle: Reject downloads for matching titles.
249 logger: Log messages to a logging.Logger instance.
250 logtostderr: Log messages to stderr instead of stdout.
251 writedescription: Write the video description to a .description file
252 writeinfojson: Write the video description to a .info.json file
253 clean_infojson: Remove private fields from the infojson
254 getcomments: Extract video comments. This will not be written to disk
255 unless writeinfojson is also given
256 writeannotations: Write the video annotations to a .annotations.xml file
257 writethumbnail: Write the thumbnail image to a file
258 allow_playlist_files: Whether to write playlists' description, infojson etc
259 also to disk when using the 'write*' options
260 write_all_thumbnails: Write all thumbnail formats to files
261 writelink: Write an internet shortcut file, depending on the
262 current platform (.url/.webloc/.desktop)
263 writeurllink: Write a Windows internet shortcut file (.url)
264 writewebloclink: Write a macOS internet shortcut file (.webloc)
265 writedesktoplink: Write a Linux internet shortcut file (.desktop)
266 writesubtitles: Write the video subtitles to a file
267 writeautomaticsub: Write the automatically generated subtitles to a file
268 allsubtitles: Deprecated - Use subtitleslangs = ['all']
269 Downloads all the subtitles of the video
270 (requires writesubtitles or writeautomaticsub)
271 listsubtitles: Lists all available subtitles for the video
272 subtitlesformat: The format code for subtitles
273 subtitleslangs: List of languages of the subtitles to download (can be regex).
274 The list may contain "all" to refer to all the available
275 subtitles. The language can be prefixed with a "-" to
276 exclude it from the requested languages. Eg: ['all', '-live_chat']
277 keepvideo: Keep the video file after post-processing
278 daterange: A DateRange object, download only if the upload_date is in the range.
279 skip_download: Skip the actual download of the video file
280 cachedir: Location of the cache files in the filesystem.
281 False to disable filesystem cache.
282 noplaylist: Download single video instead of a playlist if in doubt.
283 age_limit: An integer representing the user's age in years.
284 Unsuitable videos for the given age are skipped.
285 min_views: An integer representing the minimum view count the video
286 must have in order to not be skipped.
287 Videos without view count information are always
288 downloaded. None for no limit.
289 max_views: An integer representing the maximum view count.
290 Videos that are more popular than that are not
291 downloaded.
292 Videos without view count information are always
293 downloaded. None for no limit.
294 download_archive: File name of a file where all downloads are recorded.
295 Videos already present in the file are not downloaded
296 again.
297 break_on_existing: Stop the download process after attempting to download a
298 file that is in the archive.
299 break_on_reject: Stop the download process when encountering a video that
300 has been filtered out.
301 cookiefile: File name where cookies should be read from and dumped to
302 cookiesfrombrowser: A tuple containing the name of the browser and the profile
303 name/path from where cookies are loaded.
304 Eg: ('chrome', ) or ('vivaldi', 'default')
305 nocheckcertificate:Do not verify SSL certificates
306 prefer_insecure: Use HTTP instead of HTTPS to retrieve information.
307 At the moment, this is only supported by YouTube.
308 proxy: URL of the proxy server to use
309 geo_verification_proxy: URL of the proxy to use for IP address verification
310 on geo-restricted sites.
311 socket_timeout: Time to wait for unresponsive hosts, in seconds
312 bidi_workaround: Work around buggy terminals without bidirectional text
313 support, using fribidi
314 debug_printtraffic:Print out sent and received HTTP traffic
315 include_ads: Download ads as well
316 default_search: Prepend this string if an input url is not valid.
317 'auto' for elaborate guessing
318 encoding: Use this encoding instead of the system-specified.
319 extract_flat: Do not resolve URLs, return the immediate result.
320 Pass in 'in_playlist' to only show this behavior for
321 playlist items.
322 postprocessors: A list of dictionaries, each with an entry
323 * key: The name of the postprocessor. See
324 yt_dlp/postprocessor/__init__.py for a list.
325 * when: When to run the postprocessor. Can be one of
326 pre_process|before_dl|post_process|after_move.
327 Assumed to be 'post_process' if not given
328 post_hooks: A list of functions that get called as the final step
329 for each video file, after all postprocessors have been
330 called. The filename will be passed as the only argument.
331 progress_hooks: A list of functions that get called on download
332 progress, with a dictionary with the entries
333 * status: One of "downloading", "error", or "finished".
334 Check this first and ignore unknown values.
335 * info_dict: The extracted info_dict
336
337 If status is one of "downloading", or "finished", the
338 following properties may also be present:
339 * filename: The final filename (always present)
340 * tmpfilename: The filename we're currently writing to
341 * downloaded_bytes: Bytes on disk
342 * total_bytes: Size of the whole file, None if unknown
343 * total_bytes_estimate: Guess of the eventual file size,
344 None if unavailable.
345 * elapsed: The number of seconds since download started.
346 * eta: The estimated time in seconds, None if unknown
347 * speed: The download speed in bytes/second, None if
348 unknown
349 * fragment_index: The counter of the currently
350 downloaded video fragment.
351 * fragment_count: The number of fragments (= individual
352 files that will be merged)
353
354 Progress hooks are guaranteed to be called at least once
355 (with status "finished") if the download is successful.
356 merge_output_format: Extension to use when merging formats.
357 final_ext: Expected final extension; used to detect when the file was
358 already downloaded and converted. "merge_output_format" is
359 replaced by this extension when given
360 fixup: Automatically correct known faults of the file.
361 One of:
362 - "never": do nothing
363 - "warn": only emit a warning
364 - "detect_or_warn": check whether we can do anything
365 about it, warn otherwise (default)
366 source_address: Client-side IP address to bind to.
367 call_home: Boolean, true iff we are allowed to contact the
368 yt-dlp servers for debugging. (BROKEN)
369 sleep_interval_requests: Number of seconds to sleep between requests
370 during extraction
371 sleep_interval: Number of seconds to sleep before each download when
372 used alone or a lower bound of a range for randomized
373 sleep before each download (minimum possible number
374 of seconds to sleep) when used along with
375 max_sleep_interval.
376 max_sleep_interval:Upper bound of a range for randomized sleep before each
377 download (maximum possible number of seconds to sleep).
378 Must only be used along with sleep_interval.
379 Actual sleep time will be a random float from range
380 [sleep_interval; max_sleep_interval].
381 sleep_interval_subtitles: Number of seconds to sleep before each subtitle download
382 listformats: Print an overview of available video formats and exit.
383 list_thumbnails: Print a table of all thumbnails and exit.
384 match_filter: A function that gets called with the info_dict of
385 every video.
386 If it returns a message, the video is ignored.
387 If it returns None, the video is downloaded.
388 match_filter_func in utils.py is one example for this.
389 no_color: Do not emit color codes in output.
390 geo_bypass: Bypass geographic restriction via faking X-Forwarded-For
391 HTTP header
392 geo_bypass_country:
393 Two-letter ISO 3166-2 country code that will be used for
394 explicit geographic restriction bypassing via faking
395 X-Forwarded-For HTTP header
396 geo_bypass_ip_block:
397 IP range in CIDR notation that will be used similarly to
398 geo_bypass_country
399
400 The following options determine which downloader is picked:
401 external_downloader: A dictionary of protocol keys and the executable of the
402 external downloader to use for it. The allowed protocols
403 are default|http|ftp|m3u8|dash|rtsp|rtmp|mms.
404 Set the value to 'native' to use the native downloader
405 hls_prefer_native: Deprecated - Use external_downloader = {'m3u8': 'native'}
406 or {'m3u8': 'ffmpeg'} instead.
407 Use the native HLS downloader instead of ffmpeg/avconv
408 if True, otherwise use ffmpeg/avconv if False, otherwise
409 use downloader suggested by extractor if None.
410 compat_opts: Compatibility options. See "Differences in default behavior".
411 The following options do not work when used through the API:
412 filename, abort-on-error, multistreams, no-live-chat,
413 no-clean-infojson, no-playlist-metafiles, no-keep-subs.
414 Refer __init__.py for their implementation
415
416 The following parameters are not used by YoutubeDL itself, they are used by
417 the downloader (see yt_dlp/downloader/common.py):
418 nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize,
419 max_filesize, test, noresizebuffer, retries, continuedl, noprogress, consoletitle,
420 xattr_set_filesize, external_downloader_args, hls_use_mpegts, http_chunk_size.
421
422 The following options are used by the post processors:
423 prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available,
424 otherwise prefer ffmpeg. (avconv support is deprecated)
425 ffmpeg_location: Location of the ffmpeg/avconv binary; either the path
426 to the binary or its containing directory.
427 postprocessor_args: A dictionary of postprocessor/executable keys (in lower case)
428 and a list of additional command-line arguments for the
429 postprocessor/executable. The dict can also have "PP+EXE" keys
430 which are used when the given exe is used by the given PP.
431 Use 'default' as the name for arguments to passed to all PP
432 For compatibility with youtube-dl, a single list of args
433 can also be used
434
435 The following options are used by the extractors:
436 extractor_retries: Number of times to retry for known errors
437 dynamic_mpd: Whether to process dynamic DASH manifests (default: True)
438 hls_split_discontinuity: Split HLS playlists to different formats at
439 discontinuities such as ad breaks (default: False)
440 extractor_args: A dictionary of arguments to be passed to the extractors.
441 See "EXTRACTOR ARGUMENTS" for details.
442 Eg: {'youtube': {'skip': ['dash', 'hls']}}
443 youtube_include_dash_manifest: Deprecated - Use extractor_args instead.
444 If True (default), DASH manifests and related
445 data will be downloaded and processed by extractor.
446 You can reduce network I/O by disabling it if you don't
447 care about DASH. (only for youtube)
448 youtube_include_hls_manifest: Deprecated - Use extractor_args instead.
449 If True (default), HLS manifests and related
450 data will be downloaded and processed by extractor.
451 You can reduce network I/O by disabling it if you don't
452 care about HLS. (only for youtube)
453 """
454
455 _NUMERIC_FIELDS = set((
456 'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
457 'timestamp', 'release_timestamp',
458 'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
459 'average_rating', 'comment_count', 'age_limit',
460 'start_time', 'end_time',
461 'chapter_number', 'season_number', 'episode_number',
462 'track_number', 'disc_number', 'release_year',
463 ))
464
465 params = None
466 _ies = {}
467 _pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
468 _printed_messages = set()
469 _first_webpage_request = True
470 _download_retcode = None
471 _num_downloads = None
472 _playlist_level = 0
473 _playlist_urls = set()
474 _screen_file = None
475
476 def __init__(self, params=None, auto_init=True):
477 """Create a FileDownloader object with the given options."""
478 if params is None:
479 params = {}
480 self._ies = {}
481 self._ies_instances = {}
482 self._pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
483 self._printed_messages = set()
484 self._first_webpage_request = True
485 self._post_hooks = []
486 self._progress_hooks = []
487 self._download_retcode = 0
488 self._num_downloads = 0
489 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
490 self._err_file = sys.stderr
491 self.params = {
492 # Default parameters
493 'nocheckcertificate': False,
494 }
495 self.params.update(params)
496 self.cache = Cache(self)
497
498 if sys.version_info < (3, 6):
499 self.report_warning(
500 'Python version %d.%d is not supported! Please update to Python 3.6 or above' % sys.version_info[:2])
501
502 if self.params.get('allow_unplayable_formats'):
503 self.report_warning(
504 'You have asked for unplayable formats to be listed/downloaded. '
505 'This is a developer option intended for debugging. '
506 'If you experience any issues while using this option, DO NOT open a bug report')
507
508 def check_deprecated(param, option, suggestion):
509 if self.params.get(param) is not None:
510 self.report_warning('%s is deprecated. Use %s instead' % (option, suggestion))
511 return True
512 return False
513
514 if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
515 if self.params.get('geo_verification_proxy') is None:
516 self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']
517
518 check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
519 check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
520 check_deprecated('useid', '--id', '-o "%(id)s.%(ext)s"')
521
522 for msg in self.params.get('warnings', []):
523 self.report_warning(msg)
524
525 if self.params.get('overwrites') is None:
526 self.params.pop('overwrites', None)
527 elif self.params.get('nooverwrites') is not None:
528 # nooverwrites was unnecessarily changed to overwrites
529 # in 0c3d0f51778b153f65c21906031c2e091fcfb641
530 # This ensures compatibility with both keys
531 self.params['overwrites'] = not self.params['nooverwrites']
532 else:
533 self.params['nooverwrites'] = not self.params['overwrites']
534
535 if params.get('bidi_workaround', False):
536 try:
537 import pty
538 master, slave = pty.openpty()
539 width = compat_get_terminal_size().columns
540 if width is None:
541 width_args = []
542 else:
543 width_args = ['-w', str(width)]
544 sp_kwargs = dict(
545 stdin=subprocess.PIPE,
546 stdout=slave,
547 stderr=self._err_file)
548 try:
549 self._output_process = subprocess.Popen(
550 ['bidiv'] + width_args, **sp_kwargs
551 )
552 except OSError:
553 self._output_process = subprocess.Popen(
554 ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
555 self._output_channel = os.fdopen(master, 'rb')
556 except OSError as ose:
557 if ose.errno == errno.ENOENT:
558 self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
559 else:
560 raise
561
562 if (sys.platform != 'win32'
563 and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
564 and not params.get('restrictfilenames', False)):
565 # Unicode filesystem API will throw errors (#1474, #13027)
566 self.report_warning(
567 'Assuming --restrict-filenames since file system encoding '
568 'cannot encode all characters. '
569 'Set the LC_ALL environment variable to fix this.')
570 self.params['restrictfilenames'] = True
571
572 self.outtmpl_dict = self.parse_outtmpl()
573
574 # Creating format selector here allows us to catch syntax errors before the extraction
575 self.format_selector = (
576 None if self.params.get('format') is None
577 else self.build_format_selector(self.params['format']))
578
579 self._setup_opener()
580
581 def preload_download_archive(fn):
582 """Preload the archive, if any is specified"""
583 if fn is None:
584 return False
585 self.write_debug('Loading archive file %r\n' % fn)
586 try:
587 with locked_file(fn, 'r', encoding='utf-8') as archive_file:
588 for line in archive_file:
589 self.archive.add(line.strip())
590 except IOError as ioe:
591 if ioe.errno != errno.ENOENT:
592 raise
593 return False
594 return True
595
596 self.archive = set()
597 preload_download_archive(self.params.get('download_archive'))
598
599 if auto_init:
600 self.print_debug_header()
601 self.add_default_info_extractors()
602
603 for pp_def_raw in self.params.get('postprocessors', []):
604 pp_def = dict(pp_def_raw)
605 when = pp_def.pop('when', 'post_process')
606 pp_class = get_postprocessor(pp_def.pop('key'))
607 pp = pp_class(self, **compat_kwargs(pp_def))
608 self.add_post_processor(pp, when=when)
609
610 for ph in self.params.get('post_hooks', []):
611 self.add_post_hook(ph)
612
613 for ph in self.params.get('progress_hooks', []):
614 self.add_progress_hook(ph)
615
616 register_socks_protocols()
617
618 def warn_if_short_id(self, argv):
619 # short YouTube ID starting with dash?
620 idxs = [
621 i for i, a in enumerate(argv)
622 if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
623 if idxs:
624 correct_argv = (
625 ['yt-dlp']
626 + [a for i, a in enumerate(argv) if i not in idxs]
627 + ['--'] + [argv[i] for i in idxs]
628 )
629 self.report_warning(
630 'Long argument string detected. '
631 'Use -- to separate parameters and URLs, like this:\n%s\n' %
632 args_to_str(correct_argv))
633
634 def add_info_extractor(self, ie):
635 """Add an InfoExtractor object to the end of the list."""
636 ie_key = ie.ie_key()
637 self._ies[ie_key] = ie
638 if not isinstance(ie, type):
639 self._ies_instances[ie_key] = ie
640 ie.set_downloader(self)
641
642 def _get_info_extractor_class(self, ie_key):
643 ie = self._ies.get(ie_key)
644 if ie is None:
645 ie = get_info_extractor(ie_key)
646 self.add_info_extractor(ie)
647 return ie
648
649 def get_info_extractor(self, ie_key):
650 """
651 Get an instance of an IE with name ie_key, it will try to get one from
652 the _ies list, if there's no instance it will create a new one and add
653 it to the extractor list.
654 """
655 ie = self._ies_instances.get(ie_key)
656 if ie is None:
657 ie = get_info_extractor(ie_key)()
658 self.add_info_extractor(ie)
659 return ie
660
661 def add_default_info_extractors(self):
662 """
663 Add the InfoExtractors returned by gen_extractors to the end of the list
664 """
665 for ie in gen_extractor_classes():
666 self.add_info_extractor(ie)
667
668 def add_post_processor(self, pp, when='post_process'):
669 """Add a PostProcessor object to the end of the chain."""
670 self._pps[when].append(pp)
671 pp.set_downloader(self)
672
673 def add_post_hook(self, ph):
674 """Add the post hook"""
675 self._post_hooks.append(ph)
676
677 def add_progress_hook(self, ph):
678 """Add the progress hook (currently only for the file downloader)"""
679 self._progress_hooks.append(ph)
680
681 def _bidi_workaround(self, message):
682 if not hasattr(self, '_output_channel'):
683 return message
684
685 assert hasattr(self, '_output_process')
686 assert isinstance(message, compat_str)
687 line_count = message.count('\n') + 1
688 self._output_process.stdin.write((message + '\n').encode('utf-8'))
689 self._output_process.stdin.flush()
690 res = ''.join(self._output_channel.readline().decode('utf-8')
691 for _ in range(line_count))
692 return res[:-len('\n')]
693
694 def _write_string(self, message, out=None, only_once=False):
695 if only_once:
696 if message in self._printed_messages:
697 return
698 self._printed_messages.add(message)
699 write_string(message, out=out, encoding=self.params.get('encoding'))
700
701 def to_stdout(self, message, skip_eol=False, quiet=False):
702 """Print message to stdout"""
703 if self.params.get('logger'):
704 self.params['logger'].debug(message)
705 elif not quiet or self.params.get('verbose'):
706 self._write_string(
707 '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')),
708 self._err_file if quiet else self._screen_file)
709
710 def to_stderr(self, message, only_once=False):
711 """Print message to stderr"""
712 assert isinstance(message, compat_str)
713 if self.params.get('logger'):
714 self.params['logger'].error(message)
715 else:
716 self._write_string('%s\n' % self._bidi_workaround(message), self._err_file, only_once=only_once)
717
718 def to_console_title(self, message):
719 if not self.params.get('consoletitle', False):
720 return
721 if compat_os_name == 'nt':
722 if ctypes.windll.kernel32.GetConsoleWindow():
723 # c_wchar_p() might not be necessary if `message` is
724 # already of type unicode()
725 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
726 elif 'TERM' in os.environ:
727 self._write_string('\033]0;%s\007' % message, self._screen_file)
728
729 def save_console_title(self):
730 if not self.params.get('consoletitle', False):
731 return
732 if self.params.get('simulate'):
733 return
734 if compat_os_name != 'nt' and 'TERM' in os.environ:
735 # Save the title on stack
736 self._write_string('\033[22;0t', self._screen_file)
737
738 def restore_console_title(self):
739 if not self.params.get('consoletitle', False):
740 return
741 if self.params.get('simulate'):
742 return
743 if compat_os_name != 'nt' and 'TERM' in os.environ:
744 # Restore the title from stack
745 self._write_string('\033[23;0t', self._screen_file)
746
747 def __enter__(self):
748 self.save_console_title()
749 return self
750
751 def __exit__(self, *args):
752 self.restore_console_title()
753
754 if self.params.get('cookiefile') is not None:
755 self.cookiejar.save(ignore_discard=True, ignore_expires=True)
756
757 def trouble(self, message=None, tb=None):
758 """Determine action to take when a download problem appears.
759
760 Depending on if the downloader has been configured to ignore
761 download errors or not, this method may throw an exception or
762 not when errors are found, after printing the message.
763
764 tb, if given, is additional traceback information.
765 """
766 if message is not None:
767 self.to_stderr(message)
768 if self.params.get('verbose'):
769 if tb is None:
770 if sys.exc_info()[0]: # if .trouble has been called from an except block
771 tb = ''
772 if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
773 tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
774 tb += encode_compat_str(traceback.format_exc())
775 else:
776 tb_data = traceback.format_list(traceback.extract_stack())
777 tb = ''.join(tb_data)
778 if tb:
779 self.to_stderr(tb)
780 if not self.params.get('ignoreerrors'):
781 if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
782 exc_info = sys.exc_info()[1].exc_info
783 else:
784 exc_info = sys.exc_info()
785 raise DownloadError(message, exc_info)
786 self._download_retcode = 1
787
788 def to_screen(self, message, skip_eol=False):
789 """Print message to stdout if not in quiet mode"""
790 self.to_stdout(
791 message, skip_eol, quiet=self.params.get('quiet', False))
792
def report_warning(self, message, only_once=False):
    '''
    Print the message to stderr, it will be prefixed with 'WARNING:'
    If stderr is a tty file the 'WARNING:' will be colored
    '''
    logger = self.params.get('logger')
    if logger is not None:
        logger.warning(message)
        return
    if self.params.get('no_warnings'):
        return
    # Colorize only for a real terminal, when color is not disabled, and not
    # on native Windows consoles (ANSI codes are not interpreted there)
    use_color = (not self.params.get('no_color')
                 and self._err_file.isatty()
                 and compat_os_name != 'nt')
    _msg_header = '\033[0;33mWARNING:\033[0m' if use_color else 'WARNING:'
    self.to_stderr('%s %s' % (_msg_header, message), only_once)
809
def report_error(self, message, tb=None):
    '''
    Do the same as trouble, but prefixes the message with 'ERROR:', colored
    in red if stderr is a tty file.
    '''
    use_color = (not self.params.get('no_color')
                 and self._err_file.isatty()
                 and compat_os_name != 'nt')
    _msg_header = '\033[0;31mERROR:\033[0m' if use_color else 'ERROR:'
    self.trouble('%s %s' % (_msg_header, message), tb)
821
def write_debug(self, message, only_once=False):
    '''Log debug message or Print message to stderr'''
    if not self.params.get('verbose', False):
        return
    message = '[debug] %s' % message
    logger = self.params.get('logger')
    if logger:
        logger.debug(message)
    else:
        self.to_stderr(message, only_once)
831
def report_file_already_downloaded(self, file_name):
    """Report file has already been fully downloaded."""
    try:
        self.to_screen('[download] %s has already been downloaded' % file_name)
    except UnicodeEncodeError:
        # file_name may not be representable in the console encoding;
        # fall back to a message without the name
        self.to_screen('[download] The file has already been downloaded')
838
def report_file_delete(self, file_name):
    """Report that existing file will be deleted."""
    try:
        self.to_screen('Deleting existing file %s' % file_name)
    except UnicodeEncodeError:
        # file_name may not be representable in the console encoding
        self.to_screen('Deleting existing file')
845
def raise_no_formats(self, info, forced=False):
    """Report that no usable formats were found for *info*.

    Raises ExtractorError unless the situation is 'expected'
    (ignore_no_formats_error set) and *forced* is False, in which case
    only a warning is printed.
    """
    has_drm = info.get('__has_drm')
    msg = 'This video is DRM protected' if has_drm else 'No video formats found!'
    expected = self.params.get('ignore_no_formats_error')
    if not forced and expected:
        self.report_warning(msg)
    else:
        raise ExtractorError(
            msg, video_id=info['id'], ie=info['extractor'],
            expected=has_drm or expected)
855
def parse_outtmpl(self):
    """Return the output-template dict, filling unset keys from DEFAULT_OUTTMPL
    and warning about bytes templates (a py2 leftover)."""
    outtmpl_dict = self.params.get('outtmpl', {})
    if not isinstance(outtmpl_dict, dict):
        # A bare template string means "the default template"
        outtmpl_dict = {'default': outtmpl_dict}
    for tmpl_key, tmpl_val in DEFAULT_OUTTMPL.items():
        if outtmpl_dict.get(tmpl_key) is None:
            outtmpl_dict[tmpl_key] = tmpl_val
    for key, val in outtmpl_dict.items():
        if isinstance(val, bytes):
            self.report_warning(
                'Parameter outtmpl is bytes, but should be a unicode string. '
                'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.')
    return outtmpl_dict
869
def get_output_path(self, dir_type='', filename=None):
    """Join the configured home path, the type-specific sub-path and
    *filename* into one sanitized output path."""
    paths = self.params.get('paths', {})
    assert isinstance(paths, dict)
    home = expand_path(paths.get('home', '').strip())
    subdir = expand_path(paths.get(dir_type, '').strip()) if dir_type else ''
    path = os.path.join(home, subdir, filename or '')

    # Temporary fix for #4787
    # 'Treat' all problem characters by passing filename through preferredencoding
    # to workaround encoding issues with subprocess on python2 @ Windows
    if sys.version_info < (3, 0) and sys.platform == 'win32':
        path = encodeFilename(path, True).decode(preferredencoding())
    return sanitize_path(path, force=self.params.get('windowsfilenames'))
884
@staticmethod
def _outtmpl_expandpath(outtmpl):
    # expand_path translates '%%' into '%' and '$$' into '$'
    # correspondingly that is not what we want since we need to keep
    # '%%' intact for template dict substitution step. Working around
    # with boundary-alike separator hack.
    sep = ''.join(random.choice(ascii_letters) for _ in range(32))
    guarded = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))

    # outtmpl should be expand_path'ed before template dict substitution
    # because meta fields may contain env variables we don't want to
    # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
    # title "Hello $PATH", we don't want `$PATH` to be expanded.
    return expand_path(guarded).replace(sep, '')
899
@staticmethod
def escape_outtmpl(outtmpl):
    ''' Escape any remaining strings like %s, %abc% etc. '''
    def _escape(mobj):
        # keys already prepared by prepare_outtmpl carry 'has_key' and are
        # left alone; any other stray '%' is doubled so that the later
        # '%'-substitution leaves it literal
        return ('' if mobj.group('has_key') else '%') + mobj.group(0)
    return re.sub(STR_FORMAT_RE_TMPL.format('', '(?![%(\0])'), _escape, outtmpl)
907
@classmethod
def validate_outtmpl(cls, outtmpl):
    ''' @return None or Exception object '''
    # Downgrade the extension format types (l/j/q/B/U) to plain 's' so the
    # template can be dry-run against a defaultdict of ints
    normalized = re.sub(
        STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBU]'),
        lambda mobj: f'{mobj.group(0)[:-1]}s',
        cls._outtmpl_expandpath(outtmpl))
    try:
        cls.escape_outtmpl(normalized) % collections.defaultdict(int)
    except ValueError as err:
        return err
    return None
920
def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None):
    """ Make the template and info_dict suitable for substitution : ydl.outtmpl_escape(outtmpl) % info_dict """
    info_dict.setdefault('epoch', int(time.time()))  # keep epoch consistent once set

    info_dict = dict(info_dict)  # Do not sanitize so as not to consume LazyList
    # Internal bookkeeping keys must never leak into templates
    for key in ('__original_infodict', '__postprocessors'):
        info_dict.pop(key, None)
    info_dict['duration_string'] = (  # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
        formatSeconds(info_dict['duration'], '-' if sanitize else ':')
        if info_dict.get('duration', None) is not None
        else None)
    info_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
    if info_dict.get('resolution') is None:
        info_dict['resolution'] = self.format_resolution(info_dict, default=None)

    # For fields playlist_index, playlist_autonumber and autonumber convert all occurrences
    # of %(field)s to %(field)0Nd for backward compatibility
    field_size_compat_map = {
        'playlist_index': len(str(info_dict.get('_last_playlist_index') or '')),
        'playlist_autonumber': len(str(info_dict.get('n_entries') or '')),
        'autonumber': self.params.get('autonumber_size') or 5,
    }

    # Maps the mangled key written into the returned template back to the
    # computed value; the caller does `template % TMPL_DICT`
    TMPL_DICT = {}
    EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBU]'))
    MATH_FUNCTIONS = {
        '+': float.__add__,
        '-': float.__sub__,
    }
    # Field is of the form key1.key2...
    # where keys (except first) can be string, int or slice
    FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)')
    MATH_FIELD_RE = r'''{field}|{num}'''.format(field=FIELD_RE, num=r'-?\d+(?:.\d+)?')
    MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys()))
    # Grammar of one template key: [-]field[+/-term...][>strftime][,alternate...][|default]
    INTERNAL_FORMAT_RE = re.compile(r'''(?x)
        (?P<negate>-)?
        (?P<fields>{field})
        (?P<maths>(?:{math_op}{math_field})*)
        (?:>(?P<strf_format>.+?))?
        (?P<alternate>(?<!\\),[^|)]+)?
        (?:\|(?P<default>.*?))?
        $'''.format(field=FIELD_RE, math_op=MATH_OPERATORS_RE, math_field=MATH_FIELD_RE))

    def _traverse_infodict(k):
        # Resolve a dotted path like 'formats.0.url' inside info_dict
        k = k.split('.')
        if k[0] == '':
            k.pop(0)
        return traverse_obj(info_dict, k, is_user_input=True, traverse_string=True)

    def get_value(mdict):
        # Evaluate one parsed key (groupdict of INTERNAL_FORMAT_RE) to a value
        # Object traversal
        value = _traverse_infodict(mdict['fields'])
        # Negative
        if mdict['negate']:
            value = float_or_none(value)
            if value is not None:
                value *= -1
        # Do maths
        offset_key = mdict['maths']
        if offset_key:
            value = float_or_none(value)
            operator = None
            # Alternate between consuming an operator and an operand
            while offset_key:
                item = re.match(
                    MATH_FIELD_RE if operator else MATH_OPERATORS_RE,
                    offset_key).group(0)
                offset_key = offset_key[len(item):]
                if operator is None:
                    operator = MATH_FUNCTIONS[item]
                    continue
                item, multiplier = (item[1:], -1) if item[0] == '-' else (item, 1)
                offset = float_or_none(item)
                if offset is None:
                    # Operand is itself a field name, not a literal number
                    offset = float_or_none(_traverse_infodict(item))
                try:
                    value = operator(value, multiplier * offset)
                except (TypeError, ZeroDivisionError):
                    return None
                operator = None
        # Datetime formatting
        if mdict['strf_format']:
            value = strftime_or_none(value, mdict['strf_format'].replace('\\,', ','))

        return value

    na = self.params.get('outtmpl_na_placeholder', 'NA')

    def _dumpjson_default(obj):
        # Make sets and LazyLists JSON-serializable for the %(...)j type
        if isinstance(obj, (set, LazyList)):
            return list(obj)
        raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')

    def create_key(outer_mobj):
        # re.sub callback: evaluate one %(key)X occurrence and rewrite it to a
        # mangled-key placeholder whose value lives in TMPL_DICT
        if not outer_mobj.group('has_key'):
            return f'%{outer_mobj.group(0)}'
        key = outer_mobj.group('key')
        mobj = re.match(INTERNAL_FORMAT_RE, key)
        initial_field = mobj.group('fields').split('.')[-1] if mobj else ''
        value, default = None, na
        # Walk the comma-separated alternates until one yields a value
        while mobj:
            mobj = mobj.groupdict()
            default = mobj['default'] if mobj['default'] is not None else default
            value = get_value(mobj)
            if value is None and mobj['alternate']:
                mobj = re.match(INTERNAL_FORMAT_RE, mobj['alternate'][1:])
            else:
                break

        fmt = outer_mobj.group('format')
        if fmt == 's' and value is not None and key in field_size_compat_map.keys():
            # Legacy zero-padding for index-like fields (see comment above)
            fmt = '0{:d}d'.format(field_size_compat_map[key])

        value = default if value is None else value

        str_fmt = f'{fmt[:-1]}s'
        if fmt[-1] == 'l':  # list
            delim = '\n' if '#' in (outer_mobj.group('conversion') or '') else ', '
            value, fmt = delim.join(variadic(value)), str_fmt
        elif fmt[-1] == 'j':  # json
            value, fmt = json.dumps(value, default=_dumpjson_default), str_fmt
        elif fmt[-1] == 'q':  # quoted
            value, fmt = compat_shlex_quote(str(value)), str_fmt
        elif fmt[-1] == 'B':  # bytes
            value = f'%{str_fmt}'.encode('utf-8') % str(value).encode('utf-8')
            value, fmt = value.decode('utf-8', 'ignore'), 's'
        elif fmt[-1] == 'U':  # unicode normalized
            opts = outer_mobj.group('conversion') or ''
            value, fmt = unicodedata.normalize(
                # "+" = compatibility equivalence, "#" = NFD
                'NF%s%s' % ('K' if '+' in opts else '', 'D' if '#' in opts else 'C'),
                value), str_fmt
        elif fmt[-1] == 'c':
            if value:
                value = str(value)[0]
            else:
                fmt = str_fmt
        elif fmt[-1] not in 'rs':  # numeric
            value = float_or_none(value)
            if value is None:
                value, fmt = default, 's'

        if sanitize:
            if fmt[-1] == 'r':
                # If value is an object, sanitize might convert it to a string
                # So we convert it to repr first
                value, fmt = repr(value), str_fmt
            if fmt[-1] in 'csr':
                value = sanitize(initial_field, value)

        # Mangle the key ('%' -> '%\0') so it survives the final % substitution
        key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format'))
        TMPL_DICT[key] = value
        return '{prefix}%({key}){fmt}'.format(key=key, fmt=fmt, prefix=outer_mobj.group('prefix'))

    return EXTERNAL_FORMAT_RE.sub(create_key, outtmpl), TMPL_DICT
1075
def _prepare_filename(self, info_dict, tmpl_type='default'):
    """Expand the output template of the given type for *info_dict* and
    return the resulting filename, or None on a template error."""
    try:
        def sanitize(key, value):
            # id-like fields get the stricter filename sanitization
            return sanitize_filename(
                compat_str(value),
                restricted=self.params.get('restrictfilenames'),
                is_id=(key == 'id' or key.endswith('_id')))

        outtmpl = self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default'])
        outtmpl, template_dict = self.prepare_outtmpl(outtmpl, info_dict, sanitize)
        filename = self.escape_outtmpl(self._outtmpl_expandpath(outtmpl)) % template_dict

        force_ext = OUTTMPL_TYPES.get(tmpl_type)
        if filename and force_ext is not None:
            filename = replace_extension(filename, force_ext, info_dict.get('ext'))

        # https://github.com/blackjack4494/youtube-dlc/issues/85
        trim_file_name = self.params.get('trim_file_name', False)
        if trim_file_name:
            # Keep the (sub)extension(s); trim only the name part
            fn_groups = filename.rsplit('.')
            ext = fn_groups[-1]
            sub_ext = fn_groups[-2] if len(fn_groups) > 2 else ''
            filename = '.'.join(filter(None, [fn_groups[0][:trim_file_name], sub_ext, ext]))

        return filename
    except ValueError as err:
        self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
        return None
1105
def prepare_filename(self, info_dict, dir_type='', warn=False):
    """Generate the output filename.

    dir_type selects which output template / path type to use ('' means
    the default template). When *warn* is set, one-time warnings are
    emitted for cases where --paths is ignored (stdout output or an
    absolute path in the template). Returns '' when the type-specific
    template evaluated to nothing, or the filename joined with the
    configured output paths.
    """
    filename = self._prepare_filename(info_dict, dir_type or 'default')
    if not filename and dir_type not in ('', 'temp'):
        return ''

    if warn:
        if not self.params.get('paths'):
            pass
        elif filename == '-':
            # Fix: message previously read 'when an outputting to stdout'
            self.report_warning('--paths is ignored when outputting to stdout', only_once=True)
        elif os.path.isabs(filename):
            self.report_warning('--paths is ignored since an absolute path is given in output template', only_once=True)
    if filename == '-' or not filename:
        return filename

    return self.get_output_path(dir_type, filename)
1124
def _match_entry(self, info_dict, incomplete=False, silent=False):
    """ Returns None if the file should be downloaded """

    video_title = info_dict.get('title', info_dict.get('id', 'video'))

    def check_filter():
        # Returns a human-readable skip reason, or None if no filter rejects
        if 'title' in info_dict:
            # This can happen when we're just evaluating the playlist
            title = info_dict['title']
            matchtitle = self.params.get('matchtitle', False)
            if matchtitle:
                if not re.search(matchtitle, title, re.IGNORECASE):
                    return '"' + title + '" title did not match pattern "' + matchtitle + '"'
            rejecttitle = self.params.get('rejecttitle', False)
            if rejecttitle:
                if re.search(rejecttitle, title, re.IGNORECASE):
                    return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
        date = info_dict.get('upload_date')
        if date is not None:
            dateRange = self.params.get('daterange', DateRange())
            if date not in dateRange:
                return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
        view_count = info_dict.get('view_count')
        if view_count is not None:
            min_views = self.params.get('min_views')
            if min_views is not None and view_count < min_views:
                return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
            max_views = self.params.get('max_views')
            if max_views is not None and view_count > max_views:
                return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
        if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
            return 'Skipping "%s" because it is age restricted' % video_title

        match_filter = self.params.get('match_filter')
        if match_filter is not None:
            try:
                # New-style filters accept the 'incomplete' keyword
                ret = match_filter(info_dict, incomplete=incomplete)
            except TypeError:
                # For backward compatibility
                ret = None if incomplete else match_filter(info_dict)
            if ret is not None:
                return ret
        return None

    # Archive hits and filter rejections can each trigger their own
    # "stop the whole download run" exception, controlled by separate options
    if self.in_download_archive(info_dict):
        reason = '%s has already been recorded in the archive' % video_title
        break_opt, break_err = 'break_on_existing', ExistingVideoReached
    else:
        reason = check_filter()
        break_opt, break_err = 'break_on_reject', RejectedVideoReached
    if reason is not None:
        if not silent:
            self.to_screen('[download] ' + reason)
        if self.params.get(break_opt, False):
            raise break_err()
    return reason
1181
1182 @staticmethod
1183 def add_extra_info(info_dict, extra_info):
1184 '''Set the keys from extra_info in info dict if they are missing'''
1185 for key, value in extra_info.items():
1186 info_dict.setdefault(key, value)
1187
def extract_info(self, url, download=True, ie_key=None, extra_info=None,
                 process=True, force_generic_extractor=False):
    """
    Return a list with a dictionary for each video extracted.

    Arguments:
    url -- URL to extract

    Keyword arguments:
    download -- whether to download videos during extraction
    ie_key -- extractor key hint
    extra_info -- dictionary containing the extra values to add to each result
    process -- whether to resolve all unresolved references (URLs, playlist items),
               must be True for download to work.
    force_generic_extractor -- force using the generic extractor
    """

    if extra_info is None:
        extra_info = {}

    if not ie_key and force_generic_extractor:
        ie_key = 'Generic'

    if ie_key:
        # A hint was given: consider only that extractor
        ies = {ie_key: self._get_info_extractor_class(ie_key)}
    else:
        ies = self._ies

    # First suitable extractor wins (return); a temp-id archive hit breaks
    # out early; falling off the end of the loop reaches the for/else below
    for ie_key, ie in ies.items():
        if not ie.suitable(url):
            continue

        if not ie.working():
            self.report_warning('The program functionality for this site has been marked as broken, '
                                'and will probably not work.')

        # Cheap pre-check: if the id can be derived from the URL alone and is
        # already archived, skip without a network round-trip
        temp_id = ie.get_temp_id(url)
        if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
            self.to_screen("[%s] %s: has already been recorded in archive" % (
                ie_key, temp_id))
            break
        return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process)
    else:
        # for/else: runs only when no extractor was suitable (break skips it)
        self.report_error('no suitable InfoExtractor for URL %s' % url)
1232
def __handle_extraction_exceptions(func):
    # Decorator for extraction entry points: turns the known extraction
    # failures into error reports (honoring 'ignoreerrors') instead of
    # letting them propagate, and retries on throttling.
    def wrapper(self, *args, **kwargs):
        try:
            return func(self, *args, **kwargs)
        except GeoRestrictedError as e:
            msg = e.msg
            if e.countries:
                msg += '\nThis video is available in %s.' % ', '.join(
                    map(ISO3166Utils.short2full, e.countries))
            msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
            self.report_error(msg)
        except ExtractorError as e:  # An error we somewhat expected
            self.report_error(compat_str(e), e.format_traceback())
        except ThrottledDownload:
            self.to_stderr('\r')
            self.report_warning('The download speed is below throttle limit. Re-extracting data')
            # Re-enter the wrapper to redo the whole extraction
            # NOTE(review): there is no retry cap here — verify throttling
            # cannot loop indefinitely
            return wrapper(self, *args, **kwargs)
        except (MaxDownloadsReached, ExistingVideoReached, RejectedVideoReached, LazyList.IndexError):
            # These steer the overall download loop; callers must see them
            raise
        except Exception as e:
            if self.params.get('ignoreerrors'):
                self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
            else:
                raise
    return wrapper
1259
@__handle_extraction_exceptions
def __extract_info(self, url, ie, download, extra_info, process):
    """Run a single extractor on *url* and optionally process its result."""
    ie_result = ie.extract(url)
    if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
        return
    if isinstance(ie_result, list):
        # Backwards compatibility: old IE result format
        ie_result = {
            '_type': 'compat_list',
            'entries': ie_result,
        }
    if extra_info.get('original_url'):
        ie_result.setdefault('original_url', extra_info['original_url'])
    self.add_default_extra_info(ie_result, ie, url)
    if not process:
        return ie_result
    return self.process_ie_result(ie_result, download, extra_info)
1278
def add_default_extra_info(self, ie_result, ie, url):
    """Fill missing URL- and extractor-derived keys into ie_result."""
    defaults = {}
    if url is not None:
        defaults.update({
            'webpage_url': url,
            'original_url': url,
            'webpage_url_basename': url_basename(url),
        })
    if ie is not None:
        defaults.update({
            'extractor': ie.IE_NAME,
            'extractor_key': ie.ie_key(),
        })
    if defaults:
        self.add_extra_info(ie_result, defaults)
1291
def process_ie_result(self, ie_result, download=True, extra_info=None):
    """
    Take the result of the ie(may be modified) and resolve all unresolved
    references (URLs, playlist items).

    It will also download the videos if 'download'.
    Returns the resolved ie_result.
    """
    if extra_info is None:
        extra_info = {}
    result_type = ie_result.get('_type', 'video')

    if result_type in ('url', 'url_transparent'):
        ie_result['url'] = sanitize_url(ie_result['url'])
        if ie_result.get('original_url'):
            extra_info.setdefault('original_url', ie_result['original_url'])

        # Flat extraction: don't follow the URL; just print/record it
        extract_flat = self.params.get('extract_flat', False)
        if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
                or extract_flat is True):
            info_copy = ie_result.copy()
            ie = try_get(ie_result.get('ie_key'), self.get_info_extractor)
            if ie and not ie_result.get('id'):
                # Best-effort id derived from the URL alone
                info_copy['id'] = ie.get_temp_id(ie_result['url'])
            self.add_default_extra_info(info_copy, ie, ie_result['url'])
            self.add_extra_info(info_copy, extra_info)
            self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True)
            if self.params.get('force_write_download_archive', False):
                self.record_download_archive(info_copy)
            return ie_result

    if result_type == 'video':
        self.add_extra_info(ie_result, extra_info)
        ie_result = self.process_video_result(ie_result, download=download)
        additional_urls = (ie_result or {}).get('additional_urls')
        if additional_urls:
            # TODO: Improve MetadataParserPP to allow setting a list
            if isinstance(additional_urls, compat_str):
                additional_urls = [additional_urls]
            self.to_screen(
                '[info] %s: %d additional URL(s) requested' % (ie_result['id'], len(additional_urls)))
            self.write_debug('Additional URLs: "%s"' % '", "'.join(additional_urls))
            ie_result['additional_entries'] = [
                self.extract_info(
                    url, download, extra_info,
                    force_generic_extractor=self.params.get('force_generic_extractor'))
                for url in additional_urls
            ]
        return ie_result
    elif result_type == 'url':
        # We have to add extra_info to the results because it may be
        # contained in a playlist
        return self.extract_info(
            ie_result['url'], download,
            ie_key=ie_result.get('ie_key'),
            extra_info=extra_info)
    elif result_type == 'url_transparent':
        # Use the information from the embedding page
        info = self.extract_info(
            ie_result['url'], ie_key=ie_result.get('ie_key'),
            extra_info=extra_info, download=False, process=False)

        # extract_info may return None when ignoreerrors is enabled and
        # extraction failed with an error, don't crash and return early
        # in this case
        if not info:
            return info

        # Non-None outer values override the embedded result, except for
        # identity/type keys which must come from the inner extraction
        force_properties = dict(
            (k, v) for k, v in ie_result.items() if v is not None)
        for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
            if f in force_properties:
                del force_properties[f]
        new_result = info.copy()
        new_result.update(force_properties)

        # Extracted info may not be a video result (i.e.
        # info.get('_type', 'video') != video) but rather an url or
        # url_transparent. In such cases outer metadata (from ie_result)
        # should be propagated to inner one (info). For this to happen
        # _type of info should be overridden with url_transparent. This
        # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
        if new_result.get('_type') == 'url':
            new_result['_type'] = 'url_transparent'

        return self.process_ie_result(
            new_result, download=download, extra_info=extra_info)
    elif result_type in ('playlist', 'multi_video'):
        # Protect from infinite recursion due to recursively nested playlists
        # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
        webpage_url = ie_result['webpage_url']
        if webpage_url in self._playlist_urls:
            # NOTE(review): '%' binds tighter than 'or', so the fallback to
            # ie_result.get('id') only applies to the already-formatted
            # string (which is always truthy) — a missing title prints
            # 'None'. Parenthesizing the get()s looks intended; confirm.
            self.to_screen(
                '[download] Skipping already downloaded playlist: %s'
                % ie_result.get('title') or ie_result.get('id'))
            return

        self._playlist_level += 1
        self._playlist_urls.add(webpage_url)
        self._sanitize_thumbnails(ie_result)
        try:
            return self.__process_playlist(ie_result, download)
        finally:
            self._playlist_level -= 1
            if not self._playlist_level:
                # Outermost playlist finished: reset the recursion guard
                self._playlist_urls.clear()
    elif result_type == 'compat_list':
        self.report_warning(
            'Extractor %s returned a compat_list result. '
            'It needs to be updated.' % ie_result.get('extractor'))

        def _fixup(r):
            # Propagate the playlist-level metadata onto each entry
            self.add_extra_info(r, {
                'extractor': ie_result['extractor'],
                'webpage_url': ie_result['webpage_url'],
                'webpage_url_basename': url_basename(ie_result['webpage_url']),
                'extractor_key': ie_result['extractor_key'],
            })
            return r
        ie_result['entries'] = [
            self.process_ie_result(_fixup(r), download, extra_info)
            for r in ie_result['entries']
        ]
        return ie_result
    else:
        raise Exception('Invalid result type: %s' % result_type)
1418
def _ensure_dir_exists(self, path):
    # Delegates to utils.make_dir, passing report_error as the failure callback
    return make_dir(path, self.report_error)
1421
def __process_playlist(self, ie_result, download):
    """Resolve (and optionally download) every entry of a playlist result,
    honoring playliststart/end, playlist_items, reverse/random ordering and
    the playlist-level info-json/description/thumbnail writing."""
    # We process each entry in the playlist
    playlist = ie_result.get('title') or ie_result.get('id')
    self.to_screen('[download] Downloading playlist: %s' % playlist)

    if 'entries' not in ie_result:
        raise EntryNotInPlaylist()
    incomplete_entries = bool(ie_result.get('requested_entries'))
    if incomplete_entries:
        # 'entries' holds only the requested indexes; pad with None so that
        # positional indexing below still works
        def fill_missing_entries(entries, indexes):
            # NOTE(review): max(*indexes) raises TypeError when exactly one
            # index was requested (max(5) is invalid) — max(indexes) looks
            # intended; confirm against callers
            ret = [None] * max(*indexes)
            for i, entry in zip(indexes, entries):
                ret[i - 1] = entry
            return ret
        ie_result['entries'] = fill_missing_entries(ie_result['entries'], ie_result['requested_entries'])

    playlist_results = []

    playliststart = self.params.get('playliststart', 1)
    playlistend = self.params.get('playlistend')
    # For backwards compatibility, interpret -1 as whole list
    if playlistend == -1:
        playlistend = None

    playlistitems_str = self.params.get('playlist_items')
    playlistitems = None
    if playlistitems_str is not None:
        # Parse '1-3,7,10-12' style specifications into an ordered index set
        def iter_playlistitems(format):
            for string_segment in format.split(','):
                if '-' in string_segment:
                    start, end = string_segment.split('-')
                    for item in range(int(start), int(end) + 1):
                        yield int(item)
                else:
                    yield int(string_segment)
        playlistitems = orderedSet(iter_playlistitems(playlistitems_str))

    ie_entries = ie_result['entries']
    msg = (
        'Downloading %d videos' if not isinstance(ie_entries, list)
        else 'Collected %d videos; downloading %%d of them' % len(ie_entries))

    if isinstance(ie_entries, list):
        def get_entry(i):
            return ie_entries[i - 1]
    else:
        if not isinstance(ie_entries, PagedList):
            ie_entries = LazyList(ie_entries)

        # Lazy sources may raise during item extraction; route that through
        # the standard extraction exception handling
        def get_entry(i):
            return YoutubeDL.__handle_extraction_exceptions(
                lambda self, i: ie_entries[i - 1]
            )(self, i)

    entries = []
    items = playlistitems if playlistitems is not None else itertools.count(playliststart)
    for i in items:
        if i == 0:
            continue
        if playlistitems is None and playlistend is not None and playlistend < i:
            break
        entry = None
        try:
            entry = get_entry(i)
            if entry is None:
                raise EntryNotInPlaylist()
        except (IndexError, EntryNotInPlaylist):
            if incomplete_entries:
                raise EntryNotInPlaylist()
            elif not playlistitems:
                break
        entries.append(entry)
        try:
            # Early (silent) filter pass so break_on_existing/reject can stop
            # the collection phase as soon as possible
            if entry is not None:
                self._match_entry(entry, incomplete=True, silent=True)
        except (ExistingVideoReached, RejectedVideoReached):
            break
    ie_result['entries'] = entries

    # Save playlist_index before re-ordering
    entries = [
        ((playlistitems[i - 1] if playlistitems else i + playliststart - 1), entry)
        for i, entry in enumerate(entries, 1)
        if entry is not None]
    n_entries = len(entries)

    if not playlistitems and (playliststart or playlistend):
        playlistitems = list(range(playliststart, playliststart + n_entries))
    ie_result['requested_entries'] = playlistitems

    if self.params.get('allow_playlist_files', True):
        # 'playlist_index': 0 marks "the playlist itself" for templates
        ie_copy = {
            'playlist': playlist,
            'playlist_id': ie_result.get('id'),
            'playlist_title': ie_result.get('title'),
            'playlist_uploader': ie_result.get('uploader'),
            'playlist_uploader_id': ie_result.get('uploader_id'),
            'playlist_index': 0,
        }
        ie_copy.update(dict(ie_result))

        # A None return signals a write failure; abort the playlist
        if self._write_info_json('playlist', ie_result,
                                 self.prepare_filename(ie_copy, 'pl_infojson')) is None:
            return
        if self._write_description('playlist', ie_result,
                                   self.prepare_filename(ie_copy, 'pl_description')) is None:
            return
        # TODO: This should be passed to ThumbnailsConvertor if necessary
        self._write_thumbnails('playlist', ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail'))

    if self.params.get('playlistreverse', False):
        entries = entries[::-1]
    if self.params.get('playlistrandom', False):
        random.shuffle(entries)

    x_forwarded_for = ie_result.get('__x_forwarded_for_ip')

    self.to_screen('[%s] playlist %s: %s' % (ie_result['extractor'], playlist, msg % n_entries))
    failures = 0
    max_failures = self.params.get('skip_playlist_after_errors') or float('inf')
    for i, entry_tuple in enumerate(entries, 1):
        playlist_index, entry = entry_tuple
        if 'playlist-index' in self.params.get('compat_opts', []):
            # Old behaviour: index relative to the requested slice
            playlist_index = playlistitems[i - 1] if playlistitems else i + playliststart - 1
        self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
        # This __x_forwarded_for_ip thing is a bit ugly but requires
        # minimal changes
        if x_forwarded_for:
            entry['__x_forwarded_for_ip'] = x_forwarded_for
        extra = {
            'n_entries': n_entries,
            '_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries),
            'playlist_index': playlist_index,
            'playlist_autonumber': i,
            'playlist': playlist,
            'playlist_id': ie_result.get('id'),
            'playlist_title': ie_result.get('title'),
            'playlist_uploader': ie_result.get('uploader'),
            'playlist_uploader_id': ie_result.get('uploader_id'),
            'extractor': ie_result['extractor'],
            'webpage_url': ie_result['webpage_url'],
            'webpage_url_basename': url_basename(ie_result['webpage_url']),
            'extractor_key': ie_result['extractor_key'],
        }

        if self._match_entry(entry, incomplete=True) is not None:
            continue

        entry_result = self.__process_iterable_entry(entry, download, extra)
        if not entry_result:
            failures += 1
            if failures >= max_failures:
                self.report_error(
                    'Skipping the remaining entries in playlist "%s" since %d items failed extraction' % (playlist, failures))
                break
        # TODO: skip failed (empty) entries?
        playlist_results.append(entry_result)
    ie_result['entries'] = playlist_results
    self.to_screen('[download] Finished downloading playlist: %s' % playlist)
    return ie_result
1582
@__handle_extraction_exceptions
def __process_iterable_entry(self, entry, download, extra_info):
    # Thin wrapper so each playlist entry gets the standard
    # extraction-exception handling independently of its siblings
    return self.process_ie_result(
        entry, download=download, extra_info=extra_info)
1587
1588 def _build_format_filter(self, filter_spec):
1589 " Returns a function to filter the formats according to the filter_spec "
1590
1591 OPERATORS = {
1592 '<': operator.lt,
1593 '<=': operator.le,
1594 '>': operator.gt,
1595 '>=': operator.ge,
1596 '=': operator.eq,
1597 '!=': operator.ne,
1598 }
1599 operator_rex = re.compile(r'''(?x)\s*
1600 (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)\s*
1601 (?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1602 (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s*
1603 ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1604 m = operator_rex.fullmatch(filter_spec)
1605 if m:
1606 try:
1607 comparison_value = int(m.group('value'))
1608 except ValueError:
1609 comparison_value = parse_filesize(m.group('value'))
1610 if comparison_value is None:
1611 comparison_value = parse_filesize(m.group('value') + 'B')
1612 if comparison_value is None:
1613 raise ValueError(
1614 'Invalid value %r in format specification %r' % (
1615 m.group('value'), filter_spec))
1616 op = OPERATORS[m.group('op')]
1617
1618 if not m:
1619 STR_OPERATORS = {
1620 '=': operator.eq,
1621 '^=': lambda attr, value: attr.startswith(value),
1622 '$=': lambda attr, value: attr.endswith(value),
1623 '*=': lambda attr, value: value in attr,
1624 }
1625 str_operator_rex = re.compile(r'''(?x)\s*
1626 (?P<key>[a-zA-Z0-9._-]+)\s*
1627 (?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1628 (?P<value>[a-zA-Z0-9._-]+)\s*
1629 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1630 m = str_operator_rex.fullmatch(filter_spec)
1631 if m:
1632 comparison_value = m.group('value')
1633 str_op = STR_OPERATORS[m.group('op')]
1634 if m.group('negation'):
1635 op = lambda attr, value: not str_op(attr, value)
1636 else:
1637 op = str_op
1638
1639 if not m:
1640 raise SyntaxError('Invalid filter specification %r' % filter_spec)
1641
1642 def _filter(f):
1643 actual_value = f.get(m.group('key'))
1644 if actual_value is None:
1645 return m.group('none_inclusive')
1646 return op(actual_value, comparison_value)
1647 return _filter
1648
1649 def _default_format_spec(self, info_dict, download=True):
1650
1651 def can_merge():
1652 merger = FFmpegMergerPP(self)
1653 return merger.available and merger.can_merge()
1654
1655 prefer_best = (
1656 not self.params.get('simulate')
1657 and download
1658 and (
1659 not can_merge()
1660 or info_dict.get('is_live', False)
1661 or self.outtmpl_dict['default'] == '-'))
1662 compat = (
1663 prefer_best
1664 or self.params.get('allow_multiple_audio_streams', False)
1665 or 'format-spec' in self.params.get('compat_opts', []))
1666
1667 return (
1668 'best/bestvideo+bestaudio' if prefer_best
1669 else 'bestvideo*+bestaudio/best' if not compat
1670 else 'bestvideo+bestaudio/best')
1671
    def build_format_selector(self, format_spec):
        """
        Build a selector function for *format_spec* (e.g. 'bv*+ba/b').

        The returned function takes a context dict
        ({'formats': [...], 'incomplete_formats': bool} - see
        process_video_result) and yields the matching format dicts.
        Raises SyntaxError on an invalid spec.
        """
        def syntax_error(note, start):
            message = (
                'Invalid format specification: '
                '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
            return SyntaxError(message)

        # Selector node types for the parse tree
        PICKFIRST = 'PICKFIRST'
        MERGE = 'MERGE'
        SINGLE = 'SINGLE'
        GROUP = 'GROUP'
        FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])

        allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False),
                                  'video': self.params.get('allow_multiple_video_streams', False)}

        check_formats = self.params.get('check_formats')

        def _parse_filter(tokens):
            # Consume tokens up to the closing ']' and return the raw
            # filter expression as a single string
            filter_parts = []
            for type, string, start, _, _ in tokens:
                if type == tokenize.OP and string == ']':
                    return ''.join(filter_parts)
                else:
                    filter_parts.append(string)

        def _remove_unused_ops(tokens):
            # Remove operators that we don't use and join them with the surrounding strings
            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
            ALLOWED_OPS = ('/', '+', ',', '(', ')')
            last_string, last_start, last_end, last_line = None, None, None, None
            for type, string, start, end, line in tokens:
                if type == tokenize.OP and string == '[':
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                    # everything inside brackets will be handled by _parse_filter
                    for type, string, start, end, line in tokens:
                        yield type, string, start, end, line
                        if type == tokenize.OP and string == ']':
                            break
                elif type == tokenize.OP and string in ALLOWED_OPS:
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
                    if not last_string:
                        last_string = string
                        last_start = start
                        last_end = end
                    else:
                        last_string += string
            if last_string:
                yield tokenize.NAME, last_string, last_start, last_end, last_line

        def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
            # Recursive-descent parser: returns the list of ','-separated
            # FormatSelector nodes. The inside_* flags tell the recursion
            # which operators terminate the current sub-expression.
            selectors = []
            current_selector = None
            for type, string, start, _, _ in tokens:
                # ENCODING is only defined in python 3.x
                if type == getattr(tokenize, 'ENCODING', None):
                    continue
                elif type in [tokenize.NAME, tokenize.NUMBER]:
                    current_selector = FormatSelector(SINGLE, string, [])
                elif type == tokenize.OP:
                    if string == ')':
                        if not inside_group:
                            # ')' will be handled by the parentheses group
                            tokens.restore_last_token()
                        break
                    elif inside_merge and string in ['/', ',']:
                        tokens.restore_last_token()
                        break
                    elif inside_choice and string == ',':
                        tokens.restore_last_token()
                        break
                    elif string == ',':
                        if not current_selector:
                            raise syntax_error('"," must follow a format selector', start)
                        selectors.append(current_selector)
                        current_selector = None
                    elif string == '/':
                        if not current_selector:
                            raise syntax_error('"/" must follow a format selector', start)
                        first_choice = current_selector
                        second_choice = _parse_format_selection(tokens, inside_choice=True)
                        current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
                    elif string == '[':
                        if not current_selector:
                            current_selector = FormatSelector(SINGLE, 'best', [])
                        format_filter = _parse_filter(tokens)
                        current_selector.filters.append(format_filter)
                    elif string == '(':
                        if current_selector:
                            raise syntax_error('Unexpected "("', start)
                        group = _parse_format_selection(tokens, inside_group=True)
                        current_selector = FormatSelector(GROUP, group, [])
                    elif string == '+':
                        if not current_selector:
                            raise syntax_error('Unexpected "+"', start)
                        selector_1 = current_selector
                        selector_2 = _parse_format_selection(tokens, inside_merge=True)
                        if not selector_2:
                            raise syntax_error('Expected a selector', start)
                        current_selector = FormatSelector(MERGE, (selector_1, selector_2), [])
                    else:
                        raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
                elif type == tokenize.ENDMARKER:
                    break
            if current_selector:
                selectors.append(current_selector)
            return selectors

        def _merge(formats_pair):
            # Combine two selected formats (or previously merged groups) into
            # a single synthetic format dict with 'requested_formats' set
            format_1, format_2 = formats_pair

            formats_info = []
            formats_info.extend(format_1.get('requested_formats', (format_1,)))
            formats_info.extend(format_2.get('requested_formats', (format_2,)))

            if not allow_multiple_streams['video'] or not allow_multiple_streams['audio']:
                # Drop surplus streams of a kind that does not allow multiples.
                # NOTE(review): formats_info is mutated (pop) while being
                # enumerated, so the element following a removed one is
                # skipped in this pass - confirm this is intended
                get_no_more = {'video': False, 'audio': False}
                for (i, fmt_info) in enumerate(formats_info):
                    if fmt_info.get('acodec') == fmt_info.get('vcodec') == 'none':
                        formats_info.pop(i)
                        continue
                    for aud_vid in ['audio', 'video']:
                        if not allow_multiple_streams[aud_vid] and fmt_info.get(aud_vid[0] + 'codec') != 'none':
                            if get_no_more[aud_vid]:
                                formats_info.pop(i)
                                break
                            get_no_more[aud_vid] = True

            if len(formats_info) == 1:
                return formats_info[0]

            video_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('vcodec') != 'none']
            audio_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('acodec') != 'none']

            the_only_video = video_fmts[0] if len(video_fmts) == 1 else None
            the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None

            # Output container: explicit option, else the single video's (or
            # single audio's) extension, else mkv as the generic container
            output_ext = self.params.get('merge_output_format')
            if not output_ext:
                if the_only_video:
                    output_ext = the_only_video['ext']
                elif the_only_audio and not video_fmts:
                    output_ext = the_only_audio['ext']
                else:
                    output_ext = 'mkv'

            new_dict = {
                'requested_formats': formats_info,
                'format': '+'.join(fmt_info.get('format') for fmt_info in formats_info),
                'format_id': '+'.join(fmt_info.get('format_id') for fmt_info in formats_info),
                'ext': output_ext,
            }

            if the_only_video:
                new_dict.update({
                    'width': the_only_video.get('width'),
                    'height': the_only_video.get('height'),
                    'resolution': the_only_video.get('resolution') or self.format_resolution(the_only_video),
                    'fps': the_only_video.get('fps'),
                    'vcodec': the_only_video.get('vcodec'),
                    'vbr': the_only_video.get('vbr'),
                    'stretched_ratio': the_only_video.get('stretched_ratio'),
                })

            if the_only_audio:
                new_dict.update({
                    'acodec': the_only_audio.get('acodec'),
                    'abr': the_only_audio.get('abr'),
                })

            return new_dict

        def _check_formats(formats):
            # When --check-formats is enabled, probe each format with a small
            # test download and yield only the ones that succeed
            if not check_formats:
                yield from formats
                return
            for f in formats:
                self.to_screen('[info] Testing format %s' % f['format_id'])
                temp_file = tempfile.NamedTemporaryFile(
                    suffix='.tmp', delete=False,
                    dir=self.get_output_path('temp') or None)
                temp_file.close()
                try:
                    success, _ = self.dl(temp_file.name, f, test=True)
                except (DownloadError, IOError, OSError, ValueError) + network_exceptions:
                    success = False
                finally:
                    if os.path.exists(temp_file.name):
                        try:
                            os.remove(temp_file.name)
                        except OSError:
                            self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
                if success:
                    yield f
                else:
                    self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])

        def _build_selector_function(selector):
            # Compile a parse-tree node into a function(ctx) -> formats
            if isinstance(selector, list):  # ,
                fs = [_build_selector_function(s) for s in selector]

                def selector_function(ctx):
                    for f in fs:
                        yield from f(ctx)
                return selector_function

            elif selector.type == GROUP:  # ()
                selector_function = _build_selector_function(selector.selector)

            elif selector.type == PICKFIRST:  # /
                fs = [_build_selector_function(s) for s in selector.selector]

                def selector_function(ctx):
                    for f in fs:
                        picked_formats = list(f(ctx))
                        if picked_formats:
                            return picked_formats
                    return []

            elif selector.type == MERGE:  # +
                selector_1, selector_2 = map(_build_selector_function, selector.selector)

                def selector_function(ctx):
                    # deepcopy so each side of the merge sees an unmodified ctx
                    for pair in itertools.product(
                            selector_1(copy.deepcopy(ctx)), selector_2(copy.deepcopy(ctx))):
                        yield _merge(pair)

            elif selector.type == SINGLE:  # atom
                format_spec = selector.selector or 'best'

                # TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector
                if format_spec == 'all':
                    def selector_function(ctx):
                        yield from _check_formats(ctx['formats'])
                elif format_spec == 'mergeall':
                    def selector_function(ctx):
                        formats = list(_check_formats(ctx['formats']))
                        if not formats:
                            return
                        merged_format = formats[-1]
                        for f in formats[-2::-1]:
                            merged_format = _merge((merged_format, f))
                        yield merged_format

                else:
                    # best/worst selectors like b, w, bv*, ba.2, plus plain
                    # extension or format_id atoms
                    format_fallback, format_reverse, format_idx = False, True, 1
                    mobj = re.match(
                        r'(?P<bw>best|worst|b|w)(?P<type>video|audio|v|a)?(?P<mod>\*)?(?:\.(?P<n>[1-9]\d*))?$',
                        format_spec)
                    if mobj is not None:
                        format_idx = int_or_none(mobj.group('n'), default=1)
                        format_reverse = mobj.group('bw')[0] == 'b'
                        format_type = (mobj.group('type') or [None])[0]
                        not_format_type = {'v': 'a', 'a': 'v'}.get(format_type)
                        format_modified = mobj.group('mod') is not None

                        format_fallback = not format_type and not format_modified  # for b, w
                        _filter_f = (
                            (lambda f: f.get('%scodec' % format_type) != 'none')
                            if format_type and format_modified  # bv*, ba*, wv*, wa*
                            else (lambda f: f.get('%scodec' % not_format_type) == 'none')
                            if format_type  # bv, ba, wv, wa
                            else (lambda f: f.get('vcodec') != 'none' and f.get('acodec') != 'none')
                            if not format_modified  # b, w
                            else lambda f: True)  # b*, w*
                        filter_f = lambda f: _filter_f(f) and (
                            f.get('vcodec') != 'none' or f.get('acodec') != 'none')
                    else:
                        filter_f = ((lambda f: f.get('ext') == format_spec)
                                    if format_spec in ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']  # extension
                                    else (lambda f: f.get('format_id') == format_spec))  # id

                    def selector_function(ctx):
                        formats = list(ctx['formats'])
                        matches = list(filter(filter_f, formats)) if filter_f is not None else formats
                        if format_fallback and ctx['incomplete_formats'] and not matches:
                            # for extractors with incomplete formats (audio only (soundcloud)
                            # or video only (imgur)) best/worst will fallback to
                            # best/worst {video,audio}-only format
                            matches = formats
                        matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1]))
                        try:
                            yield matches[format_idx - 1]
                        except IndexError:
                            return

            filters = [self._build_format_filter(f) for f in selector.filters]

            def final_selector(ctx):
                # Apply the node's [filters] before running its selector
                ctx_copy = copy.deepcopy(ctx)
                for _filter in filters:
                    ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
                return selector_function(ctx_copy)
            return final_selector

        stream = io.BytesIO(format_spec.encode('utf-8'))
        try:
            tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
        except tokenize.TokenError:
            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

        class TokenIterator(object):
            # Token stream with single-token pushback, needed by the
            # recursive parser (restore_last_token)
            def __init__(self, tokens):
                self.tokens = tokens
                self.counter = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.counter >= len(self.tokens):
                    raise StopIteration()
                value = self.tokens[self.counter]
                self.counter += 1
                return value

            next = __next__

            def restore_last_token(self):
                self.counter -= 1

        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
        return _build_selector_function(parsed_selector)
2002
2003 def _calc_headers(self, info_dict):
2004 res = std_headers.copy()
2005
2006 add_headers = info_dict.get('http_headers')
2007 if add_headers:
2008 res.update(add_headers)
2009
2010 cookies = self._calc_cookies(info_dict)
2011 if cookies:
2012 res['Cookie'] = cookies
2013
2014 if 'X-Forwarded-For' not in res:
2015 x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
2016 if x_forwarded_for_ip:
2017 res['X-Forwarded-For'] = x_forwarded_for_ip
2018
2019 return res
2020
2021 def _calc_cookies(self, info_dict):
2022 pr = sanitized_Request(info_dict['url'])
2023 self.cookiejar.add_cookie_header(pr)
2024 return pr.get_header('Cookie')
2025
    def _sanitize_thumbnails(self, info_dict):
        """Normalize info_dict['thumbnails']: synthesize the list from
        'thumbnail' if absent, sort it (ascending preference/size, so the
        last entry is the best), fill in missing id/resolution, sanitize
        URLs and lazily drop thumbnails whose URL is unreachable."""
        thumbnails = info_dict.get('thumbnails')
        if thumbnails is None:
            thumbnail = info_dict.get('thumbnail')
            if thumbnail:
                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
        if thumbnails:
            thumbnails.sort(key=lambda t: (
                t.get('preference') if t.get('preference') is not None else -1,
                t.get('width') if t.get('width') is not None else -1,
                t.get('height') if t.get('height') is not None else -1,
                t.get('id') if t.get('id') is not None else '',
                t.get('url')))

            def thumbnail_tester():
                # Returns a predicate that HEAD-checks a thumbnail URL.
                # Without check_formats only entries flagged '_test_url'
                # are probed (quietly, via write_debug)
                if self.params.get('check_formats'):
                    test_all = True
                    to_screen = lambda msg: self.to_screen(f'[info] {msg}')
                else:
                    test_all = False
                    to_screen = self.write_debug

                def test_thumbnail(t):
                    if not test_all and not t.get('_test_url'):
                        return True
                    to_screen('Testing thumbnail %s' % t['id'])
                    try:
                        self.urlopen(HEADRequest(t['url']))
                    except network_exceptions as err:
                        to_screen('Unable to connect to thumbnail %s URL "%s" - %s. Skipping...' % (
                            t['id'], t['url'], error_to_compat_str(err)))
                        return False
                    return True

                return test_thumbnail

            for i, t in enumerate(thumbnails):
                if t.get('id') is None:
                    t['id'] = '%d' % i
                if t.get('width') and t.get('height'):
                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
                t['url'] = sanitize_url(t['url'])

            if self.params.get('check_formats') is not False:
                # Filter the reversed list so the best thumbnails are tested
                # first; LazyList defers the network checks until the
                # thumbnails are actually consumed (see LazyList in utils)
                info_dict['thumbnails'] = LazyList(filter(thumbnail_tester(), thumbnails[::-1])).reverse()
            else:
                info_dict['thumbnails'] = thumbnails
2073
    def process_video_result(self, info_dict, download=True):
        """Process a single resolved 'video' result: validate and sanitize
        its fields, select the requested subtitles and formats, and (when
        *download* is true) hand each selected format to process_info.
        Returns the (mutated) info_dict."""
        assert info_dict.get('_type', 'video') == 'video'

        if 'id' not in info_dict:
            raise ExtractorError('Missing "id" field in extractor result')
        if 'title' not in info_dict:
            raise ExtractorError('Missing "title" field in extractor result',
                                 video_id=info_dict['id'], ie=info_dict['extractor'])

        def report_force_conversion(field, field_not, conversion):
            self.report_warning(
                '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
                % (field, field_not, conversion))

        def sanitize_string_field(info, string_field):
            # Coerce a non-string field to str (warning the user)
            field = info.get(string_field)
            if field is None or isinstance(field, compat_str):
                return
            report_force_conversion(string_field, 'a string', 'string')
            info[string_field] = compat_str(field)

        def sanitize_numeric_fields(info):
            # Coerce all known numeric fields to int (warning the user)
            for numeric_field in self._NUMERIC_FIELDS:
                field = info.get(numeric_field)
                if field is None or isinstance(field, compat_numeric_types):
                    continue
                report_force_conversion(numeric_field, 'numeric', 'int')
                info[numeric_field] = int_or_none(field)

        sanitize_string_field(info_dict, 'id')
        sanitize_numeric_fields(info_dict)

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        self._sanitize_thumbnails(info_dict)

        thumbnail = info_dict.get('thumbnail')
        thumbnails = info_dict.get('thumbnails')
        if thumbnail:
            info_dict['thumbnail'] = sanitize_url(thumbnail)
        elif thumbnails:
            # thumbnails are sorted worst-to-best, so [-1] is the best one
            info_dict['thumbnail'] = thumbnails[-1]['url']

        if info_dict.get('display_id') is None and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        # Derive YYYYMMDD date fields from their timestamp counterparts
        for ts_key, date_key in (
                ('timestamp', 'upload_date'),
                ('release_timestamp', 'release_date'),
        ):
            if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
                # Working around out-of-range timestamp values (e.g. negative ones on Windows,
                # see http://bugs.python.org/issue1646728)
                try:
                    upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
                    info_dict[date_key] = upload_date.strftime('%Y%m%d')
                except (ValueError, OverflowError, OSError):
                    pass

        live_keys = ('is_live', 'was_live')
        live_status = info_dict.get('live_status')
        if live_status is None:
            # NOTE(review): the break is unconditional, so only the first key
            # that is not explicitly False is examined - confirm intended
            for key in live_keys:
                if info_dict.get(key) is False:
                    continue
                if info_dict.get(key):
                    live_status = key
                break
            if all(info_dict.get(key) is False for key in live_keys):
                live_status = 'not_live'
        if live_status:
            # Keep live_status and the boolean flags mutually consistent
            info_dict['live_status'] = live_status
            for key in live_keys:
                if info_dict.get(key) is None:
                    info_dict[key] = (live_status == key)

        # Auto generate title fields corresponding to the *_number fields when missing
        # in order to always have clean titles. This is very common for TV series.
        for field in ('chapter', 'season', 'episode'):
            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])

        # Sanitize subtitle/caption URLs and fill missing extensions
        for cc_kind in ('subtitles', 'automatic_captions'):
            cc = info_dict.get(cc_kind)
            if cc:
                for _, subtitle in cc.items():
                    for subtitle_format in subtitle:
                        if subtitle_format.get('url'):
                            subtitle_format['url'] = sanitize_url(subtitle_format['url'])
                        if subtitle_format.get('ext') is None:
                            subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()

        automatic_captions = info_dict.get('automatic_captions')
        subtitles = info_dict.get('subtitles')

        info_dict['requested_subtitles'] = self.process_subtitles(
            info_dict['id'], subtitles, automatic_captions)

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        info_dict['__has_drm'] = any(f.get('has_drm') for f in formats)
        if not self.params.get('allow_unplayable_formats'):
            formats = [f for f in formats if not f.get('has_drm')]

        if not formats:
            self.raise_no_formats(info_dict)

        def is_wellformed(f):
            url = f.get('url')
            if not url:
                self.report_warning(
                    '"url" field is missing or empty - skipping format, '
                    'there is an error in extractor')
                return False
            if isinstance(url, bytes):
                sanitize_string_field(f, 'url')
            return True

        # Filter out malformed formats for better extraction robustness
        formats = list(filter(is_wellformed, formats))

        formats_dict = {}

        # We check that all the formats have the format and format_id fields
        for i, format in enumerate(formats):
            sanitize_string_field(format, 'format_id')
            sanitize_numeric_fields(format)
            format['url'] = sanitize_url(format['url'])
            if not format.get('format_id'):
                format['format_id'] = compat_str(i)
            else:
                # Sanitize format_id from characters used in format selector expression
                format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
            format_id = format['format_id']
            if format_id not in formats_dict:
                formats_dict[format_id] = []
            formats_dict[format_id].append(format)

        # Make sure all formats have unique format_id
        for format_id, ambiguous_formats in formats_dict.items():
            if len(ambiguous_formats) > 1:
                for i, format in enumerate(ambiguous_formats):
                    format['format_id'] = '%s-%d' % (format_id, i)

        for i, format in enumerate(formats):
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=format_field(format, 'format_note', ' (%s)'),
                )
            # Automatically determine file extension if missing
            if format.get('ext') is None:
                format['ext'] = determine_ext(format['url']).lower()
            # Automatically determine protocol if missing (useful for format
            # selection purposes)
            if format.get('protocol') is None:
                format['protocol'] = determine_protocol(format)
            # Add HTTP headers, so that external programs can use them from the
            # json output
            full_format_info = info_dict.copy()
            full_format_info.update(format)
            format['http_headers'] = self._calc_headers(full_format_info)
        # Remove private housekeeping stuff
        if '__x_forwarded_for_ip' in info_dict:
            del info_dict['__x_forwarded_for_ip']

        # TODO Central sorting goes here

        if not formats or formats[0] is not info_dict:
            # only set the 'formats' fields if the original info_dict list them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats

        info_dict, _ = self.pre_process(info_dict)

        # Listing-only options: print the requested tables and stop early
        if self.params.get('list_thumbnails'):
            self.list_thumbnails(info_dict)
        if self.params.get('listformats'):
            if not info_dict.get('formats') and not info_dict.get('url'):
                self.to_screen('%s has no formats' % info_dict['id'])
            else:
                self.list_formats(info_dict)
        if self.params.get('listsubtitles'):
            if 'automatic_captions' in info_dict:
                self.list_subtitles(
                    info_dict['id'], automatic_captions, 'automatic captions')
            self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
        list_only = self.params.get('simulate') is None and (
            self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles'))
        if list_only:
            # Without this printing, -F --print-json will not work
            self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True)
            return

        format_selector = self.format_selector
        if format_selector is None:
            req_format = self._default_format_spec(info_dict, download=download)
            self.write_debug('Default format spec: %s' % req_format)
            format_selector = self.build_format_selector(req_format)

        # While in format selection we may need to have an access to the original
        # format set in order to calculate some metrics or do some processing.
        # For now we need to be able to guess whether original formats provided
        # by extractor are incomplete or not (i.e. whether extractor provides only
        # video-only or audio-only formats) for proper formats selection for
        # extractors with such incomplete formats (see
        # https://github.com/ytdl-org/youtube-dl/pull/5556).
        # Since formats may be filtered during format selection and may not match
        # the original formats the results may be incorrect. Thus original formats
        # or pre-calculated metrics should be passed to format selection routines
        # as well.
        # We will pass a context object containing all necessary additional data
        # instead of just formats.
        # This fixes incorrect format selection issue (see
        # https://github.com/ytdl-org/youtube-dl/issues/10083).
        incomplete_formats = (
            # All formats are video-only or
            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
            # all formats are audio-only
            or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))

        ctx = {
            'formats': formats,
            'incomplete_formats': incomplete_formats,
        }

        formats_to_download = list(format_selector(ctx))
        if not formats_to_download:
            if not self.params.get('ignore_no_formats_error'):
                raise ExtractorError('Requested format is not available', expected=True,
                                     video_id=info_dict['id'], ie=info_dict['extractor'])
            else:
                self.report_warning('Requested format is not available')
                # Process what we can, even without any available formats.
                self.process_info(dict(info_dict))
        elif download:
            self.to_screen(
                '[info] %s: Downloading %d format(s): %s' % (
                    info_dict['id'], len(formats_to_download),
                    ", ".join([f['format_id'] for f in formats_to_download])))
            for fmt in formats_to_download:
                new_info = dict(info_dict)
                # Save a reference to the original info_dict so that it can be modified in process_info if needed
                new_info['__original_infodict'] = info_dict
                new_info.update(fmt)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        if formats_to_download:
            info_dict.update(formats_to_download[-1])
        return info_dict
2335
2336 def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
2337 """Select the requested subtitles and their format"""
2338 available_subs = {}
2339 if normal_subtitles and self.params.get('writesubtitles'):
2340 available_subs.update(normal_subtitles)
2341 if automatic_captions and self.params.get('writeautomaticsub'):
2342 for lang, cap_info in automatic_captions.items():
2343 if lang not in available_subs:
2344 available_subs[lang] = cap_info
2345
2346 if (not self.params.get('writesubtitles') and not
2347 self.params.get('writeautomaticsub') or not
2348 available_subs):
2349 return None
2350
2351 all_sub_langs = available_subs.keys()
2352 if self.params.get('allsubtitles', False):
2353 requested_langs = all_sub_langs
2354 elif self.params.get('subtitleslangs', False):
2355 # A list is used so that the order of languages will be the same as
2356 # given in subtitleslangs. See https://github.com/yt-dlp/yt-dlp/issues/1041
2357 requested_langs = []
2358 for lang_re in self.params.get('subtitleslangs'):
2359 if lang_re == 'all':
2360 requested_langs.extend(all_sub_langs)
2361 continue
2362 discard = lang_re[0] == '-'
2363 if discard:
2364 lang_re = lang_re[1:]
2365 current_langs = filter(re.compile(lang_re + '$').match, all_sub_langs)
2366 if discard:
2367 for lang in current_langs:
2368 while lang in requested_langs:
2369 requested_langs.remove(lang)
2370 else:
2371 requested_langs.extend(current_langs)
2372 requested_langs = orderedSet(requested_langs)
2373 elif 'en' in available_subs:
2374 requested_langs = ['en']
2375 else:
2376 requested_langs = [list(all_sub_langs)[0]]
2377 if requested_langs:
2378 self.write_debug('Downloading subtitles: %s' % ', '.join(requested_langs))
2379
2380 formats_query = self.params.get('subtitlesformat', 'best')
2381 formats_preference = formats_query.split('/') if formats_query else []
2382 subs = {}
2383 for lang in requested_langs:
2384 formats = available_subs.get(lang)
2385 if formats is None:
2386 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
2387 continue
2388 for ext in formats_preference:
2389 if ext == 'best':
2390 f = formats[-1]
2391 break
2392 matches = list(filter(lambda f: f['ext'] == ext, formats))
2393 if matches:
2394 f = matches[-1]
2395 break
2396 else:
2397 f = formats[-1]
2398 self.report_warning(
2399 'No subtitle format found matching "%s" for language %s, '
2400 'using %s' % (formats_query, lang, f['ext']))
2401 subs[lang] = f
2402 return subs
2403
2404 def __forced_printings(self, info_dict, filename, incomplete):
2405 def print_mandatory(field, actual_field=None):
2406 if actual_field is None:
2407 actual_field = field
2408 if (self.params.get('force%s' % field, False)
2409 and (not incomplete or info_dict.get(actual_field) is not None)):
2410 self.to_stdout(info_dict[actual_field])
2411
2412 def print_optional(field):
2413 if (self.params.get('force%s' % field, False)
2414 and info_dict.get(field) is not None):
2415 self.to_stdout(info_dict[field])
2416
2417 info_dict = info_dict.copy()
2418 if filename is not None:
2419 info_dict['filename'] = filename
2420 if info_dict.get('requested_formats') is not None:
2421 # For RTMP URLs, also include the playpath
2422 info_dict['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats'])
2423 elif 'url' in info_dict:
2424 info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '')
2425
2426 if self.params.get('forceprint') or self.params.get('forcejson'):
2427 self.post_extract(info_dict)
2428 for tmpl in self.params.get('forceprint', []):
2429 if re.match(r'\w+$', tmpl):
2430 tmpl = '%({})s'.format(tmpl)
2431 tmpl, info_copy = self.prepare_outtmpl(tmpl, info_dict)
2432 self.to_stdout(self.escape_outtmpl(tmpl) % info_copy)
2433
2434 print_mandatory('title')
2435 print_mandatory('id')
2436 print_mandatory('url', 'urls')
2437 print_optional('thumbnail')
2438 print_optional('description')
2439 print_optional('filename')
2440 if self.params.get('forceduration') and info_dict.get('duration') is not None:
2441 self.to_stdout(formatSeconds(info_dict['duration']))
2442 print_mandatory('format')
2443
2444 if self.params.get('forcejson'):
2445 self.to_stdout(json.dumps(self.sanitize_info(info_dict)))
2446
2447 def dl(self, name, info, subtitle=False, test=False):
2448 if not info.get('url'):
2449 self.raise_no_formats(info, True)
2450
2451 if test:
2452 verbose = self.params.get('verbose')
2453 params = {
2454 'test': True,
2455 'quiet': not verbose,
2456 'verbose': verbose,
2457 'noprogress': not verbose,
2458 'nopart': True,
2459 'skip_unavailable_fragments': False,
2460 'keep_fragments': False,
2461 'overwrites': True,
2462 '_no_ytdl_file': True,
2463 }
2464 else:
2465 params = self.params
2466 fd = get_suitable_downloader(info, params, to_stdout=(name == '-'))(self, params)
2467 if not test:
2468 for ph in self._progress_hooks:
2469 fd.add_progress_hook(ph)
2470 urls = '", "'.join([f['url'] for f in info.get('requested_formats', [])] or [info['url']])
2471 self.write_debug('Invoking downloader on "%s"' % urls)
2472 new_info = dict(info)
2473 if new_info.get('http_headers') is None:
2474 new_info['http_headers'] = self._calc_headers(new_info)
2475 return fd.download(name, new_info, subtitle)
2476
    def process_info(self, info_dict):
        """Process a single resolved IE result.

        Orchestrates everything that happens after extraction for one video:
        match-filter checks, forced printing, writing of side files
        (description, subtitles, thumbnails, infojson, annotations, link
        files), pre-processing, the actual download (including multi-format
        merge handling), fixups, postprocessing and archive recording.
        Returns early (with no exception) on any non-fatal failure.
        """

        assert info_dict.get('_type', 'video') == 'video'

        # Bail out before doing any work if the download cap is already reached
        max_downloads = self.params.get('max_downloads')
        if max_downloads is not None:
            if self._num_downloads >= int(max_downloads):
                raise MaxDownloadsReached()

        # TODO: backward compatibility, to be removed
        info_dict['fulltitle'] = info_dict['title']

        if 'format' not in info_dict and 'ext' in info_dict:
            info_dict['format'] = info_dict['ext']

        # A non-None return value means the entry was rejected (filters, archive, etc.)
        if self._match_entry(info_dict) is not None:
            return

        self.post_extract(info_dict)
        self._num_downloads += 1

        # info_dict['_filename'] needs to be set for backward compatibility
        info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
        temp_filename = self.prepare_filename(info_dict, 'temp')
        files_to_move = {}  # temp path -> final path (None = keep name), moved after download

        # Forced printings
        self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))

        if self.params.get('simulate'):
            if self.params.get('force_write_download_archive', False):
                self.record_download_archive(info_dict)
            # Do nothing else if in simulate mode
            return

        if full_filename is None:
            return
        if not self._ensure_dir_exists(encodeFilename(full_filename)):
            return
        if not self._ensure_dir_exists(encodeFilename(temp_filename)):
            return

        # The _write_* helpers return None on fatal error, falsy-but-not-None on skip
        if self._write_description('video', info_dict,
                                   self.prepare_filename(info_dict, 'description')) is None:
            return

        sub_files = self._write_subtitles(info_dict, temp_filename)
        if sub_files is None:
            return
        files_to_move.update(dict(sub_files))

        thumb_files = self._write_thumbnails(
            'video', info_dict, temp_filename, self.prepare_filename(info_dict, 'thumbnail'))
        if thumb_files is None:
            return
        files_to_move.update(dict(thumb_files))

        infofn = self.prepare_filename(info_dict, 'infojson')
        _infojson_written = self._write_info_json('video', info_dict, infofn)
        if _infojson_written:
            info_dict['__infojson_filename'] = infofn
        elif _infojson_written is None:
            return

        # Note: Annotations are deprecated
        annofn = None
        if self.params.get('writeannotations', False):
            annofn = self.prepare_filename(info_dict, 'annotation')
        if annofn:
            if not self._ensure_dir_exists(encodeFilename(annofn)):
                return
            if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(annofn)):
                self.to_screen('[info] Video annotations are already present')
            elif not info_dict.get('annotations'):
                self.report_warning('There are no annotations to write.')
            else:
                try:
                    self.to_screen('[info] Writing video annotations to: ' + annofn)
                    with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
                        annofile.write(info_dict['annotations'])
                except (KeyError, TypeError):
                    self.report_warning('There are no annotations to write.')
                except (OSError, IOError):
                    self.report_error('Cannot write annotations file: ' + annofn)
                    return

        # Write internet shortcut files
        url_link = webloc_link = desktop_link = False
        if self.params.get('writelink', False):
            # --write-link picks the native shortcut format for the current OS
            if sys.platform == "darwin":  # macOS.
                webloc_link = True
            elif sys.platform.startswith("linux"):
                desktop_link = True
            else:  # if sys.platform in ['win32', 'cygwin']:
                url_link = True
        if self.params.get('writeurllink', False):
            url_link = True
        if self.params.get('writewebloclink', False):
            webloc_link = True
        if self.params.get('writedesktoplink', False):
            desktop_link = True

        if url_link or webloc_link or desktop_link:
            if 'webpage_url' not in info_dict:
                self.report_error('Cannot write internet shortcut file because the "webpage_url" field is missing in the media information')
                return
            ascii_url = iri_to_uri(info_dict['webpage_url'])

        def _write_link_file(extension, template, newline, embed_filename):
            # Returns False on write failure (caller aborts), True otherwise
            linkfn = replace_extension(full_filename, extension, info_dict.get('ext'))
            # NOTE(review): this skips writing when 'overwrites' is ENABLED and the
            # file exists — looks inverted compared to the annotation/infojson
            # checks above, which use `not self.params.get('overwrites', True)`.
            # TODO confirm intended behavior before changing.
            if self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)):
                self.to_screen('[info] Internet shortcut is already present')
            else:
                try:
                    self.to_screen('[info] Writing internet shortcut to: ' + linkfn)
                    with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', newline=newline) as linkfile:
                        template_vars = {'url': ascii_url}
                        if embed_filename:
                            # .desktop files embed the media filename (shortcut extension stripped)
                            template_vars['filename'] = linkfn[:-(len(extension) + 1)]
                        linkfile.write(template % template_vars)
                except (OSError, IOError):
                    self.report_error('Cannot write internet shortcut ' + linkfn)
                    return False
            return True

        if url_link:
            if not _write_link_file('url', DOT_URL_LINK_TEMPLATE, '\r\n', embed_filename=False):
                return
        if webloc_link:
            if not _write_link_file('webloc', DOT_WEBLOC_LINK_TEMPLATE, '\n', embed_filename=False):
                return
        if desktop_link:
            if not _write_link_file('desktop', DOT_DESKTOP_LINK_TEMPLATE, '\n', embed_filename=True):
                return

        try:
            info_dict, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
        except PostProcessingError as err:
            self.report_error('Preprocessing: %s' % str(err))
            return

        must_record_download_archive = False
        if self.params.get('skip_download', False):
            # No download: just move already-written side files into place
            info_dict['filepath'] = temp_filename
            info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
            info_dict['__files_to_move'] = files_to_move
            info_dict = self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict)
        else:
            # Download
            info_dict.setdefault('__postprocessors', [])
            try:

                def existing_file(*filepaths):
                    # Return an already-downloaded file to reuse, or None after
                    # deleting any stale files when overwrites are enabled
                    ext = info_dict.get('ext')
                    final_ext = self.params.get('final_ext', ext)
                    existing_files = []
                    for file in orderedSet(filepaths):
                        if final_ext != ext:
                            # A postprocessor may already have converted the file
                            converted = replace_extension(file, final_ext, ext)
                            if os.path.exists(encodeFilename(converted)):
                                existing_files.append(converted)
                        if os.path.exists(encodeFilename(file)):
                            existing_files.append(file)

                    if not existing_files or self.params.get('overwrites', False):
                        for file in orderedSet(existing_files):
                            self.report_file_delete(file)
                            os.remove(encodeFilename(file))
                        return None

                    info_dict['ext'] = os.path.splitext(existing_files[0])[1][1:]
                    return existing_files[0]

                success = True
                if info_dict.get('requested_formats') is not None:

                    def compatible_formats(formats):
                        # Whether all requested formats can share one container
                        # TODO: some formats actually allow this (mkv, webm, ogg, mp4), but not all of them.
                        video_formats = [format for format in formats if format.get('vcodec') != 'none']
                        audio_formats = [format for format in formats if format.get('acodec') != 'none']
                        if len(video_formats) > 2 or len(audio_formats) > 2:
                            return False

                        # Check extension
                        exts = set(format.get('ext') for format in formats)
                        COMPATIBLE_EXTS = (
                            set(('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma')),
                            set(('webm',)),
                        )
                        for ext_sets in COMPATIBLE_EXTS:
                            if ext_sets.issuperset(exts):
                                return True
                        # TODO: Check acodec/vcodec
                        return False

                    requested_formats = info_dict['requested_formats']
                    old_ext = info_dict['ext']
                    if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
                        info_dict['ext'] = 'mkv'
                        self.report_warning(
                            'Requested formats are incompatible for merge and will be merged into mkv.')
                    new_ext = info_dict['ext']

                    def correct_ext(filename, ext=new_ext):
                        # Replace a recognized extension with `ext`; '-' (stdout) passes through
                        if filename == '-':
                            return filename
                        filename_real_ext = os.path.splitext(filename)[1][1:]
                        filename_wo_ext = (
                            os.path.splitext(filename)[0]
                            if filename_real_ext in (old_ext, new_ext)
                            else filename)
                        return '%s.%s' % (filename_wo_ext, ext)

                    # Ensure filename always has a correct extension for successful merge
                    full_filename = correct_ext(full_filename)
                    temp_filename = correct_ext(temp_filename)
                    dl_filename = existing_file(full_filename, temp_filename)
                    info_dict['__real_download'] = False

                    _protocols = set(determine_protocol(f) for f in requested_formats)
                    if len(_protocols) == 1:  # All requested formats have same protocol
                        info_dict['protocol'] = _protocols.pop()
                    directly_mergable = FFmpegFD.can_merge_formats(info_dict, self.params)
                    if dl_filename is not None:
                        self.report_file_already_downloaded(dl_filename)
                    elif (directly_mergable and get_suitable_downloader(
                            info_dict, self.params, to_stdout=(temp_filename == '-')) == FFmpegFD):
                        # ffmpeg can fetch and mux all formats in one invocation
                        info_dict['url'] = '\n'.join(f['url'] for f in requested_formats)
                        success, real_download = self.dl(temp_filename, info_dict)
                        info_dict['__real_download'] = real_download
                    else:
                        # Download each format separately, then merge via postprocessor
                        downloaded = []
                        merger = FFmpegMergerPP(self)
                        if self.params.get('allow_unplayable_formats'):
                            self.report_warning(
                                'You have requested merging of multiple formats '
                                'while also allowing unplayable formats to be downloaded. '
                                'The formats won\'t be merged to prevent data corruption.')
                        elif not merger.available:
                            self.report_warning(
                                'You have requested merging of multiple formats but ffmpeg is not installed. '
                                'The formats won\'t be merged.')

                        if temp_filename == '-':
                            reason = ('using a downloader other than ffmpeg' if directly_mergable
                                      else 'but the formats are incompatible for simultaneous download' if merger.available
                                      else 'but ffmpeg is not installed')
                            self.report_warning(
                                f'You have requested downloading multiple formats to stdout {reason}. '
                                'The formats will be streamed one after the other')
                        fname = temp_filename
                        for f in requested_formats:
                            new_info = dict(info_dict)
                            del new_info['requested_formats']
                            new_info.update(f)
                            if temp_filename != '-':
                                fname = prepend_extension(
                                    correct_ext(temp_filename, new_info['ext']),
                                    'f%s' % f['format_id'], new_info['ext'])
                                if not self._ensure_dir_exists(fname):
                                    return
                                f['filepath'] = fname
                                downloaded.append(fname)
                            partial_success, real_download = self.dl(fname, new_info)
                            info_dict['__real_download'] = info_dict['__real_download'] or real_download
                            success = success and partial_success
                        if merger.available and not self.params.get('allow_unplayable_formats'):
                            info_dict['__postprocessors'].append(merger)
                            info_dict['__files_to_merge'] = downloaded
                            # Even if there were no downloads, it is being merged only now
                            info_dict['__real_download'] = True
                        else:
                            for file in downloaded:
                                files_to_move[file] = None
                else:
                    # Just a single file
                    dl_filename = existing_file(full_filename, temp_filename)
                    if dl_filename is None or dl_filename == temp_filename:
                        # dl_filename == temp_filename could mean that the file was partially downloaded with --no-part.
                        # So we should try to resume the download
                        success, real_download = self.dl(temp_filename, info_dict)
                        info_dict['__real_download'] = real_download
                    else:
                        self.report_file_already_downloaded(dl_filename)

                dl_filename = dl_filename or temp_filename
                info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))

            except network_exceptions as err:
                self.report_error('unable to download video data: %s' % error_to_compat_str(err))
                return
            except (OSError, IOError) as err:
                # Local I/O failure: escalate so the caller can report it uniformly
                raise UnavailableVideoError(err)
            except (ContentTooShortError, ) as err:
                self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                return

            if success and full_filename != '-':

                def fixup():
                    # Queue ffmpeg-based fixups according to the 'fixup' policy;
                    # when fixing is disabled, only warn about detected problems
                    do_fixup = True
                    fixup_policy = self.params.get('fixup')
                    vid = info_dict['id']

                    if fixup_policy in ('ignore', 'never'):
                        return
                    elif fixup_policy == 'warn':
                        do_fixup = False
                    elif fixup_policy != 'force':
                        assert fixup_policy in ('detect_or_warn', None)
                        if not info_dict.get('__real_download'):
                            do_fixup = False

                    def ffmpeg_fixup(cndn, msg, cls):
                        if not cndn:
                            return
                        if not do_fixup:
                            self.report_warning(f'{vid}: {msg}')
                            return
                        pp = cls(self)
                        if pp.available:
                            info_dict['__postprocessors'].append(pp)
                        else:
                            self.report_warning(f'{vid}: {msg}. Install ffmpeg to fix this automatically')

                    stretched_ratio = info_dict.get('stretched_ratio')
                    ffmpeg_fixup(
                        stretched_ratio not in (1, None),
                        f'Non-uniform pixel ratio {stretched_ratio}',
                        FFmpegFixupStretchedPP)

                    ffmpeg_fixup(
                        (info_dict.get('requested_formats') is None
                         and info_dict.get('container') == 'm4a_dash'
                         and info_dict.get('ext') == 'm4a'),
                        'writing DASH m4a. Only some players support this container',
                        FFmpegFixupM4aPP)

                    downloader = (get_suitable_downloader(info_dict, self.params).__name__
                                  if 'protocol' in info_dict else None)
                    ffmpeg_fixup(downloader == 'HlsFD', 'malformed AAC bitstream detected', FFmpegFixupM3u8PP)
                    ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed timestamps detected', FFmpegFixupTimestampPP)
                    ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed duration detected', FFmpegFixupDurationPP)

                fixup()
                try:
                    info_dict = self.post_process(dl_filename, info_dict, files_to_move)
                except PostProcessingError as err:
                    self.report_error('Postprocessing: %s' % str(err))
                    return
                try:
                    for ph in self._post_hooks:
                        ph(info_dict['filepath'])
                except Exception as err:
                    self.report_error('post hooks: %s' % str(err))
                    return
                must_record_download_archive = True

        if must_record_download_archive or self.params.get('force_write_download_archive', False):
            self.record_download_archive(info_dict)
        # Raise AFTER the successful download so it is counted
        max_downloads = self.params.get('max_downloads')
        if max_downloads is not None and self._num_downloads >= int(max_downloads):
            raise MaxDownloadsReached()
2841
2842 def download(self, url_list):
2843 """Download a given list of URLs."""
2844 outtmpl = self.outtmpl_dict['default']
2845 if (len(url_list) > 1
2846 and outtmpl != '-'
2847 and '%' not in outtmpl
2848 and self.params.get('max_downloads') != 1):
2849 raise SameFileError(outtmpl)
2850
2851 for url in url_list:
2852 try:
2853 # It also downloads the videos
2854 res = self.extract_info(
2855 url, force_generic_extractor=self.params.get('force_generic_extractor', False))
2856 except UnavailableVideoError:
2857 self.report_error('unable to download video')
2858 except MaxDownloadsReached:
2859 self.to_screen('[info] Maximum number of downloads reached')
2860 raise
2861 except ExistingVideoReached:
2862 self.to_screen('[info] Encountered a video that is already in the archive, stopping due to --break-on-existing')
2863 raise
2864 except RejectedVideoReached:
2865 self.to_screen('[info] Encountered a video that did not match filter, stopping due to --break-on-reject')
2866 raise
2867 else:
2868 if self.params.get('dump_single_json', False):
2869 self.post_extract(res)
2870 self.to_stdout(json.dumps(self.sanitize_info(res)))
2871
2872 return self._download_retcode
2873
2874 def download_with_info_file(self, info_filename):
2875 with contextlib.closing(fileinput.FileInput(
2876 [info_filename], mode='r',
2877 openhook=fileinput.hook_encoded('utf-8'))) as f:
2878 # FileInput doesn't have a read method, we can't call json.load
2879 info = self.sanitize_info(json.loads('\n'.join(f)), self.params.get('clean_infojson', True))
2880 try:
2881 self.process_ie_result(info, download=True)
2882 except (DownloadError, EntryNotInPlaylist, ThrottledDownload):
2883 webpage_url = info.get('webpage_url')
2884 if webpage_url is not None:
2885 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
2886 return self.download([webpage_url])
2887 else:
2888 raise
2889 return self._download_retcode
2890
2891 @staticmethod
2892 def sanitize_info(info_dict, remove_private_keys=False):
2893 ''' Sanitize the infodict for converting to json '''
2894 if info_dict is None:
2895 return info_dict
2896 info_dict.setdefault('epoch', int(time.time()))
2897 remove_keys = {'__original_infodict'} # Always remove this since this may contain a copy of the entire dict
2898 keep_keys = ['_type'], # Always keep this to facilitate load-info-json
2899 if remove_private_keys:
2900 remove_keys |= {
2901 'requested_formats', 'requested_subtitles', 'requested_entries',
2902 'filepath', 'entries', 'original_url', 'playlist_autonumber',
2903 }
2904 empty_values = (None, {}, [], set(), tuple())
2905 reject = lambda k, v: k not in keep_keys and (
2906 k.startswith('_') or k in remove_keys or v in empty_values)
2907 else:
2908 reject = lambda k, v: k in remove_keys
2909 filter_fn = lambda obj: (
2910 list(map(filter_fn, obj)) if isinstance(obj, (LazyList, list, tuple, set))
2911 else obj if not isinstance(obj, dict)
2912 else dict((k, filter_fn(v)) for k, v in obj.items() if not reject(k, v)))
2913 return filter_fn(info_dict)
2914
2915 @staticmethod
2916 def filter_requested_info(info_dict, actually_filter=True):
2917 ''' Alias of sanitize_info for backward compatibility '''
2918 return YoutubeDL.sanitize_info(info_dict, actually_filter)
2919
2920 def run_pp(self, pp, infodict):
2921 files_to_delete = []
2922 if '__files_to_move' not in infodict:
2923 infodict['__files_to_move'] = {}
2924 try:
2925 files_to_delete, infodict = pp.run(infodict)
2926 except PostProcessingError as e:
2927 # Must be True and not 'only_download'
2928 if self.params.get('ignoreerrors') is True:
2929 self.report_error(e)
2930 return infodict
2931 raise
2932
2933 if not files_to_delete:
2934 return infodict
2935 if self.params.get('keepvideo', False):
2936 for f in files_to_delete:
2937 infodict['__files_to_move'].setdefault(f, '')
2938 else:
2939 for old_filename in set(files_to_delete):
2940 self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
2941 try:
2942 os.remove(encodeFilename(old_filename))
2943 except (IOError, OSError):
2944 self.report_warning('Unable to remove downloaded original file')
2945 if old_filename in infodict['__files_to_move']:
2946 del infodict['__files_to_move'][old_filename]
2947 return infodict
2948
2949 @staticmethod
2950 def post_extract(info_dict):
2951 def actual_post_extract(info_dict):
2952 if info_dict.get('_type') in ('playlist', 'multi_video'):
2953 for video_dict in info_dict.get('entries', {}):
2954 actual_post_extract(video_dict or {})
2955 return
2956
2957 post_extractor = info_dict.get('__post_extractor') or (lambda: {})
2958 extra = post_extractor().items()
2959 info_dict.update(extra)
2960 info_dict.pop('__post_extractor', None)
2961
2962 original_infodict = info_dict.get('__original_infodict') or {}
2963 original_infodict.update(extra)
2964 original_infodict.pop('__post_extractor', None)
2965
2966 actual_post_extract(info_dict or {})
2967
2968 def pre_process(self, ie_info, key='pre_process', files_to_move=None):
2969 info = dict(ie_info)
2970 info['__files_to_move'] = files_to_move or {}
2971 for pp in self._pps[key]:
2972 info = self.run_pp(pp, info)
2973 return info, info.pop('__files_to_move', None)
2974
2975 def post_process(self, filename, ie_info, files_to_move=None):
2976 """Run all the postprocessors on the given file."""
2977 info = dict(ie_info)
2978 info['filepath'] = filename
2979 info['__files_to_move'] = files_to_move or {}
2980
2981 for pp in ie_info.get('__postprocessors', []) + self._pps['post_process']:
2982 info = self.run_pp(pp, info)
2983 info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
2984 del info['__files_to_move']
2985 for pp in self._pps['after_move']:
2986 info = self.run_pp(pp, info)
2987 return info
2988
2989 def _make_archive_id(self, info_dict):
2990 video_id = info_dict.get('id')
2991 if not video_id:
2992 return
2993 # Future-proof against any change in case
2994 # and backwards compatibility with prior versions
2995 extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist
2996 if extractor is None:
2997 url = str_or_none(info_dict.get('url'))
2998 if not url:
2999 return
3000 # Try to find matching extractor for the URL and take its ie_key
3001 for ie_key, ie in self._ies.items():
3002 if ie.suitable(url):
3003 extractor = ie_key
3004 break
3005 else:
3006 return
3007 return '%s %s' % (extractor.lower(), video_id)
3008
3009 def in_download_archive(self, info_dict):
3010 fn = self.params.get('download_archive')
3011 if fn is None:
3012 return False
3013
3014 vid_id = self._make_archive_id(info_dict)
3015 if not vid_id:
3016 return False # Incomplete video information
3017
3018 return vid_id in self.archive
3019
3020 def record_download_archive(self, info_dict):
3021 fn = self.params.get('download_archive')
3022 if fn is None:
3023 return
3024 vid_id = self._make_archive_id(info_dict)
3025 assert vid_id
3026 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
3027 archive_file.write(vid_id + '\n')
3028 self.archive.add(vid_id)
3029
3030 @staticmethod
3031 def format_resolution(format, default='unknown'):
3032 if format.get('vcodec') == 'none':
3033 if format.get('acodec') == 'none':
3034 return 'images'
3035 return 'audio only'
3036 if format.get('resolution') is not None:
3037 return format['resolution']
3038 if format.get('width') and format.get('height'):
3039 res = '%dx%d' % (format['width'], format['height'])
3040 elif format.get('height'):
3041 res = '%sp' % format['height']
3042 elif format.get('width'):
3043 res = '%dx?' % format['width']
3044 else:
3045 res = default
3046 return res
3047
    def _format_note(self, fdict):
        """Build the free-form 'note' column text for one format in the legacy
        format listing. Fields are appended in a fixed order, mostly separated
        by ', ' (only added once some text exists)."""
        res = ''
        if fdict.get('ext') in ['f4f', 'f4m']:
            res += '(unsupported) '
        if fdict.get('language'):
            if res:
                res += ' '
            res += '[%s] ' % fdict['language']
        if fdict.get('format_note') is not None:
            res += fdict['format_note'] + ' '
        if fdict.get('tbr') is not None:
            res += '%4dk ' % fdict['tbr']
        if fdict.get('container') is not None:
            if res:
                res += ', '
            res += '%s container' % fdict['container']
        if (fdict.get('vcodec') is not None
                and fdict.get('vcodec') != 'none'):
            if res:
                res += ', '
            res += fdict['vcodec']
            if fdict.get('vbr') is not None:
                # '@' joins the codec with the bitrate appended below
                res += '@'
        elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
            # Bitrates known but codec unknown: label the video bitrate explicitly
            res += 'video@'
        if fdict.get('vbr') is not None:
            res += '%4dk' % fdict['vbr']
        if fdict.get('fps') is not None:
            if res:
                res += ', '
            res += '%sfps' % fdict['fps']
        if fdict.get('acodec') is not None:
            if res:
                res += ', '
            if fdict['acodec'] == 'none':
                res += 'video only'
            else:
                res += '%-5s' % fdict['acodec']
        elif fdict.get('abr') is not None:
            if res:
                res += ', '
            res += 'audio'
        if fdict.get('abr') is not None:
            res += '@%3dk' % fdict['abr']
        if fdict.get('asr') is not None:
            res += ' (%5dHz)' % fdict['asr']
        if fdict.get('filesize') is not None:
            if res:
                res += ', '
            res += format_bytes(fdict['filesize'])
        elif fdict.get('filesize_approx') is not None:
            if res:
                res += ', '
            # '~' marks an approximate size
            res += '~' + format_bytes(fdict['filesize_approx'])
        return res
3103
    def list_formats(self, info_dict):
        """Print the table of available formats for the video.

        Uses the new multi-column table unless the 'list-formats' compat
        option or listformats_table=False requests the legacy youtube-dl
        style listing."""
        formats = info_dict.get('formats', [info_dict])
        new_format = (
            'list-formats' not in self.params.get('compat_opts', [])
            and self.params.get('listformats_table', True) is not False)
        if new_format:
            # One row per format; '|' entries become column-group separators in render_table
            table = [
                [
                    format_field(f, 'format_id'),
                    format_field(f, 'ext'),
                    self.format_resolution(f),
                    format_field(f, 'fps', '%d'),
                    '|',
                    format_field(f, 'filesize', ' %s', func=format_bytes) + format_field(f, 'filesize_approx', '~%s', func=format_bytes),
                    format_field(f, 'tbr', '%4dk'),
                    shorten_protocol_name(f.get('protocol', '').replace("native", "n")),
                    '|',
                    format_field(f, 'vcodec', default='unknown').replace('none', ''),
                    format_field(f, 'vbr', '%4dk'),
                    format_field(f, 'acodec', default='unknown').replace('none', ''),
                    format_field(f, 'abr', '%3dk'),
                    format_field(f, 'asr', '%5dHz'),
                    ', '.join(filter(None, (
                        'UNSUPPORTED' if f.get('ext') in ('f4f', 'f4m') else '',
                        format_field(f, 'language', '[%s]'),
                        format_field(f, 'format_note'),
                        format_field(f, 'container', ignore=(None, f.get('ext'))),
                    ))),
                    # Formats below preference -1000 are hidden from the listing
                ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
            header_line = ['ID', 'EXT', 'RESOLUTION', 'FPS', '|', ' FILESIZE', ' TBR', 'PROTO',
                           '|', 'VCODEC', ' VBR', 'ACODEC', ' ABR', ' ASR', 'MORE INFO']
        else:
            # Legacy 4-column layout for youtube-dl compatibility
            table = [
                [
                    format_field(f, 'format_id'),
                    format_field(f, 'ext'),
                    self.format_resolution(f),
                    self._format_note(f)]
                for f in formats
                if f.get('preference') is None or f['preference'] >= -1000]
            header_line = ['format code', 'extension', 'resolution', 'note']

        self.to_screen(
            '[info] Available formats for %s:' % info_dict['id'])
        self.to_stdout(render_table(
            header_line, table, delim=new_format, extraGap=(0 if new_format else 1), hideEmpty=new_format))
3150
3151 def list_thumbnails(self, info_dict):
3152 thumbnails = list(info_dict.get('thumbnails'))
3153 if not thumbnails:
3154 self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
3155 return
3156
3157 self.to_screen(
3158 '[info] Thumbnails for %s:' % info_dict['id'])
3159 self.to_stdout(render_table(
3160 ['ID', 'width', 'height', 'URL'],
3161 [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
3162
3163 def list_subtitles(self, video_id, subtitles, name='subtitles'):
3164 if not subtitles:
3165 self.to_screen('%s has no %s' % (video_id, name))
3166 return
3167 self.to_screen(
3168 'Available %s for %s:' % (name, video_id))
3169
3170 def _row(lang, formats):
3171 exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats)))
3172 if len(set(names)) == 1:
3173 names = [] if names[0] == 'unknown' else names[:1]
3174 return [lang, ', '.join(names), ', '.join(exts)]
3175
3176 self.to_stdout(render_table(
3177 ['Language', 'Name', 'Formats'],
3178 [_row(lang, formats) for lang, formats in subtitles.items()],
3179 hideEmpty=True))
3180
3181 def urlopen(self, req):
3182 """ Start an HTTP download """
3183 if isinstance(req, compat_basestring):
3184 req = sanitized_Request(req)
3185 return self._opener.open(req, timeout=self._socket_timeout)
3186
    def print_debug_header(self):
        """Write the '[debug] ...' header (version, environment, dependencies,
        proxy map) to output. No-op unless 'verbose' is enabled."""
        if not self.params.get('verbose'):
            return

        # sys.stdout may lack an 'encoding' attribute when replaced/redirected
        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        write_string(encoding_str, encoding=None)

        source = detect_variant()  # how yt-dlp was installed (binary, pip, source, ...)
        self._write_string('[debug] yt-dlp version %s%s\n' % (__version__, '' if source == 'unknown' else f' ({source})'))
        if _LAZY_LOADER:
            self._write_string('[debug] Lazy loading extractors enabled\n')
        if plugin_extractors or plugin_postprocessors:
            self._write_string('[debug] Plugins: %s\n' % [
                '%s%s' % (klass.__name__, '' if klass.__name__ == name else f' as {name}')
                for name, klass in itertools.chain(plugin_extractors.items(), plugin_postprocessors.items())])
        if self.params.get('compat_opts'):
            self._write_string(
                '[debug] Compatibility options: %s\n' % ', '.join(self.params.get('compat_opts')))
        try:
            # Best-effort: report the git commit when running from a checkout
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = process_communicate_or_kill(sp)
            out = out.decode().strip()
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: %s\n' % out)
        except Exception:
            try:
                # sys.exc_clear only exists on Python 2; harmless no-op otherwise
                sys.exc_clear()
            except Exception:
                pass

        def python_implementation():
            # e.g. 'CPython' or 'PyPy version x.y.z'
            impl_name = platform.python_implementation()
            if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'):
                return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3]
            return impl_name

        self._write_string('[debug] Python version %s (%s %s) - %s\n' % (
            platform.python_version(),
            python_implementation(),
            platform.architecture()[0],
            platform_name()))

        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        exe_versions['phantomjs'] = PhantomJSwrapper._version()
        exe_str = ', '.join(
            f'{exe} {v}' for exe, v in sorted(exe_versions.items()) if v
        ) or 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        # Imported lazily here to avoid import cycles / cost at module load
        from .downloader.websocket import has_websockets
        from .postprocessor.embedthumbnail import has_mutagen
        from .cookies import SQLITE_AVAILABLE, KEYRING_AVAILABLE

        lib_str = ', '.join(sorted(filter(None, (
            compat_pycrypto_AES and compat_pycrypto_AES.__name__.split('.')[0],
            has_websockets and 'websockets',
            has_mutagen and 'mutagen',
            SQLITE_AVAILABLE and 'sqlite',
            KEYRING_AVAILABLE and 'keyring',
        )))) or 'none'
        self._write_string('[debug] Optional libraries: %s\n' % lib_str)

        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        if self.params.get('call_home', False):
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            # NOTE: the unconditional return makes the version check below
            # unreachable -- the youtube-dl update check appears deliberately
            # disabled here (dead code kept as-is)
            return
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
3277
    def _setup_opener(self):
        """Build the urllib opener used for all HTTP(S) requests.

        Sets self._socket_timeout, self.cookiejar and self._opener, wiring up
        proxy, HTTPS, cookie, redirect and data-URL handlers, and disabling
        the file:// protocol for security."""
        timeout_val = self.params.get('socket_timeout')
        # Default socket timeout is 10 minutes
        self._socket_timeout = 600 if timeout_val is None else float(timeout_val)

        opts_cookiesfrombrowser = self.params.get('cookiesfrombrowser')
        opts_cookiefile = self.params.get('cookiefile')
        opts_proxy = self.params.get('proxy')

        self.cookiejar = load_cookies(opts_cookiefile, opts_cookiesfrombrowser, self)

        cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
        if opts_proxy is not None:
            # An explicit empty string disables all proxies
            if opts_proxy == '':
                proxies = {}
            else:
                proxies = {'http': opts_proxy, 'https': opts_proxy}
        else:
            proxies = compat_urllib_request.getproxies()
            # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805)
            if 'http' in proxies and 'https' not in proxies:
                proxies['https'] = proxies['http']
        proxy_handler = PerRequestProxyHandler(proxies)

        debuglevel = 1 if self.params.get('debug_printtraffic') else 0
        https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
        ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
        redirect_handler = YoutubeDLRedirectHandler()
        data_handler = compat_urllib_request_DataHandler()

        # When passing our own FileHandler instance, build_opener won't add the
        # default FileHandler and allows us to disable the file protocol, which
        # can be used for malicious purposes (see
        # https://github.com/ytdl-org/youtube-dl/issues/8227)
        file_handler = compat_urllib_request.FileHandler()

        def file_open(*args, **kwargs):
            raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in yt-dlp for security reasons')
        file_handler.file_open = file_open

        opener = compat_urllib_request.build_opener(
            proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        self._opener = opener
3325
3326 def encode(self, s):
3327 if isinstance(s, bytes):
3328 return s # Already encoded
3329
3330 try:
3331 return s.encode(self.get_encoding())
3332 except UnicodeEncodeError as err:
3333 err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
3334 raise
3335
3336 def get_encoding(self):
3337 encoding = self.params.get('encoding')
3338 if encoding is None:
3339 encoding = preferredencoding()
3340 return encoding
3341
3342 def _write_info_json(self, label, ie_result, infofn):
3343 ''' Write infojson and returns True = written, False = skip, None = error '''
3344 if not self.params.get('writeinfojson'):
3345 return False
3346 elif not infofn:
3347 self.write_debug(f'Skipping writing {label} infojson')
3348 return False
3349 elif not self._ensure_dir_exists(infofn):
3350 return None
3351 elif not self.params.get('overwrites', True) and os.path.exists(infofn):
3352 self.to_screen(f'[info] {label.title()} metadata is already present')
3353 else:
3354 self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}')
3355 try:
3356 write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn)
3357 except (OSError, IOError):
3358 self.report_error(f'Cannot write {label} metadata to JSON file {infofn}')
3359 return None
3360 return True
3361
3362 def _write_description(self, label, ie_result, descfn):
3363 ''' Write description and returns True = written, False = skip, None = error '''
3364 if not self.params.get('writedescription'):
3365 return False
3366 elif not descfn:
3367 self.write_debug(f'Skipping writing {label} description')
3368 return False
3369 elif not self._ensure_dir_exists(descfn):
3370 return None
3371 elif not self.params.get('overwrites', True) and os.path.exists(descfn):
3372 self.to_screen(f'[info] {label.title()} description is already present')
3373 elif ie_result.get('description') is None:
3374 self.report_warning(f'There\'s no {label} description to write')
3375 return False
3376 else:
3377 try:
3378 self.to_screen(f'[info] Writing {label} description to: {descfn}')
3379 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
3380 descfile.write(ie_result['description'])
3381 except (OSError, IOError):
3382 self.report_error(f'Cannot write {label} description file {descfn}')
3383 return None
3384 return True
3385
    def _write_subtitles(self, info_dict, filename):
        ''' Write subtitles to file and return list of (sub_filename, final_sub_filename); or None if error'''
        ret = []
        subtitles = info_dict.get('requested_subtitles')
        if not subtitles or not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')):
            # subtitles download errors are already managed as troubles in relevant IE
            # that way it will silently go on when used with unsupporting IE
            return ret

        # Separate template for the final subtitle location; an empty result
        # means the user opted out of writing subtitles via the output template
        sub_filename_base = self.prepare_filename(info_dict, 'subtitle')
        if not sub_filename_base:
            self.to_screen('[info] Skipping writing video subtitles')
            return ret
        for sub_lang, sub_info in subtitles.items():
            sub_format = sub_info['ext']
            # Temporary (download-time) name vs final name after post-processing
            sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
            sub_filename_final = subtitles_filename(sub_filename_base, sub_lang, sub_format, info_dict.get('ext'))
            if not self.params.get('overwrites', True) and os.path.exists(sub_filename):
                # Existing file with overwrites disabled: record it and move on
                self.to_screen(f'[info] Video subtitle {sub_lang}.{sub_format} is already present')
                sub_info['filepath'] = sub_filename
                ret.append((sub_filename, sub_filename_final))
                continue

            self.to_screen(f'[info] Writing video subtitles to: {sub_filename}')
            if sub_info.get('data') is not None:
                # Subtitle content was already extracted — write it directly
                try:
                    # Use newline='' to prevent conversion of newline characters
                    # See https://github.com/ytdl-org/youtube-dl/issues/10268
                    with io.open(sub_filename, 'w', encoding='utf-8', newline='') as subfile:
                        subfile.write(sub_info['data'])
                    sub_info['filepath'] = sub_filename
                    ret.append((sub_filename, sub_filename_final))
                    continue
                except (OSError, IOError):
                    # A local write failure aborts the whole operation (None = error)
                    self.report_error(f'Cannot write video subtitles file {sub_filename}')
                    return None

            # No inline data — download the subtitle from its URL, inheriting
            # the video's HTTP headers unless the subtitle defines its own
            try:
                sub_copy = sub_info.copy()
                sub_copy.setdefault('http_headers', info_dict.get('http_headers'))
                self.dl(sub_filename, sub_copy, subtitle=True)
                sub_info['filepath'] = sub_filename
                ret.append((sub_filename, sub_filename_final))
            except (ExtractorError, IOError, OSError, ValueError) + network_exceptions as err:
                # Download failures are non-fatal: warn and continue with the next language
                self.report_warning(f'Unable to download video subtitles for {sub_lang!r}: {err}')
                continue
        return ret
3433
3434 def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None):
3435 ''' Write thumbnails to file and return list of (thumb_filename, final_thumb_filename) '''
3436 write_all = self.params.get('write_all_thumbnails', False)
3437 thumbnails, ret = [], []
3438 if write_all or self.params.get('writethumbnail', False):
3439 thumbnails = info_dict.get('thumbnails') or []
3440 multiple = write_all and len(thumbnails) > 1
3441
3442 if thumb_filename_base is None:
3443 thumb_filename_base = filename
3444 if thumbnails and not thumb_filename_base:
3445 self.write_debug(f'Skipping writing {label} thumbnail')
3446 return ret
3447
3448 for t in thumbnails[::-1]:
3449 thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg')
3450 thumb_display_id = f'{label} thumbnail' + (f' {t["id"]}' if multiple else '')
3451 thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext'))
3452 thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext'))
3453
3454 if not self.params.get('overwrites', True) and os.path.exists(thumb_filename):
3455 ret.append((thumb_filename, thumb_filename_final))
3456 t['filepath'] = thumb_filename
3457 self.to_screen(f'[info] {thumb_display_id.title()} is already present')
3458 else:
3459 self.to_screen(f'[info] Downloading {thumb_display_id} ...')
3460 try:
3461 uf = self.urlopen(t['url'])
3462 self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}')
3463 with open(encodeFilename(thumb_filename), 'wb') as thumbf:
3464 shutil.copyfileobj(uf, thumbf)
3465 ret.append((thumb_filename, thumb_filename_final))
3466 t['filepath'] = thumb_filename
3467 except network_exceptions as err:
3468 self.report_warning(f'Unable to download {thumb_display_id}: {err}')
3469 if ret and not write_all:
3470 break
3471 return ret