]> jfr.im git - yt-dlp.git/blob - yt_dlp/YoutubeDL.py
[Mediaklikk] Add Extractor (#867)
[yt-dlp.git] / yt_dlp / YoutubeDL.py
1 #!/usr/bin/env python3
2 # coding: utf-8
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import copy
9 import datetime
10 import errno
11 import fileinput
12 import io
13 import itertools
14 import json
15 import locale
16 import operator
17 import os
18 import platform
19 import re
20 import shutil
21 import subprocess
22 import sys
23 import tempfile
24 import time
25 import tokenize
26 import traceback
27 import random
28
29 from string import ascii_letters
30 from zipimport import zipimporter
31
32 from .compat import (
33 compat_basestring,
34 compat_get_terminal_size,
35 compat_kwargs,
36 compat_numeric_types,
37 compat_os_name,
38 compat_shlex_quote,
39 compat_str,
40 compat_tokenize_tokenize,
41 compat_urllib_error,
42 compat_urllib_request,
43 compat_urllib_request_DataHandler,
44 )
45 from .cookies import load_cookies
46 from .utils import (
47 age_restricted,
48 args_to_str,
49 ContentTooShortError,
50 date_from_str,
51 DateRange,
52 DEFAULT_OUTTMPL,
53 determine_ext,
54 determine_protocol,
55 DOT_DESKTOP_LINK_TEMPLATE,
56 DOT_URL_LINK_TEMPLATE,
57 DOT_WEBLOC_LINK_TEMPLATE,
58 DownloadError,
59 encode_compat_str,
60 encodeFilename,
61 EntryNotInPlaylist,
62 error_to_compat_str,
63 ExistingVideoReached,
64 expand_path,
65 ExtractorError,
66 float_or_none,
67 format_bytes,
68 format_field,
69 STR_FORMAT_RE_TMPL,
70 STR_FORMAT_TYPES,
71 formatSeconds,
72 GeoRestrictedError,
73 HEADRequest,
74 int_or_none,
75 iri_to_uri,
76 ISO3166Utils,
77 LazyList,
78 locked_file,
79 make_dir,
80 make_HTTPS_handler,
81 MaxDownloadsReached,
82 network_exceptions,
83 orderedSet,
84 OUTTMPL_TYPES,
85 PagedList,
86 parse_filesize,
87 PerRequestProxyHandler,
88 platform_name,
89 PostProcessingError,
90 preferredencoding,
91 prepend_extension,
92 process_communicate_or_kill,
93 register_socks_protocols,
94 RejectedVideoReached,
95 render_table,
96 replace_extension,
97 SameFileError,
98 sanitize_filename,
99 sanitize_path,
100 sanitize_url,
101 sanitized_Request,
102 std_headers,
103 str_or_none,
104 strftime_or_none,
105 subtitles_filename,
106 ThrottledDownload,
107 to_high_limit_path,
108 traverse_obj,
109 try_get,
110 UnavailableVideoError,
111 url_basename,
112 variadic,
113 version_tuple,
114 write_json_file,
115 write_string,
116 YoutubeDLCookieProcessor,
117 YoutubeDLHandler,
118 YoutubeDLRedirectHandler,
119 )
120 from .cache import Cache
121 from .extractor import (
122 gen_extractor_classes,
123 get_info_extractor,
124 _LAZY_LOADER,
125 _PLUGIN_CLASSES
126 )
127 from .extractor.openload import PhantomJSwrapper
128 from .downloader import (
129 FFmpegFD,
130 get_suitable_downloader,
131 shorten_protocol_name
132 )
133 from .downloader.rtmp import rtmpdump_version
134 from .postprocessor import (
135 get_postprocessor,
136 FFmpegFixupDurationPP,
137 FFmpegFixupM3u8PP,
138 FFmpegFixupM4aPP,
139 FFmpegFixupStretchedPP,
140 FFmpegFixupTimestampPP,
141 FFmpegMergerPP,
142 FFmpegPostProcessor,
143 MoveFilesAfterDownloadPP,
144 )
145 from .version import __version__
146
147 if compat_os_name == 'nt':
148 import ctypes
149
150
151 class YoutubeDL(object):
152 """YoutubeDL class.
153
154 YoutubeDL objects are the ones responsible of downloading the
155 actual video file and writing it to disk if the user has requested
156 it, among some other tasks. In most cases there should be one per
157 program. As, given a video URL, the downloader doesn't know how to
158 extract all the needed information, task that InfoExtractors do, it
159 has to pass the URL to one of them.
160
161 For this, YoutubeDL objects have a method that allows
162 InfoExtractors to be registered in a given order. When it is passed
163 a URL, the YoutubeDL object handles it to the first InfoExtractor it
164 finds that reports being able to handle it. The InfoExtractor extracts
165 all the information about the video or videos the URL refers to, and
166 YoutubeDL process the extracted information, possibly using a File
167 Downloader to download the video.
168
169 YoutubeDL objects accept a lot of parameters. In order not to saturate
170 the object constructor with arguments, it receives a dictionary of
171 options instead. These options are available through the params
172 attribute for the InfoExtractors to use. The YoutubeDL also
173 registers itself as the downloader in charge for the InfoExtractors
174 that are added to it, so this is a "mutual registration".
175
176 Available options:
177
178 username: Username for authentication purposes.
179 password: Password for authentication purposes.
180 videopassword: Password for accessing a video.
181 ap_mso: Adobe Pass multiple-system operator identifier.
182 ap_username: Multiple-system operator account username.
183 ap_password: Multiple-system operator account password.
184 usenetrc: Use netrc for authentication instead.
185 verbose: Print additional info to stdout.
186 quiet: Do not print messages to stdout.
187 no_warnings: Do not print out anything for warnings.
188 forceprint: A list of templates to force print
189 forceurl: Force printing final URL. (Deprecated)
190 forcetitle: Force printing title. (Deprecated)
191 forceid: Force printing ID. (Deprecated)
192 forcethumbnail: Force printing thumbnail URL. (Deprecated)
193 forcedescription: Force printing description. (Deprecated)
194 forcefilename: Force printing final filename. (Deprecated)
195 forceduration: Force printing duration. (Deprecated)
196 forcejson: Force printing info_dict as JSON.
197 dump_single_json: Force printing the info_dict of the whole playlist
198 (or video) as a single JSON line.
199 force_write_download_archive: Force writing download archive regardless
200 of 'skip_download' or 'simulate'.
201 simulate: Do not download the video files. If unset (or None),
202 simulate only if listsubtitles, listformats or list_thumbnails is used
203 format: Video format code. see "FORMAT SELECTION" for more details.
204 allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded.
205 ignore_no_formats_error: Ignore "No video formats" error. Useful for
206 extracting metadata even if the video is not actually
207 available for download (experimental)
208 format_sort: How to sort the video formats. see "Sorting Formats"
209 for more details.
210 format_sort_force: Force the given format_sort. see "Sorting Formats"
211 for more details.
212 allow_multiple_video_streams: Allow multiple video streams to be merged
213 into a single file
214 allow_multiple_audio_streams: Allow multiple audio streams to be merged
215 into a single file
216 check_formats Whether to test if the formats are downloadable.
217 Can be True (check all), False (check none)
218 or None (check only if requested by extractor)
219 paths: Dictionary of output paths. The allowed keys are 'home'
220 'temp' and the keys of OUTTMPL_TYPES (in utils.py)
221 outtmpl: Dictionary of templates for output names. Allowed keys
222 are 'default' and the keys of OUTTMPL_TYPES (in utils.py).
223 For compatibility with youtube-dl, a single string can also be used
224 outtmpl_na_placeholder: Placeholder for unavailable meta fields.
225 restrictfilenames: Do not allow "&" and spaces in file names
226 trim_file_name: Limit length of filename (extension excluded)
227 windowsfilenames: Force the filenames to be windows compatible
228 ignoreerrors: Do not stop on download errors
229 (Default True when running yt-dlp,
230 but False when directly accessing YoutubeDL class)
231 skip_playlist_after_errors: Number of allowed failures until the rest of
232 the playlist is skipped
233 force_generic_extractor: Force downloader to use the generic extractor
234 overwrites: Overwrite all video and metadata files if True,
235 overwrite only non-video files if None
236 and don't overwrite any file if False
237 For compatibility with youtube-dl,
238 "nooverwrites" may also be used instead
239 playliststart: Playlist item to start at.
240 playlistend: Playlist item to end at.
241 playlist_items: Specific indices of playlist to download.
242 playlistreverse: Download playlist items in reverse order.
243 playlistrandom: Download playlist items in random order.
244 matchtitle: Download only matching titles.
245 rejecttitle: Reject downloads for matching titles.
246 logger: Log messages to a logging.Logger instance.
247 logtostderr: Log messages to stderr instead of stdout.
248 writedescription: Write the video description to a .description file
249 writeinfojson: Write the video description to a .info.json file
250 clean_infojson: Remove private fields from the infojson
251 getcomments: Extract video comments. This will not be written to disk
252 unless writeinfojson is also given
253 writeannotations: Write the video annotations to a .annotations.xml file
254 writethumbnail: Write the thumbnail image to a file
255 allow_playlist_files: Whether to write playlists' description, infojson etc
256 also to disk when using the 'write*' options
257 write_all_thumbnails: Write all thumbnail formats to files
258 writelink: Write an internet shortcut file, depending on the
259 current platform (.url/.webloc/.desktop)
260 writeurllink: Write a Windows internet shortcut file (.url)
261 writewebloclink: Write a macOS internet shortcut file (.webloc)
262 writedesktoplink: Write a Linux internet shortcut file (.desktop)
263 writesubtitles: Write the video subtitles to a file
264 writeautomaticsub: Write the automatically generated subtitles to a file
265 allsubtitles: Deprecated - Use subtitleslangs = ['all']
266 Downloads all the subtitles of the video
267 (requires writesubtitles or writeautomaticsub)
268 listsubtitles: Lists all available subtitles for the video
269 subtitlesformat: The format code for subtitles
270 subtitleslangs: List of languages of the subtitles to download (can be regex).
271 The list may contain "all" to refer to all the available
272 subtitles. The language can be prefixed with a "-" to
273 exclude it from the requested languages. Eg: ['all', '-live_chat']
274 keepvideo: Keep the video file after post-processing
275 daterange: A DateRange object, download only if the upload_date is in the range.
276 skip_download: Skip the actual download of the video file
277 cachedir: Location of the cache files in the filesystem.
278 False to disable filesystem cache.
279 noplaylist: Download single video instead of a playlist if in doubt.
280 age_limit: An integer representing the user's age in years.
281 Unsuitable videos for the given age are skipped.
282 min_views: An integer representing the minimum view count the video
283 must have in order to not be skipped.
284 Videos without view count information are always
285 downloaded. None for no limit.
286 max_views: An integer representing the maximum view count.
287 Videos that are more popular than that are not
288 downloaded.
289 Videos without view count information are always
290 downloaded. None for no limit.
291 download_archive: File name of a file where all downloads are recorded.
292 Videos already present in the file are not downloaded
293 again.
294 break_on_existing: Stop the download process after attempting to download a
295 file that is in the archive.
296 break_on_reject: Stop the download process when encountering a video that
297 has been filtered out.
298 cookiefile: File name where cookies should be read from and dumped to
299 cookiesfrombrowser: A tuple containing the name of the browser and the profile
300 name/path from where cookies are loaded.
301 Eg: ('chrome', ) or ('vivaldi', 'default')
302 nocheckcertificate:Do not verify SSL certificates
303 prefer_insecure: Use HTTP instead of HTTPS to retrieve information.
304 At the moment, this is only supported by YouTube.
305 proxy: URL of the proxy server to use
306 geo_verification_proxy: URL of the proxy to use for IP address verification
307 on geo-restricted sites.
308 socket_timeout: Time to wait for unresponsive hosts, in seconds
309 bidi_workaround: Work around buggy terminals without bidirectional text
310 support, using fribidi
311 debug_printtraffic:Print out sent and received HTTP traffic
312 include_ads: Download ads as well
313 default_search: Prepend this string if an input url is not valid.
314 'auto' for elaborate guessing
315 encoding: Use this encoding instead of the system-specified.
316 extract_flat: Do not resolve URLs, return the immediate result.
317 Pass in 'in_playlist' to only show this behavior for
318 playlist items.
319 postprocessors: A list of dictionaries, each with an entry
320 * key: The name of the postprocessor. See
321 yt_dlp/postprocessor/__init__.py for a list.
322 * when: When to run the postprocessor. Can be one of
323 pre_process|before_dl|post_process|after_move.
324 Assumed to be 'post_process' if not given
325 post_hooks: A list of functions that get called as the final step
326 for each video file, after all postprocessors have been
327 called. The filename will be passed as the only argument.
328 progress_hooks: A list of functions that get called on download
329 progress, with a dictionary with the entries
330 * status: One of "downloading", "error", or "finished".
331 Check this first and ignore unknown values.
332 * info_dict: The extracted info_dict
333
334 If status is one of "downloading", or "finished", the
335 following properties may also be present:
336 * filename: The final filename (always present)
337 * tmpfilename: The filename we're currently writing to
338 * downloaded_bytes: Bytes on disk
339 * total_bytes: Size of the whole file, None if unknown
340 * total_bytes_estimate: Guess of the eventual file size,
341 None if unavailable.
342 * elapsed: The number of seconds since download started.
343 * eta: The estimated time in seconds, None if unknown
344 * speed: The download speed in bytes/second, None if
345 unknown
346 * fragment_index: The counter of the currently
347 downloaded video fragment.
348 * fragment_count: The number of fragments (= individual
349 files that will be merged)
350
351 Progress hooks are guaranteed to be called at least once
352 (with status "finished") if the download is successful.
353 merge_output_format: Extension to use when merging formats.
354 final_ext: Expected final extension; used to detect when the file was
355 already downloaded and converted. "merge_output_format" is
356 replaced by this extension when given
357 fixup: Automatically correct known faults of the file.
358 One of:
359 - "never": do nothing
360 - "warn": only emit a warning
361 - "detect_or_warn": check whether we can do anything
362 about it, warn otherwise (default)
363 source_address: Client-side IP address to bind to.
364 call_home: Boolean, true iff we are allowed to contact the
365 yt-dlp servers for debugging. (BROKEN)
366 sleep_interval_requests: Number of seconds to sleep between requests
367 during extraction
368 sleep_interval: Number of seconds to sleep before each download when
369 used alone or a lower bound of a range for randomized
370 sleep before each download (minimum possible number
371 of seconds to sleep) when used along with
372 max_sleep_interval.
373 max_sleep_interval:Upper bound of a range for randomized sleep before each
374 download (maximum possible number of seconds to sleep).
375 Must only be used along with sleep_interval.
376 Actual sleep time will be a random float from range
377 [sleep_interval; max_sleep_interval].
378 sleep_interval_subtitles: Number of seconds to sleep before each subtitle download
379 listformats: Print an overview of available video formats and exit.
380 list_thumbnails: Print a table of all thumbnails and exit.
381 match_filter: A function that gets called with the info_dict of
382 every video.
383 If it returns a message, the video is ignored.
384 If it returns None, the video is downloaded.
385 match_filter_func in utils.py is one example for this.
386 no_color: Do not emit color codes in output.
387 geo_bypass: Bypass geographic restriction via faking X-Forwarded-For
388 HTTP header
389 geo_bypass_country:
390 Two-letter ISO 3166-2 country code that will be used for
391 explicit geographic restriction bypassing via faking
392 X-Forwarded-For HTTP header
393 geo_bypass_ip_block:
394 IP range in CIDR notation that will be used similarly to
395 geo_bypass_country
396
397 The following options determine which downloader is picked:
398 external_downloader: A dictionary of protocol keys and the executable of the
399 external downloader to use for it. The allowed protocols
400 are default|http|ftp|m3u8|dash|rtsp|rtmp|mms.
401 Set the value to 'native' to use the native downloader
402 hls_prefer_native: Deprecated - Use external_downloader = {'m3u8': 'native'}
403 or {'m3u8': 'ffmpeg'} instead.
404 Use the native HLS downloader instead of ffmpeg/avconv
405 if True, otherwise use ffmpeg/avconv if False, otherwise
406 use downloader suggested by extractor if None.
407 compat_opts: Compatibility options. See "Differences in default behavior".
408 The following options do not work when used through the API:
409 filename, abort-on-error, multistreams, no-live-chat,
410 no-clean-infojson, no-playlist-metafiles, no-keep-subs.
411 Refer __init__.py for their implementation
412
413 The following parameters are not used by YoutubeDL itself, they are used by
414 the downloader (see yt_dlp/downloader/common.py):
415 nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize,
416 max_filesize, test, noresizebuffer, retries, continuedl, noprogress, consoletitle,
417 xattr_set_filesize, external_downloader_args, hls_use_mpegts, http_chunk_size.
418
419 The following options are used by the post processors:
420 prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available,
421 otherwise prefer ffmpeg. (avconv support is deprecated)
422 ffmpeg_location: Location of the ffmpeg/avconv binary; either the path
423 to the binary or its containing directory.
424 postprocessor_args: A dictionary of postprocessor/executable keys (in lower case)
425 and a list of additional command-line arguments for the
426 postprocessor/executable. The dict can also have "PP+EXE" keys
427 which are used when the given exe is used by the given PP.
428 Use 'default' as the name for arguments to passed to all PP
429 For compatibility with youtube-dl, a single list of args
430 can also be used
431
432 The following options are used by the extractors:
433 extractor_retries: Number of times to retry for known errors
434 dynamic_mpd: Whether to process dynamic DASH manifests (default: True)
435 hls_split_discontinuity: Split HLS playlists to different formats at
436 discontinuities such as ad breaks (default: False)
437 extractor_args: A dictionary of arguments to be passed to the extractors.
438 See "EXTRACTOR ARGUMENTS" for details.
439 Eg: {'youtube': {'skip': ['dash', 'hls']}}
440 youtube_include_dash_manifest: Deprecated - Use extractor_args instead.
441 If True (default), DASH manifests and related
442 data will be downloaded and processed by extractor.
443 You can reduce network I/O by disabling it if you don't
444 care about DASH. (only for youtube)
445 youtube_include_hls_manifest: Deprecated - Use extractor_args instead.
446 If True (default), HLS manifests and related
447 data will be downloaded and processed by extractor.
448 You can reduce network I/O by disabling it if you don't
449 care about HLS. (only for youtube)
450 """
451
    # Fields that are coerced to numbers when rendering output templates
    # (see parse_outtmpl / output-template handling elsewhere in this class).
    _NUMERIC_FIELDS = set((
        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
        'timestamp', 'upload_year', 'upload_month', 'upload_day',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
        'playlist_index',
    ))

    # Class-level defaults. NOTE: every one of these is re-assigned with a
    # fresh per-instance object in __init__; they exist here only so the
    # attributes are defined even on partially-constructed objects.
    params = None                 # options dict (see class docstring)
    _ies = {}                     # ie_key -> InfoExtractor class or instance
    _pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}  # postprocessors per stage
    _printed_messages = set()     # messages already emitted via only_once
    _first_webpage_request = True
    _download_retcode = None
    _num_downloads = None
    _playlist_level = 0           # current depth of nested playlist extraction
    _playlist_urls = set()
    _screen_file = None
473
    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options."""
        if params is None:
            params = {}
        # Per-instance state (shadows the class-level defaults)
        self._ies = {}
        self._ies_instances = {}
        self._pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
        self._printed_messages = set()
        self._first_webpage_request = True
        self._post_hooks = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # Screen output goes to stderr when logtostderr is set (bool indexes the pair)
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = {
            # Default parameters
            'nocheckcertificate': False,
        }
        self.params.update(params)
        self.cache = Cache(self)

        if sys.version_info < (3, 6):
            self.report_warning(
                'Python version %d.%d is not supported! Please update to Python 3.6 or above' % sys.version_info[:2])

        if self.params.get('allow_unplayable_formats'):
            self.report_warning(
                'You have asked for unplayable formats to be listed/downloaded. '
                'This is a developer option intended for debugging. '
                'If you experience any issues while using this option, DO NOT open a bug report')

        def check_deprecated(param, option, suggestion):
            # Warn about a deprecated option; returns True iff the option was set
            if self.params.get(param) is not None:
                self.report_warning('%s is deprecated. Use %s instead' % (option, suggestion))
                return True
            return False

        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
        check_deprecated('useid', '--id', '-o "%(id)s.%(ext)s"')

        for msg in self.params.get('warnings', []):
            self.report_warning(msg)

        # Keep 'overwrites' and the legacy 'nooverwrites' key in sync
        if self.params.get('overwrites') is None:
            self.params.pop('overwrites', None)
        elif self.params.get('nooverwrites') is not None:
            # nooverwrites was unnecessarily changed to overwrites
            # in 0c3d0f51778b153f65c21906031c2e091fcfb641
            # This ensures compatibility with both keys
            self.params['overwrites'] = not self.params['nooverwrites']
        else:
            self.params['nooverwrites'] = not self.params['overwrites']

        if params.get('bidi_workaround', False):
            try:
                import pty
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    # bidiv not found; fall back to fribidi
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.platform != 'win32'
                and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
                and not params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        self.outtmpl_dict = self.parse_outtmpl()

        # Creating format selector here allows us to catch syntax errors before the extraction
        self.format_selector = (
            None if self.params.get('format') is None
            else self.build_format_selector(self.params['format']))

        self._setup_opener()

        """Preload the archive, if any is specified"""
        def preload_download_archive(fn):
            # Returns True iff the archive file was read into self.archive
            if fn is None:
                return False
            self.write_debug('Loading archive file %r\n' % fn)
            try:
                with locked_file(fn, 'r', encoding='utf-8') as archive_file:
                    for line in archive_file:
                        self.archive.add(line.strip())
            except IOError as ioe:
                # A missing archive file is not an error
                if ioe.errno != errno.ENOENT:
                    raise
                return False
            return True

        self.archive = set()
        preload_download_archive(self.params.get('download_archive'))

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        # Instantiate and register configured postprocessors
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_def = dict(pp_def_raw)
            when = pp_def.pop('when', 'post_process')
            pp_class = get_postprocessor(pp_def.pop('key'))
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp, when=when)

        for ph in self.params.get('post_hooks', []):
            self.add_post_hook(ph)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)

        register_socks_protocols()
615
616 def warn_if_short_id(self, argv):
617 # short YouTube ID starting with dash?
618 idxs = [
619 i for i, a in enumerate(argv)
620 if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
621 if idxs:
622 correct_argv = (
623 ['yt-dlp']
624 + [a for i, a in enumerate(argv) if i not in idxs]
625 + ['--'] + [argv[i] for i in idxs]
626 )
627 self.report_warning(
628 'Long argument string detected. '
629 'Use -- to separate parameters and URLs, like this:\n%s\n' %
630 args_to_str(correct_argv))
631
632 def add_info_extractor(self, ie):
633 """Add an InfoExtractor object to the end of the list."""
634 ie_key = ie.ie_key()
635 self._ies[ie_key] = ie
636 if not isinstance(ie, type):
637 self._ies_instances[ie_key] = ie
638 ie.set_downloader(self)
639
640 def _get_info_extractor_class(self, ie_key):
641 ie = self._ies.get(ie_key)
642 if ie is None:
643 ie = get_info_extractor(ie_key)
644 self.add_info_extractor(ie)
645 return ie
646
647 def get_info_extractor(self, ie_key):
648 """
649 Get an instance of an IE with name ie_key, it will try to get one from
650 the _ies list, if there's no instance it will create a new one and add
651 it to the extractor list.
652 """
653 ie = self._ies_instances.get(ie_key)
654 if ie is None:
655 ie = get_info_extractor(ie_key)()
656 self.add_info_extractor(ie)
657 return ie
658
659 def add_default_info_extractors(self):
660 """
661 Add the InfoExtractors returned by gen_extractors to the end of the list
662 """
663 for ie in gen_extractor_classes():
664 self.add_info_extractor(ie)
665
666 def add_post_processor(self, pp, when='post_process'):
667 """Add a PostProcessor object to the end of the chain."""
668 self._pps[when].append(pp)
669 pp.set_downloader(self)
670
671 def add_post_hook(self, ph):
672 """Add the post hook"""
673 self._post_hooks.append(ph)
674
675 def add_progress_hook(self, ph):
676 """Add the progress hook (currently only for the file downloader)"""
677 self._progress_hooks.append(ph)
678
679 def _bidi_workaround(self, message):
680 if not hasattr(self, '_output_channel'):
681 return message
682
683 assert hasattr(self, '_output_process')
684 assert isinstance(message, compat_str)
685 line_count = message.count('\n') + 1
686 self._output_process.stdin.write((message + '\n').encode('utf-8'))
687 self._output_process.stdin.flush()
688 res = ''.join(self._output_channel.readline().decode('utf-8')
689 for _ in range(line_count))
690 return res[:-len('\n')]
691
692 def _write_string(self, message, out=None, only_once=False):
693 if only_once:
694 if message in self._printed_messages:
695 return
696 self._printed_messages.add(message)
697 write_string(message, out=out, encoding=self.params.get('encoding'))
698
699 def to_stdout(self, message, skip_eol=False, quiet=False):
700 """Print message to stdout"""
701 if self.params.get('logger'):
702 self.params['logger'].debug(message)
703 elif not quiet or self.params.get('verbose'):
704 self._write_string(
705 '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')),
706 self._err_file if quiet else self._screen_file)
707
708 def to_stderr(self, message, only_once=False):
709 """Print message to stderr"""
710 assert isinstance(message, compat_str)
711 if self.params.get('logger'):
712 self.params['logger'].error(message)
713 else:
714 self._write_string('%s\n' % self._bidi_workaround(message), self._err_file, only_once=only_once)
715
716 def to_console_title(self, message):
717 if not self.params.get('consoletitle', False):
718 return
719 if compat_os_name == 'nt':
720 if ctypes.windll.kernel32.GetConsoleWindow():
721 # c_wchar_p() might not be necessary if `message` is
722 # already of type unicode()
723 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
724 elif 'TERM' in os.environ:
725 self._write_string('\033]0;%s\007' % message, self._screen_file)
726
727 def save_console_title(self):
728 if not self.params.get('consoletitle', False):
729 return
730 if self.params.get('simulate'):
731 return
732 if compat_os_name != 'nt' and 'TERM' in os.environ:
733 # Save the title on stack
734 self._write_string('\033[22;0t', self._screen_file)
735
736 def restore_console_title(self):
737 if not self.params.get('consoletitle', False):
738 return
739 if self.params.get('simulate'):
740 return
741 if compat_os_name != 'nt' and 'TERM' in os.environ:
742 # Restore the title from stack
743 self._write_string('\033[23;0t', self._screen_file)
744
745 def __enter__(self):
746 self.save_console_title()
747 return self
748
749 def __exit__(self, *args):
750 self.restore_console_title()
751
752 if self.params.get('cookiefile') is not None:
753 self.cookiejar.save(ignore_discard=True, ignore_expires=True)
754
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # Prefer the wrapped exception's own traceback when available
                    # (e.g. ExtractorError carries one in .exc_info)
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    # Not inside an except block: show where we were called from
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            if tb:
                self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Re-raise as DownloadError, chaining the innermost exc_info we have
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        # Errors are ignored: record failure via the process return code
        self._download_retcode = 1
785
786 def to_screen(self, message, skip_eol=False):
787 """Print message to stdout if not in quiet mode"""
788 self.to_stdout(
789 message, skip_eol, quiet=self.params.get('quiet', False))
790
791 def report_warning(self, message, only_once=False):
792 '''
793 Print the message to stderr, it will be prefixed with 'WARNING:'
794 If stderr is a tty file the 'WARNING:' will be colored
795 '''
796 if self.params.get('logger') is not None:
797 self.params['logger'].warning(message)
798 else:
799 if self.params.get('no_warnings'):
800 return
801 if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
802 _msg_header = '\033[0;33mWARNING:\033[0m'
803 else:
804 _msg_header = 'WARNING:'
805 warning_message = '%s %s' % (_msg_header, message)
806 self.to_stderr(warning_message, only_once)
807
808 def report_error(self, message, tb=None):
809 '''
810 Do the same as trouble, but prefixes the message with 'ERROR:', colored
811 in red if stderr is a tty file.
812 '''
813 if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
814 _msg_header = '\033[0;31mERROR:\033[0m'
815 else:
816 _msg_header = 'ERROR:'
817 error_message = '%s %s' % (_msg_header, message)
818 self.trouble(error_message, tb)
819
820 def write_debug(self, message, only_once=False):
821 '''Log debug message or Print message to stderr'''
822 if not self.params.get('verbose', False):
823 return
824 message = '[debug] %s' % message
825 if self.params.get('logger'):
826 self.params['logger'].debug(message)
827 else:
828 self.to_stderr(message, only_once)
829
830 def report_file_already_downloaded(self, file_name):
831 """Report file has already been fully downloaded."""
832 try:
833 self.to_screen('[download] %s has already been downloaded' % file_name)
834 except UnicodeEncodeError:
835 self.to_screen('[download] The file has already been downloaded')
836
837 def report_file_delete(self, file_name):
838 """Report that existing file will be deleted."""
839 try:
840 self.to_screen('Deleting existing file %s' % file_name)
841 except UnicodeEncodeError:
842 self.to_screen('Deleting existing file')
843
844 def raise_no_formats(self, info, forced=False):
845 has_drm = info.get('__has_drm')
846 msg = 'This video is DRM protected' if has_drm else 'No video formats found!'
847 expected = self.params.get('ignore_no_formats_error')
848 if forced or not expected:
849 raise ExtractorError(msg, video_id=info['id'], ie=info['extractor'],
850 expected=has_drm or expected)
851 else:
852 self.report_warning(msg)
853
854 def parse_outtmpl(self):
855 outtmpl_dict = self.params.get('outtmpl', {})
856 if not isinstance(outtmpl_dict, dict):
857 outtmpl_dict = {'default': outtmpl_dict}
858 outtmpl_dict.update({
859 k: v for k, v in DEFAULT_OUTTMPL.items()
860 if not outtmpl_dict.get(k)})
861 for key, val in outtmpl_dict.items():
862 if isinstance(val, bytes):
863 self.report_warning(
864 'Parameter outtmpl is bytes, but should be a unicode string. '
865 'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.')
866 return outtmpl_dict
867
868 def get_output_path(self, dir_type='', filename=None):
869 paths = self.params.get('paths', {})
870 assert isinstance(paths, dict)
871 path = os.path.join(
872 expand_path(paths.get('home', '').strip()),
873 expand_path(paths.get(dir_type, '').strip()) if dir_type else '',
874 filename or '')
875
876 # Temporary fix for #4787
877 # 'Treat' all problem characters by passing filename through preferredencoding
878 # to workaround encoding issues with subprocess on python2 @ Windows
879 if sys.version_info < (3, 0) and sys.platform == 'win32':
880 path = encodeFilename(path, True).decode(preferredencoding())
881 return sanitize_path(path, force=self.params.get('windowsfilenames'))
882
883 @staticmethod
884 def _outtmpl_expandpath(outtmpl):
885 # expand_path translates '%%' into '%' and '$$' into '$'
886 # correspondingly that is not what we want since we need to keep
887 # '%%' intact for template dict substitution step. Working around
888 # with boundary-alike separator hack.
889 sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
890 outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
891
892 # outtmpl should be expand_path'ed before template dict substitution
893 # because meta fields may contain env variables we don't want to
894 # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
895 # title "Hello $PATH", we don't want `$PATH` to be expanded.
896 return expand_path(outtmpl).replace(sep, '')
897
898 @staticmethod
899 def escape_outtmpl(outtmpl):
900 ''' Escape any remaining strings like %s, %abc% etc. '''
901 return re.sub(
902 STR_FORMAT_RE_TMPL.format('', '(?![%(\0])'),
903 lambda mobj: ('' if mobj.group('has_key') else '%') + mobj.group(0),
904 outtmpl)
905
906 @classmethod
907 def validate_outtmpl(cls, outtmpl):
908 ''' @return None or Exception object '''
909 outtmpl = re.sub(
910 STR_FORMAT_RE_TMPL.format('[^)]*', '[ljq]'),
911 lambda mobj: f'{mobj.group(0)[:-1]}s',
912 cls._outtmpl_expandpath(outtmpl))
913 try:
914 cls.escape_outtmpl(outtmpl) % collections.defaultdict(int)
915 return None
916 except ValueError as err:
917 return err
918
    def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None):
        """ Make the template and info_dict suitable for substitution : ydl.outtmpl_escape(outtmpl) % info_dict """
        # sanitize, if given, is a callable (field_name, value) -> sanitized value
        info_dict.setdefault('epoch', int(time.time()))  # keep epoch consistent once set

        info_dict = dict(info_dict)  # Do not sanitize so as not to consume LazyList
        for key in ('__original_infodict', '__postprocessors'):
            info_dict.pop(key, None)
        info_dict['duration_string'] = (  # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
            formatSeconds(info_dict['duration'], '-' if sanitize else ':')
            if info_dict.get('duration', None) is not None
            else None)
        info_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
        if info_dict.get('resolution') is None:
            info_dict['resolution'] = self.format_resolution(info_dict, default=None)

        # For fields playlist_index and autonumber convert all occurrences
        # of %(field)s to %(field)0Nd for backward compatibility
        field_size_compat_map = {
            'playlist_index': len(str(info_dict.get('_last_playlist_index') or '')),
            'autonumber': self.params.get('autonumber_size') or 5,
        }

        TMPL_DICT = {}
        # Matches %(key)FMT including the custom l/j/q conversions
        EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljq]'))
        MATH_FUNCTIONS = {
            '+': float.__add__,
            '-': float.__sub__,
        }
        # Field is of the form key1.key2...
        # where keys (except first) can be string, int or slice
        FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)')
        MATH_FIELD_RE = r'''{field}|{num}'''.format(field=FIELD_RE, num=r'-?\d+(?:.\d+)?')
        MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys()))
        # Parses the inside of %(...): negation, dotted field, math chain,
        # optional '>strftime' and optional '|default'
        INTERNAL_FORMAT_RE = re.compile(r'''(?x)
            (?P<negate>-)?
            (?P<fields>{field})
            (?P<maths>(?:{math_op}{math_field})*)
            (?:>(?P<strf_format>.+?))?
            (?:\|(?P<default>.*?))?
            $'''.format(field=FIELD_RE, math_op=MATH_OPERATORS_RE, math_field=MATH_FIELD_RE))

        def _traverse_infodict(k):
            k = k.split('.')
            if k[0] == '':
                k.pop(0)
            return traverse_obj(info_dict, k, is_user_input=True, traverse_string=True)

        def get_value(mdict):
            # Object traversal
            value = _traverse_infodict(mdict['fields'])
            # Negative
            if mdict['negate']:
                value = float_or_none(value)
                if value is not None:
                    value *= -1
            # Do maths
            offset_key = mdict['maths']
            if offset_key:
                value = float_or_none(value)
                operator = None
                # Alternate between consuming an operator and an operand
                while offset_key:
                    item = re.match(
                        MATH_FIELD_RE if operator else MATH_OPERATORS_RE,
                        offset_key).group(0)
                    offset_key = offset_key[len(item):]
                    if operator is None:
                        operator = MATH_FUNCTIONS[item]
                        continue
                    item, multiplier = (item[1:], -1) if item[0] == '-' else (item, 1)
                    offset = float_or_none(item)
                    if offset is None:
                        # Operand is a field name rather than a literal number
                        offset = float_or_none(_traverse_infodict(item))
                    try:
                        value = operator(value, multiplier * offset)
                    except (TypeError, ZeroDivisionError):
                        return None
                    operator = None
            # Datetime formatting
            if mdict['strf_format']:
                value = strftime_or_none(value, mdict['strf_format'])

            return value

        na = self.params.get('outtmpl_na_placeholder', 'NA')

        def _dumpjson_default(obj):
            if isinstance(obj, (set, LazyList)):
                return list(obj)
            raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')

        def create_key(outer_mobj):
            # re.sub callback: resolves one %(...)X occurrence and records its
            # value in TMPL_DICT under a NUL-mangled key
            if not outer_mobj.group('has_key'):
                return f'%{outer_mobj.group(0)}'
            key = outer_mobj.group('key')
            mobj = re.match(INTERNAL_FORMAT_RE, key)
            if mobj is None:
                value, default, mobj = None, na, {'fields': ''}
            else:
                mobj = mobj.groupdict()
                default = mobj['default'] if mobj['default'] is not None else na
                value = get_value(mobj)

            fmt = outer_mobj.group('format')
            if fmt == 's' and value is not None and key in field_size_compat_map.keys():
                fmt = '0{:d}d'.format(field_size_compat_map[key])

            value = default if value is None else value

            str_fmt = f'{fmt[:-1]}s'
            if fmt[-1] == 'l':
                value, fmt = ', '.join(variadic(value)), str_fmt
            elif fmt[-1] == 'j':
                value, fmt = json.dumps(value, default=_dumpjson_default), str_fmt
            elif fmt[-1] == 'q':
                value, fmt = compat_shlex_quote(str(value)), str_fmt
            elif fmt[-1] == 'c':
                value = str(value)
                # NOTE(review): str(value) can never be None, so this fallback
                # branch looks unreachable — confirm before relying on it
                if value is None:
                    value, fmt = default, 's'
                else:
                    value = value[0]
            elif fmt[-1] not in 'rs':  # numeric
                value = float_or_none(value)
                if value is None:
                    value, fmt = default, 's'

            if sanitize:
                if fmt[-1] == 'r':
                    # If value is an object, sanitize might convert it to a string
                    # So we convert it to repr first
                    value, fmt = repr(value), str_fmt
                if fmt[-1] in 'csr':
                    value = sanitize(mobj['fields'].split('.')[-1], value)

            # Mangle the key with NULs so the escaped template cannot collide
            key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format'))
            TMPL_DICT[key] = value
            return '{prefix}%({key}){fmt}'.format(key=key, fmt=fmt, prefix=outer_mobj.group('prefix'))

        return EXTERNAL_FORMAT_RE.sub(create_key, outtmpl), TMPL_DICT
1058
1059 def _prepare_filename(self, info_dict, tmpl_type='default'):
1060 try:
1061 sanitize = lambda k, v: sanitize_filename(
1062 compat_str(v),
1063 restricted=self.params.get('restrictfilenames'),
1064 is_id=(k == 'id' or k.endswith('_id')))
1065 outtmpl = self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default'])
1066 outtmpl, template_dict = self.prepare_outtmpl(outtmpl, info_dict, sanitize)
1067 outtmpl = self.escape_outtmpl(self._outtmpl_expandpath(outtmpl))
1068 filename = outtmpl % template_dict
1069
1070 force_ext = OUTTMPL_TYPES.get(tmpl_type)
1071 if force_ext is not None:
1072 filename = replace_extension(filename, force_ext, info_dict.get('ext'))
1073
1074 # https://github.com/blackjack4494/youtube-dlc/issues/85
1075 trim_file_name = self.params.get('trim_file_name', False)
1076 if trim_file_name:
1077 fn_groups = filename.rsplit('.')
1078 ext = fn_groups[-1]
1079 sub_ext = ''
1080 if len(fn_groups) > 2:
1081 sub_ext = fn_groups[-2]
1082 filename = '.'.join(filter(None, [fn_groups[0][:trim_file_name], sub_ext, ext]))
1083
1084 return filename
1085 except ValueError as err:
1086 self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
1087 return None
1088
1089 def prepare_filename(self, info_dict, dir_type='', warn=False):
1090 """Generate the output filename."""
1091
1092 filename = self._prepare_filename(info_dict, dir_type or 'default')
1093
1094 if warn:
1095 if not self.params.get('paths'):
1096 pass
1097 elif filename == '-':
1098 self.report_warning('--paths is ignored when an outputting to stdout', only_once=True)
1099 elif os.path.isabs(filename):
1100 self.report_warning('--paths is ignored since an absolute path is given in output template', only_once=True)
1101 if filename == '-' or not filename:
1102 return filename
1103
1104 return self.get_output_path(dir_type, filename)
1105
    def _match_entry(self, info_dict, incomplete=False, silent=False):
        """ Returns None if the file should be downloaded """
        # Otherwise returns a human-readable skip reason; may raise
        # ExistingVideoReached/RejectedVideoReached if the matching
        # --break-on-* option is enabled.

        video_title = info_dict.get('title', info_dict.get('id', 'video'))

        def check_filter():
            # Returns a skip reason string, or None to accept the entry
            if 'title' in info_dict:
                # This can happen when we're just evaluating the playlist
                title = info_dict['title']
                matchtitle = self.params.get('matchtitle', False)
                if matchtitle:
                    if not re.search(matchtitle, title, re.IGNORECASE):
                        return '"' + title + '" title did not match pattern "' + matchtitle + '"'
                rejecttitle = self.params.get('rejecttitle', False)
                if rejecttitle:
                    if re.search(rejecttitle, title, re.IGNORECASE):
                        return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
            date = info_dict.get('upload_date')
            if date is not None:
                dateRange = self.params.get('daterange', DateRange())
                if date not in dateRange:
                    return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
            view_count = info_dict.get('view_count')
            if view_count is not None:
                min_views = self.params.get('min_views')
                if min_views is not None and view_count < min_views:
                    return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
                max_views = self.params.get('max_views')
                if max_views is not None and view_count > max_views:
                    return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
            if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
                return 'Skipping "%s" because it is age restricted' % video_title

            match_filter = self.params.get('match_filter')
            if match_filter is not None:
                try:
                    ret = match_filter(info_dict, incomplete=incomplete)
                except TypeError:
                    # For backward compatibility
                    ret = None if incomplete else match_filter(info_dict)
                if ret is not None:
                    return ret
            return None

        if self.in_download_archive(info_dict):
            reason = '%s has already been recorded in the archive' % video_title
            break_opt, break_err = 'break_on_existing', ExistingVideoReached
        else:
            reason = check_filter()
            break_opt, break_err = 'break_on_reject', RejectedVideoReached
        if reason is not None:
            if not silent:
                self.to_screen('[download] ' + reason)
            # Abort the whole run when the corresponding --break-* option is set
            if self.params.get(break_opt, False):
                raise break_err()
        return reason
1162
1163 @staticmethod
1164 def add_extra_info(info_dict, extra_info):
1165 '''Set the keys from extra_info in info dict if they are missing'''
1166 for key, value in extra_info.items():
1167 info_dict.setdefault(key, value)
1168
    def extract_info(self, url, download=True, ie_key=None, extra_info=None,
                     process=True, force_generic_extractor=False):
        """
        Return a list with a dictionary for each video extracted.

        Arguments:
        url -- URL to extract

        Keyword arguments:
        download -- whether to download videos during extraction
        ie_key -- extractor key hint
        extra_info -- dictionary containing the extra values to add to each result
        process -- whether to resolve all unresolved references (URLs, playlist items),
                   must be True for download to work.
        force_generic_extractor -- force using the generic extractor
        """

        if extra_info is None:
            extra_info = {}

        if not ie_key and force_generic_extractor:
            ie_key = 'Generic'

        if ie_key:
            # Only consider the hinted extractor
            ies = {ie_key: self._get_info_extractor_class(ie_key)}
        else:
            ies = self._ies

        for ie_key, ie in ies.items():
            if not ie.suitable(url):
                continue

            if not ie.working():
                self.report_warning('The program functionality for this site has been marked as broken, '
                                    'and will probably not work.')

            temp_id = ie.get_temp_id(url)
            if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
                self.to_screen("[%s] %s: has already been recorded in archive" % (
                    ie_key, temp_id))
                # 'break' skips the for-else below, so an archive hit returns
                # None without reporting "no suitable InfoExtractor"
                break
            return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process)
        else:
            # for-else: no extractor accepted the URL
            self.report_error('no suitable InfoExtractor for URL %s' % url)
1213
    def __handle_extraction_exceptions(func):
        # Decorator (applied inside the class body, so it receives the plain
        # function) that converts extraction-time exceptions into user-facing
        # error/warning reporting; control-flow exceptions are re-raised as-is

        def wrapper(self, *args, **kwargs):
            try:
                return func(self, *args, **kwargs)
            except GeoRestrictedError as e:
                msg = e.msg
                if e.countries:
                    msg += '\nThis video is available in %s.' % ', '.join(
                        map(ISO3166Utils.short2full, e.countries))
                msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
                self.report_error(msg)
            except ExtractorError as e:  # An error we somewhat expected
                self.report_error(compat_str(e), e.format_traceback())
            except ThrottledDownload:
                self.to_stderr('\r')
                self.report_warning('The download speed is below throttle limit. Re-extracting data')
                # Retry the whole extraction by recursing into the wrapper
                return wrapper(self, *args, **kwargs)
            except (MaxDownloadsReached, ExistingVideoReached, RejectedVideoReached, LazyList.IndexError):
                # These control the overall run and must propagate unchanged
                raise
            except Exception as e:
                if self.params.get('ignoreerrors', False):
                    self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
                else:
                    raise
        return wrapper
1240
1241 @__handle_extraction_exceptions
1242 def __extract_info(self, url, ie, download, extra_info, process):
1243 ie_result = ie.extract(url)
1244 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
1245 return
1246 if isinstance(ie_result, list):
1247 # Backwards compatibility: old IE result format
1248 ie_result = {
1249 '_type': 'compat_list',
1250 'entries': ie_result,
1251 }
1252 if extra_info.get('original_url'):
1253 ie_result.setdefault('original_url', extra_info['original_url'])
1254 self.add_default_extra_info(ie_result, ie, url)
1255 if process:
1256 return self.process_ie_result(ie_result, download, extra_info)
1257 else:
1258 return ie_result
1259
1260 def add_default_extra_info(self, ie_result, ie, url):
1261 if url is not None:
1262 self.add_extra_info(ie_result, {
1263 'webpage_url': url,
1264 'original_url': url,
1265 'webpage_url_basename': url_basename(url),
1266 })
1267 if ie is not None:
1268 self.add_extra_info(ie_result, {
1269 'extractor': ie.IE_NAME,
1270 'extractor_key': ie.ie_key(),
1271 })
1272
1273 def process_ie_result(self, ie_result, download=True, extra_info=None):
1274 """
1275 Take the result of the ie(may be modified) and resolve all unresolved
1276 references (URLs, playlist items).
1277
1278 It will also download the videos if 'download'.
1279 Returns the resolved ie_result.
1280 """
1281 if extra_info is None:
1282 extra_info = {}
1283 result_type = ie_result.get('_type', 'video')
1284
1285 if result_type in ('url', 'url_transparent'):
1286 ie_result['url'] = sanitize_url(ie_result['url'])
1287 if ie_result.get('original_url'):
1288 extra_info.setdefault('original_url', ie_result['original_url'])
1289
1290 extract_flat = self.params.get('extract_flat', False)
1291 if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
1292 or extract_flat is True):
1293 info_copy = ie_result.copy()
1294 ie = try_get(ie_result.get('ie_key'), self.get_info_extractor)
1295 if not ie_result.get('id'):
1296 info_copy['id'] = ie.get_temp_id(ie_result['url'])
1297 self.add_default_extra_info(info_copy, ie, ie_result['url'])
1298 self.add_extra_info(info_copy, extra_info)
1299 self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True)
1300 if self.params.get('force_write_download_archive', False):
1301 self.record_download_archive(info_copy)
1302 return ie_result
1303
1304 if result_type == 'video':
1305 self.add_extra_info(ie_result, extra_info)
1306 ie_result = self.process_video_result(ie_result, download=download)
1307 additional_urls = (ie_result or {}).get('additional_urls')
1308 if additional_urls:
1309 # TODO: Improve MetadataParserPP to allow setting a list
1310 if isinstance(additional_urls, compat_str):
1311 additional_urls = [additional_urls]
1312 self.to_screen(
1313 '[info] %s: %d additional URL(s) requested' % (ie_result['id'], len(additional_urls)))
1314 self.write_debug('Additional URLs: "%s"' % '", "'.join(additional_urls))
1315 ie_result['additional_entries'] = [
1316 self.extract_info(
1317 url, download, extra_info,
1318 force_generic_extractor=self.params.get('force_generic_extractor'))
1319 for url in additional_urls
1320 ]
1321 return ie_result
1322 elif result_type == 'url':
1323 # We have to add extra_info to the results because it may be
1324 # contained in a playlist
1325 return self.extract_info(
1326 ie_result['url'], download,
1327 ie_key=ie_result.get('ie_key'),
1328 extra_info=extra_info)
1329 elif result_type == 'url_transparent':
1330 # Use the information from the embedding page
1331 info = self.extract_info(
1332 ie_result['url'], ie_key=ie_result.get('ie_key'),
1333 extra_info=extra_info, download=False, process=False)
1334
1335 # extract_info may return None when ignoreerrors is enabled and
1336 # extraction failed with an error, don't crash and return early
1337 # in this case
1338 if not info:
1339 return info
1340
1341 force_properties = dict(
1342 (k, v) for k, v in ie_result.items() if v is not None)
1343 for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
1344 if f in force_properties:
1345 del force_properties[f]
1346 new_result = info.copy()
1347 new_result.update(force_properties)
1348
1349 # Extracted info may not be a video result (i.e.
1350 # info.get('_type', 'video') != video) but rather an url or
1351 # url_transparent. In such cases outer metadata (from ie_result)
1352 # should be propagated to inner one (info). For this to happen
1353 # _type of info should be overridden with url_transparent. This
1354 # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
1355 if new_result.get('_type') == 'url':
1356 new_result['_type'] = 'url_transparent'
1357
1358 return self.process_ie_result(
1359 new_result, download=download, extra_info=extra_info)
1360 elif result_type in ('playlist', 'multi_video'):
1361 # Protect from infinite recursion due to recursively nested playlists
1362 # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
1363 webpage_url = ie_result['webpage_url']
1364 if webpage_url in self._playlist_urls:
1365 self.to_screen(
1366 '[download] Skipping already downloaded playlist: %s'
1367 % ie_result.get('title') or ie_result.get('id'))
1368 return
1369
1370 self._playlist_level += 1
1371 self._playlist_urls.add(webpage_url)
1372 self._sanitize_thumbnails(ie_result)
1373 try:
1374 return self.__process_playlist(ie_result, download)
1375 finally:
1376 self._playlist_level -= 1
1377 if not self._playlist_level:
1378 self._playlist_urls.clear()
1379 elif result_type == 'compat_list':
1380 self.report_warning(
1381 'Extractor %s returned a compat_list result. '
1382 'It needs to be updated.' % ie_result.get('extractor'))
1383
1384 def _fixup(r):
1385 self.add_extra_info(r, {
1386 'extractor': ie_result['extractor'],
1387 'webpage_url': ie_result['webpage_url'],
1388 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1389 'extractor_key': ie_result['extractor_key'],
1390 })
1391 return r
1392 ie_result['entries'] = [
1393 self.process_ie_result(_fixup(r), download, extra_info)
1394 for r in ie_result['entries']
1395 ]
1396 return ie_result
1397 else:
1398 raise Exception('Invalid result type: %s' % result_type)
1399
1400 def _ensure_dir_exists(self, path):
1401 return make_dir(path, self.report_error)
1402
1403 def __process_playlist(self, ie_result, download):
1404 # We process each entry in the playlist
1405 playlist = ie_result.get('title') or ie_result.get('id')
1406 self.to_screen('[download] Downloading playlist: %s' % playlist)
1407
1408 if 'entries' not in ie_result:
1409 raise EntryNotInPlaylist()
1410 incomplete_entries = bool(ie_result.get('requested_entries'))
1411 if incomplete_entries:
1412 def fill_missing_entries(entries, indexes):
1413 ret = [None] * max(*indexes)
1414 for i, entry in zip(indexes, entries):
1415 ret[i - 1] = entry
1416 return ret
1417 ie_result['entries'] = fill_missing_entries(ie_result['entries'], ie_result['requested_entries'])
1418
1419 playlist_results = []
1420
1421 playliststart = self.params.get('playliststart', 1)
1422 playlistend = self.params.get('playlistend')
1423 # For backwards compatibility, interpret -1 as whole list
1424 if playlistend == -1:
1425 playlistend = None
1426
1427 playlistitems_str = self.params.get('playlist_items')
1428 playlistitems = None
1429 if playlistitems_str is not None:
1430 def iter_playlistitems(format):
1431 for string_segment in format.split(','):
1432 if '-' in string_segment:
1433 start, end = string_segment.split('-')
1434 for item in range(int(start), int(end) + 1):
1435 yield int(item)
1436 else:
1437 yield int(string_segment)
1438 playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
1439
1440 ie_entries = ie_result['entries']
1441 msg = (
1442 'Downloading %d videos' if not isinstance(ie_entries, list)
1443 else 'Collected %d videos; downloading %%d of them' % len(ie_entries))
1444
1445 if isinstance(ie_entries, list):
1446 def get_entry(i):
1447 return ie_entries[i - 1]
1448 else:
1449 if not isinstance(ie_entries, PagedList):
1450 ie_entries = LazyList(ie_entries)
1451
1452 def get_entry(i):
1453 return YoutubeDL.__handle_extraction_exceptions(
1454 lambda self, i: ie_entries[i - 1]
1455 )(self, i)
1456
1457 entries = []
1458 for i in playlistitems or itertools.count(playliststart):
1459 if playlistitems is None and playlistend is not None and playlistend < i:
1460 break
1461 entry = None
1462 try:
1463 entry = get_entry(i)
1464 if entry is None:
1465 raise EntryNotInPlaylist()
1466 except (IndexError, EntryNotInPlaylist):
1467 if incomplete_entries:
1468 raise EntryNotInPlaylist()
1469 elif not playlistitems:
1470 break
1471 entries.append(entry)
1472 try:
1473 if entry is not None:
1474 self._match_entry(entry, incomplete=True, silent=True)
1475 except (ExistingVideoReached, RejectedVideoReached):
1476 break
1477 ie_result['entries'] = entries
1478
1479 # Save playlist_index before re-ordering
1480 entries = [
1481 ((playlistitems[i - 1] if playlistitems else i + playliststart - 1), entry)
1482 for i, entry in enumerate(entries, 1)
1483 if entry is not None]
1484 n_entries = len(entries)
1485
1486 if not playlistitems and (playliststart or playlistend):
1487 playlistitems = list(range(playliststart, playliststart + n_entries))
1488 ie_result['requested_entries'] = playlistitems
1489
1490 if self.params.get('allow_playlist_files', True):
1491 ie_copy = {
1492 'playlist': playlist,
1493 'playlist_id': ie_result.get('id'),
1494 'playlist_title': ie_result.get('title'),
1495 'playlist_uploader': ie_result.get('uploader'),
1496 'playlist_uploader_id': ie_result.get('uploader_id'),
1497 'playlist_index': 0,
1498 }
1499 ie_copy.update(dict(ie_result))
1500
1501 if self.params.get('writeinfojson', False):
1502 infofn = self.prepare_filename(ie_copy, 'pl_infojson')
1503 if not self._ensure_dir_exists(encodeFilename(infofn)):
1504 return
1505 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(infofn)):
1506 self.to_screen('[info] Playlist metadata is already present')
1507 else:
1508 self.to_screen('[info] Writing playlist metadata as JSON to: ' + infofn)
1509 try:
1510 write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn)
1511 except (OSError, IOError):
1512 self.report_error('Cannot write playlist metadata to JSON file ' + infofn)
1513
1514 # TODO: This should be passed to ThumbnailsConvertor if necessary
1515 self._write_thumbnails(ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail'))
1516
1517 if self.params.get('writedescription', False):
1518 descfn = self.prepare_filename(ie_copy, 'pl_description')
1519 if not self._ensure_dir_exists(encodeFilename(descfn)):
1520 return
1521 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(descfn)):
1522 self.to_screen('[info] Playlist description is already present')
1523 elif ie_result.get('description') is None:
1524 self.report_warning('There\'s no playlist description to write.')
1525 else:
1526 try:
1527 self.to_screen('[info] Writing playlist description to: ' + descfn)
1528 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1529 descfile.write(ie_result['description'])
1530 except (OSError, IOError):
1531 self.report_error('Cannot write playlist description file ' + descfn)
1532 return
1533
1534 if self.params.get('playlistreverse', False):
1535 entries = entries[::-1]
1536 if self.params.get('playlistrandom', False):
1537 random.shuffle(entries)
1538
1539 x_forwarded_for = ie_result.get('__x_forwarded_for_ip')
1540
1541 self.to_screen('[%s] playlist %s: %s' % (ie_result['extractor'], playlist, msg % n_entries))
1542 failures = 0
1543 max_failures = self.params.get('skip_playlist_after_errors') or float('inf')
1544 for i, entry_tuple in enumerate(entries, 1):
1545 playlist_index, entry = entry_tuple
1546 if 'playlist-index' in self.params.get('compat_opts', []):
1547 playlist_index = playlistitems[i - 1] if playlistitems else i + playliststart - 1
1548 self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
1549 # This __x_forwarded_for_ip thing is a bit ugly but requires
1550 # minimal changes
1551 if x_forwarded_for:
1552 entry['__x_forwarded_for_ip'] = x_forwarded_for
1553 extra = {
1554 'n_entries': n_entries,
1555 '_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries),
1556 'playlist_index': playlist_index,
1557 'playlist_autonumber': i,
1558 'playlist': playlist,
1559 'playlist_id': ie_result.get('id'),
1560 'playlist_title': ie_result.get('title'),
1561 'playlist_uploader': ie_result.get('uploader'),
1562 'playlist_uploader_id': ie_result.get('uploader_id'),
1563 'extractor': ie_result['extractor'],
1564 'webpage_url': ie_result['webpage_url'],
1565 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1566 'extractor_key': ie_result['extractor_key'],
1567 }
1568
1569 if self._match_entry(entry, incomplete=True) is not None:
1570 continue
1571
1572 entry_result = self.__process_iterable_entry(entry, download, extra)
1573 if not entry_result:
1574 failures += 1
1575 if failures >= max_failures:
1576 self.report_error(
1577 'Skipping the remaining entries in playlist "%s" since %d items failed extraction' % (playlist, failures))
1578 break
1579 # TODO: skip failed (empty) entries?
1580 playlist_results.append(entry_result)
1581 ie_result['entries'] = playlist_results
1582 self.to_screen('[download] Finished downloading playlist: %s' % playlist)
1583 return ie_result
1584
1585 @__handle_extraction_exceptions
1586 def __process_iterable_entry(self, entry, download, extra_info):
1587 return self.process_ie_result(
1588 entry, download=download, extra_info=extra_info)
1589
    def _build_format_filter(self, filter_spec):
        " Returns a function to filter the formats according to the filter_spec "

        OPERATORS = {
            '<': operator.lt,
            '<=': operator.le,
            '>': operator.gt,
            '>=': operator.ge,
            '=': operator.eq,
            '!=': operator.ne,
        }
        # Numeric comparisons, e.g. "height>=720" or "filesize>100M"
        operator_rex = re.compile(r'''(?x)\s*
            (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)\s*
            (?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
            (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s*
            ''' % '|'.join(map(re.escape, OPERATORS.keys())))
        m = operator_rex.fullmatch(filter_spec)
        if m:
            try:
                comparison_value = int(m.group('value'))
            except ValueError:
                # Not a plain integer: accept human-readable sizes (500K, 0.5MiB, ...)
                comparison_value = parse_filesize(m.group('value'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('value') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid value %r in format specification %r' % (
                            m.group('value'), filter_spec))
            op = OPERATORS[m.group('op')]

        if not m:
            # Fall back to string comparisons, e.g. "ext=mp4", with optional '!' negation
            STR_OPERATORS = {
                '=': operator.eq,
                '^=': lambda attr, value: attr.startswith(value),
                '$=': lambda attr, value: attr.endswith(value),
                '*=': lambda attr, value: value in attr,
            }
            str_operator_rex = re.compile(r'''(?x)\s*
                (?P<key>[a-zA-Z0-9._-]+)\s*
                (?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
                (?P<value>[a-zA-Z0-9._-]+)\s*
                ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
            m = str_operator_rex.fullmatch(filter_spec)
            if m:
                comparison_value = m.group('value')
                str_op = STR_OPERATORS[m.group('op')]
                if m.group('negation'):
                    op = lambda attr, value: not str_op(attr, value)
                else:
                    op = str_op

        if not m:
            raise SyntaxError('Invalid filter specification %r' % filter_spec)

        def _filter(f):
            # Closure over m/op/comparison_value from whichever regex matched
            actual_value = f.get(m.group('key'))
            if actual_value is None:
                # A trailing '?' in the spec keeps formats that lack the field
                return m.group('none_inclusive')
            return op(actual_value, comparison_value)
        return _filter
1650
1651 def _default_format_spec(self, info_dict, download=True):
1652
1653 def can_merge():
1654 merger = FFmpegMergerPP(self)
1655 return merger.available and merger.can_merge()
1656
1657 prefer_best = (
1658 not self.params.get('simulate')
1659 and download
1660 and (
1661 not can_merge()
1662 or info_dict.get('is_live', False)
1663 or self.outtmpl_dict['default'] == '-'))
1664 compat = (
1665 prefer_best
1666 or self.params.get('allow_multiple_audio_streams', False)
1667 or 'format-spec' in self.params.get('compat_opts', []))
1668
1669 return (
1670 'best/bestvideo+bestaudio' if prefer_best
1671 else 'bestvideo*+bestaudio/best' if not compat
1672 else 'bestvideo+bestaudio/best')
1673
    def build_format_selector(self, format_spec):
        """Compile *format_spec* (e.g. 'bestvideo+bestaudio/best') into a
        selector function that maps a context dict with keys 'formats' and
        'incomplete_formats' to the format dicts that should be downloaded.

        The spec is tokenized with the stdlib tokenizer, parsed into a tree
        of FormatSelector nodes and then compiled into nested generator
        functions.
        """
        def syntax_error(note, start):
            # start is a tokenizer (row, col) pair; the caret points at col
            message = (
                'Invalid format specification: '
                '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
            return SyntaxError(message)

        # Node types of the parsed selector tree
        PICKFIRST = 'PICKFIRST'
        MERGE = 'MERGE'
        SINGLE = 'SINGLE'
        GROUP = 'GROUP'
        FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])

        allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False),
                                  'video': self.params.get('allow_multiple_video_streams', False)}

        check_formats = self.params.get('check_formats')

        def _parse_filter(tokens):
            # Collect everything up to the closing ']' as one filter string
            filter_parts = []
            for type, string, start, _, _ in tokens:
                if type == tokenize.OP and string == ']':
                    return ''.join(filter_parts)
                else:
                    filter_parts.append(string)

        def _remove_unused_ops(tokens):
            # Remove operators that we don't use and join them with the surrounding strings
            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
            ALLOWED_OPS = ('/', '+', ',', '(', ')')
            last_string, last_start, last_end, last_line = None, None, None, None
            for type, string, start, end, line in tokens:
                if type == tokenize.OP and string == '[':
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                    # everything inside brackets will be handled by _parse_filter
                    for type, string, start, end, line in tokens:
                        yield type, string, start, end, line
                        if type == tokenize.OP and string == ']':
                            break
                elif type == tokenize.OP and string in ALLOWED_OPS:
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
                    # Glue consecutive names/numbers/unused ops into one NAME
                    if not last_string:
                        last_string = string
                        last_start = start
                        last_end = end
                    else:
                        last_string += string
            if last_string:
                yield tokenize.NAME, last_string, last_start, last_end, last_line

        def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
            # Recursive-descent parse of a comma-separated selector list;
            # *tokens* supports one-token backtracking via restore_last_token
            selectors = []
            current_selector = None
            for type, string, start, _, _ in tokens:
                # ENCODING is only defined in python 3.x
                if type == getattr(tokenize, 'ENCODING', None):
                    continue
                elif type in [tokenize.NAME, tokenize.NUMBER]:
                    current_selector = FormatSelector(SINGLE, string, [])
                elif type == tokenize.OP:
                    if string == ')':
                        if not inside_group:
                            # ')' will be handled by the parentheses group
                            tokens.restore_last_token()
                        break
                    elif inside_merge and string in ['/', ',']:
                        tokens.restore_last_token()
                        break
                    elif inside_choice and string == ',':
                        tokens.restore_last_token()
                        break
                    elif string == ',':
                        if not current_selector:
                            raise syntax_error('"," must follow a format selector', start)
                        selectors.append(current_selector)
                        current_selector = None
                    elif string == '/':
                        if not current_selector:
                            raise syntax_error('"/" must follow a format selector', start)
                        first_choice = current_selector
                        second_choice = _parse_format_selection(tokens, inside_choice=True)
                        current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
                    elif string == '[':
                        if not current_selector:
                            current_selector = FormatSelector(SINGLE, 'best', [])
                        format_filter = _parse_filter(tokens)
                        current_selector.filters.append(format_filter)
                    elif string == '(':
                        if current_selector:
                            raise syntax_error('Unexpected "("', start)
                        group = _parse_format_selection(tokens, inside_group=True)
                        current_selector = FormatSelector(GROUP, group, [])
                    elif string == '+':
                        if not current_selector:
                            raise syntax_error('Unexpected "+"', start)
                        selector_1 = current_selector
                        selector_2 = _parse_format_selection(tokens, inside_merge=True)
                        if not selector_2:
                            raise syntax_error('Expected a selector', start)
                        current_selector = FormatSelector(MERGE, (selector_1, selector_2), [])
                    else:
                        raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
                elif type == tokenize.ENDMARKER:
                    break
            if current_selector:
                selectors.append(current_selector)
            return selectors

        def _merge(formats_pair):
            # Combine two formats (or previously merged groups) into one
            # info dict carrying 'requested_formats'
            format_1, format_2 = formats_pair

            formats_info = []
            formats_info.extend(format_1.get('requested_formats', (format_1,)))
            formats_info.extend(format_2.get('requested_formats', (format_2,)))

            if not allow_multiple_streams['video'] or not allow_multiple_streams['audio']:
                # Drop extra audio/video streams beyond the first of each kind
                get_no_more = {'video': False, 'audio': False}
                for (i, fmt_info) in enumerate(formats_info):
                    if fmt_info.get('acodec') == fmt_info.get('vcodec') == 'none':
                        formats_info.pop(i)
                        continue
                    for aud_vid in ['audio', 'video']:
                        if not allow_multiple_streams[aud_vid] and fmt_info.get(aud_vid[0] + 'codec') != 'none':
                            if get_no_more[aud_vid]:
                                formats_info.pop(i)
                                break
                            get_no_more[aud_vid] = True

            if len(formats_info) == 1:
                return formats_info[0]

            video_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('vcodec') != 'none']
            audio_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('acodec') != 'none']

            the_only_video = video_fmts[0] if len(video_fmts) == 1 else None
            the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None

            output_ext = self.params.get('merge_output_format')
            if not output_ext:
                if the_only_video:
                    output_ext = the_only_video['ext']
                elif the_only_audio and not video_fmts:
                    output_ext = the_only_audio['ext']
                else:
                    # Mixed/multiple streams: mkv can contain anything
                    output_ext = 'mkv'

            new_dict = {
                'requested_formats': formats_info,
                'format': '+'.join(fmt_info.get('format') for fmt_info in formats_info),
                'format_id': '+'.join(fmt_info.get('format_id') for fmt_info in formats_info),
                'ext': output_ext,
            }

            if the_only_video:
                new_dict.update({
                    'width': the_only_video.get('width'),
                    'height': the_only_video.get('height'),
                    'resolution': the_only_video.get('resolution') or self.format_resolution(the_only_video),
                    'fps': the_only_video.get('fps'),
                    'vcodec': the_only_video.get('vcodec'),
                    'vbr': the_only_video.get('vbr'),
                    'stretched_ratio': the_only_video.get('stretched_ratio'),
                })

            if the_only_audio:
                new_dict.update({
                    'acodec': the_only_audio.get('acodec'),
                    'abr': the_only_audio.get('abr'),
                })

            return new_dict

        def _check_formats(formats):
            # When --check-formats is set, yield only formats whose URL
            # actually works (verified via a small test download)
            if not check_formats:
                yield from formats
                return
            for f in formats:
                self.to_screen('[info] Testing format %s' % f['format_id'])
                temp_file = tempfile.NamedTemporaryFile(
                    suffix='.tmp', delete=False,
                    dir=self.get_output_path('temp') or None)
                temp_file.close()
                try:
                    success, _ = self.dl(temp_file.name, f, test=True)
                except (DownloadError, IOError, OSError, ValueError) + network_exceptions:
                    success = False
                finally:
                    if os.path.exists(temp_file.name):
                        try:
                            os.remove(temp_file.name)
                        except OSError:
                            self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
                if success:
                    yield f
                else:
                    self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])

        def _build_selector_function(selector):
            # Compile a parse-tree node into a function(ctx) -> iterable of formats
            if isinstance(selector, list):  # ,
                fs = [_build_selector_function(s) for s in selector]

                def selector_function(ctx):
                    for f in fs:
                        yield from f(ctx)
                return selector_function

            elif selector.type == GROUP:  # ()
                selector_function = _build_selector_function(selector.selector)

            elif selector.type == PICKFIRST:  # /
                fs = [_build_selector_function(s) for s in selector.selector]

                def selector_function(ctx):
                    for f in fs:
                        picked_formats = list(f(ctx))
                        if picked_formats:
                            return picked_formats
                    return []

            elif selector.type == MERGE:  # +
                selector_1, selector_2 = map(_build_selector_function, selector.selector)

                def selector_function(ctx):
                    # deepcopy so each side sees an unmodified context
                    for pair in itertools.product(
                            selector_1(copy.deepcopy(ctx)), selector_2(copy.deepcopy(ctx))):
                        yield _merge(pair)

            elif selector.type == SINGLE:  # atom
                format_spec = selector.selector or 'best'

                # TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector
                if format_spec == 'all':
                    def selector_function(ctx):
                        yield from _check_formats(ctx['formats'])
                elif format_spec == 'mergeall':
                    def selector_function(ctx):
                        formats = list(_check_formats(ctx['formats']))
                        if not formats:
                            return
                        merged_format = formats[-1]
                        for f in formats[-2::-1]:
                            merged_format = _merge((merged_format, f))
                        yield merged_format

                else:
                    format_fallback, format_reverse, format_idx = False, True, 1
                    mobj = re.match(
                        r'(?P<bw>best|worst|b|w)(?P<type>video|audio|v|a)?(?P<mod>\*)?(?:\.(?P<n>[1-9]\d*))?$',
                        format_spec)
                    if mobj is not None:
                        format_idx = int_or_none(mobj.group('n'), default=1)
                        format_reverse = mobj.group('bw')[0] == 'b'
                        # '[None])[0]' reduces 'video'/'audio' to 'v'/'a'
                        format_type = (mobj.group('type') or [None])[0]
                        not_format_type = {'v': 'a', 'a': 'v'}.get(format_type)
                        format_modified = mobj.group('mod') is not None

                        format_fallback = not format_type and not format_modified  # for b, w
                        _filter_f = (
                            (lambda f: f.get('%scodec' % format_type) != 'none')
                            if format_type and format_modified  # bv*, ba*, wv*, wa*
                            else (lambda f: f.get('%scodec' % not_format_type) == 'none')
                            if format_type  # bv, ba, wv, wa
                            else (lambda f: f.get('vcodec') != 'none' and f.get('acodec') != 'none')
                            if not format_modified  # b, w
                            else lambda f: True)  # b*, w*
                        filter_f = lambda f: _filter_f(f) and (
                            f.get('vcodec') != 'none' or f.get('acodec') != 'none')
                    else:
                        filter_f = ((lambda f: f.get('ext') == format_spec)
                                    if format_spec in ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']  # extension
                                    else (lambda f: f.get('format_id') == format_spec))  # id

                    def selector_function(ctx):
                        formats = list(ctx['formats'])
                        matches = list(filter(filter_f, formats)) if filter_f is not None else formats
                        if format_fallback and ctx['incomplete_formats'] and not matches:
                            # for extractors with incomplete formats (audio only (soundcloud)
                            # or video only (imgur)) best/worst will fallback to
                            # best/worst {video,audio}-only format
                            matches = formats
                        matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1]))
                        try:
                            yield matches[format_idx - 1]
                        except IndexError:
                            return

            filters = [self._build_format_filter(f) for f in selector.filters]

            def final_selector(ctx):
                # Apply the node's [...] filters on a copy of the context
                ctx_copy = copy.deepcopy(ctx)
                for _filter in filters:
                    ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
                return selector_function(ctx_copy)
            return final_selector

        stream = io.BytesIO(format_spec.encode('utf-8'))
        try:
            tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
        except tokenize.TokenError:
            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

        class TokenIterator(object):
            # Token stream with one-step backtracking for the parser
            def __init__(self, tokens):
                self.tokens = tokens
                self.counter = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.counter >= len(self.tokens):
                    raise StopIteration()
                value = self.tokens[self.counter]
                self.counter += 1
                return value

            next = __next__

            def restore_last_token(self):
                self.counter -= 1

        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
        return _build_selector_function(parsed_selector)
2004
2005 def _calc_headers(self, info_dict):
2006 res = std_headers.copy()
2007
2008 add_headers = info_dict.get('http_headers')
2009 if add_headers:
2010 res.update(add_headers)
2011
2012 cookies = self._calc_cookies(info_dict)
2013 if cookies:
2014 res['Cookie'] = cookies
2015
2016 if 'X-Forwarded-For' not in res:
2017 x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
2018 if x_forwarded_for_ip:
2019 res['X-Forwarded-For'] = x_forwarded_for_ip
2020
2021 return res
2022
    def _calc_cookies(self, info_dict):
        # Build the Cookie header value for this URL by letting the cookiejar
        # fill in a throwaway request and reading the header back.
        pr = sanitized_Request(info_dict['url'])
        self.cookiejar.add_cookie_header(pr)
        return pr.get_header('Cookie')
2027
    def _sanitize_thumbnails(self, info_dict):
        """Normalize info_dict['thumbnails'] in place: promote a lone
        'thumbnail' field, sort worst-to-best, fill ids/resolutions,
        sanitize URLs and (lazily) drop unreachable thumbnails."""
        thumbnails = info_dict.get('thumbnails')
        if thumbnails is None:
            # Promote a single 'thumbnail' URL into the thumbnails list
            thumbnail = info_dict.get('thumbnail')
            if thumbnail:
                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
        if thumbnails:
            # Sort ascending by preference, then size, so the last entry is best
            thumbnails.sort(key=lambda t: (
                t.get('preference') if t.get('preference') is not None else -1,
                t.get('width') if t.get('width') is not None else -1,
                t.get('height') if t.get('height') is not None else -1,
                t.get('id') if t.get('id') is not None else '',
                t.get('url')))

            def thumbnail_tester():
                # Returns a predicate that checks thumbnail URLs via a HEAD
                # request; with check_formats, every thumbnail is tested and
                # progress goes to the screen instead of the debug log
                if self.params.get('check_formats'):
                    test_all = True
                    to_screen = lambda msg: self.to_screen(f'[info] {msg}')
                else:
                    test_all = False
                    to_screen = self.write_debug

                def test_thumbnail(t):
                    if not test_all and not t.get('_test_url'):
                        return True
                    to_screen('Testing thumbnail %s' % t['id'])
                    try:
                        self.urlopen(HEADRequest(t['url']))
                    except network_exceptions as err:
                        to_screen('Unable to connect to thumbnail %s URL "%s" - %s. Skipping...' % (
                            t['id'], t['url'], error_to_compat_str(err)))
                        return False
                    return True

                return test_thumbnail

            for i, t in enumerate(thumbnails):
                if t.get('id') is None:
                    t['id'] = '%d' % i
                if t.get('width') and t.get('height'):
                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
                t['url'] = sanitize_url(t['url'])

            if self.params.get('check_formats') is not False:
                # Test lazily from best to worst, then restore ascending order
                info_dict['thumbnails'] = LazyList(filter(thumbnail_tester(), thumbnails[::-1])).reverse()
            else:
                info_dict['thumbnails'] = thumbnails
2075
    def process_video_result(self, info_dict, download=True):
        """Sanitize a single video info_dict, select subtitles and formats,
        and hand each chosen format to process_info (when *download* is set).
        Returns the (mutated) info_dict, updated with the last chosen format
        for backwards compatibility."""
        assert info_dict.get('_type', 'video') == 'video'

        if 'id' not in info_dict:
            raise ExtractorError('Missing "id" field in extractor result')
        if 'title' not in info_dict:
            raise ExtractorError('Missing "title" field in extractor result',
                                 video_id=info_dict['id'], ie=info_dict['extractor'])

        def report_force_conversion(field, field_not, conversion):
            self.report_warning(
                '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
                % (field, field_not, conversion))

        def sanitize_string_field(info, string_field):
            # Coerce a non-string field to str, warning about the extractor bug
            field = info.get(string_field)
            if field is None or isinstance(field, compat_str):
                return
            report_force_conversion(string_field, 'a string', 'string')
            info[string_field] = compat_str(field)

        def sanitize_numeric_fields(info):
            # Coerce all known numeric fields to int (None on failure)
            for numeric_field in self._NUMERIC_FIELDS:
                field = info.get(numeric_field)
                if field is None or isinstance(field, compat_numeric_types):
                    continue
                report_force_conversion(numeric_field, 'numeric', 'int')
                info[numeric_field] = int_or_none(field)

        sanitize_string_field(info_dict, 'id')
        sanitize_numeric_fields(info_dict)

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        self._sanitize_thumbnails(info_dict)

        thumbnail = info_dict.get('thumbnail')
        thumbnails = info_dict.get('thumbnails')
        if thumbnail:
            info_dict['thumbnail'] = sanitize_url(thumbnail)
        elif thumbnails:
            # Thumbnails are sorted worst-to-best; take the best one
            info_dict['thumbnail'] = thumbnails[-1]['url']

        if info_dict.get('display_id') is None and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        # Derive upload_date/release_date from their timestamps when missing
        for ts_key, date_key in (
                ('timestamp', 'upload_date'),
                ('release_timestamp', 'release_date'),
        ):
            if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
                # Working around out-of-range timestamp values (e.g. negative ones on Windows,
                # see http://bugs.python.org/issue1646728)
                try:
                    upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
                    info_dict[date_key] = upload_date.strftime('%Y%m%d')
                except (ValueError, OverflowError, OSError):
                    pass

        # Reconcile live_status with the is_live/was_live booleans
        live_keys = ('is_live', 'was_live')
        live_status = info_dict.get('live_status')
        if live_status is None:
            for key in live_keys:
                if info_dict.get(key) is False:
                    continue
                if info_dict.get(key):
                    live_status = key
                    break
            if all(info_dict.get(key) is False for key in live_keys):
                live_status = 'not_live'
        if live_status:
            info_dict['live_status'] = live_status
            for key in live_keys:
                if info_dict.get(key) is None:
                    info_dict[key] = (live_status == key)

        # Auto generate title fields corresponding to the *_number fields when missing
        # in order to always have clean titles. This is very common for TV series.
        for field in ('chapter', 'season', 'episode'):
            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])

        # Sanitize subtitle URLs and fill in missing extensions
        for cc_kind in ('subtitles', 'automatic_captions'):
            cc = info_dict.get(cc_kind)
            if cc:
                for _, subtitle in cc.items():
                    for subtitle_format in subtitle:
                        if subtitle_format.get('url'):
                            subtitle_format['url'] = sanitize_url(subtitle_format['url'])
                        if subtitle_format.get('ext') is None:
                            subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()

        automatic_captions = info_dict.get('automatic_captions')
        subtitles = info_dict.get('subtitles')

        info_dict['requested_subtitles'] = self.process_subtitles(
            info_dict['id'], subtitles, automatic_captions)

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        info_dict['__has_drm'] = any(f.get('has_drm') for f in formats)
        if not self.params.get('allow_unplayable_formats'):
            formats = [f for f in formats if not f.get('has_drm')]

        if not formats:
            self.raise_no_formats(info_dict)

        def is_wellformed(f):
            url = f.get('url')
            if not url:
                self.report_warning(
                    '"url" field is missing or empty - skipping format, '
                    'there is an error in extractor')
                return False
            if isinstance(url, bytes):
                sanitize_string_field(f, 'url')
            return True

        # Filter out malformed formats for better extraction robustness
        formats = list(filter(is_wellformed, formats))

        formats_dict = {}

        # We check that all the formats have the format and format_id fields
        for i, format in enumerate(formats):
            sanitize_string_field(format, 'format_id')
            sanitize_numeric_fields(format)
            format['url'] = sanitize_url(format['url'])
            if not format.get('format_id'):
                format['format_id'] = compat_str(i)
            else:
                # Sanitize format_id from characters used in format selector expression
                format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
            format_id = format['format_id']
            if format_id not in formats_dict:
                formats_dict[format_id] = []
            formats_dict[format_id].append(format)

        # Make sure all formats have unique format_id
        for format_id, ambiguous_formats in formats_dict.items():
            if len(ambiguous_formats) > 1:
                for i, format in enumerate(ambiguous_formats):
                    format['format_id'] = '%s-%d' % (format_id, i)

        for i, format in enumerate(formats):
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=format_field(format, 'format_note', ' (%s)'),
                )
            # Automatically determine file extension if missing
            if format.get('ext') is None:
                format['ext'] = determine_ext(format['url']).lower()
            # Automatically determine protocol if missing (useful for format
            # selection purposes)
            if format.get('protocol') is None:
                format['protocol'] = determine_protocol(format)
            # Add HTTP headers, so that external programs can use them from the
            # json output
            full_format_info = info_dict.copy()
            full_format_info.update(format)
            format['http_headers'] = self._calc_headers(full_format_info)
        # Remove private housekeeping stuff
        if '__x_forwarded_for_ip' in info_dict:
            del info_dict['__x_forwarded_for_ip']

        # TODO Central sorting goes here

        if not formats or formats[0] is not info_dict:
            # only set the 'formats' fields if the original info_dict list them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats

        info_dict, _ = self.pre_process(info_dict)

        # Handle the pure listing options (-F, --list-thumbnails, ...)
        if self.params.get('list_thumbnails'):
            self.list_thumbnails(info_dict)
        if self.params.get('listformats'):
            if not info_dict.get('formats') and not info_dict.get('url'):
                self.to_screen('%s has no formats' % info_dict['id'])
            else:
                self.list_formats(info_dict)
        if self.params.get('listsubtitles'):
            if 'automatic_captions' in info_dict:
                self.list_subtitles(
                    info_dict['id'], automatic_captions, 'automatic captions')
            self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
        list_only = self.params.get('simulate') is None and (
            self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles'))
        if list_only:
            # Without this printing, -F --print-json will not work
            self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True)
            return

        format_selector = self.format_selector
        if format_selector is None:
            req_format = self._default_format_spec(info_dict, download=download)
            self.write_debug('Default format spec: %s' % req_format)
            format_selector = self.build_format_selector(req_format)

        # While in format selection we may need to have an access to the original
        # format set in order to calculate some metrics or do some processing.
        # For now we need to be able to guess whether original formats provided
        # by extractor are incomplete or not (i.e. whether extractor provides only
        # video-only or audio-only formats) for proper formats selection for
        # extractors with such incomplete formats (see
        # https://github.com/ytdl-org/youtube-dl/pull/5556).
        # Since formats may be filtered during format selection and may not match
        # the original formats the results may be incorrect. Thus original formats
        # or pre-calculated metrics should be passed to format selection routines
        # as well.
        # We will pass a context object containing all necessary additional data
        # instead of just formats.
        # This fixes incorrect format selection issue (see
        # https://github.com/ytdl-org/youtube-dl/issues/10083).
        incomplete_formats = (
            # All formats are video-only or
            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
            # all formats are audio-only
            or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))

        ctx = {
            'formats': formats,
            'incomplete_formats': incomplete_formats,
        }

        formats_to_download = list(format_selector(ctx))
        if not formats_to_download:
            if not self.params.get('ignore_no_formats_error'):
                raise ExtractorError('Requested format is not available', expected=True,
                                     video_id=info_dict['id'], ie=info_dict['extractor'])
            else:
                self.report_warning('Requested format is not available')
                # Process what we can, even without any available formats.
                self.process_info(dict(info_dict))
        elif download:
            self.to_screen(
                '[info] %s: Downloading %d format(s): %s' % (
                    info_dict['id'], len(formats_to_download),
                    ", ".join([f['format_id'] for f in formats_to_download])))
            for fmt in formats_to_download:
                new_info = dict(info_dict)
                # Save a reference to the original info_dict so that it can be modified in process_info if needed
                new_info['__original_infodict'] = info_dict
                new_info.update(fmt)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        if formats_to_download:
            info_dict.update(formats_to_download[-1])
        return info_dict
2337
2338 def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
2339 """Select the requested subtitles and their format"""
2340 available_subs = {}
2341 if normal_subtitles and self.params.get('writesubtitles'):
2342 available_subs.update(normal_subtitles)
2343 if automatic_captions and self.params.get('writeautomaticsub'):
2344 for lang, cap_info in automatic_captions.items():
2345 if lang not in available_subs:
2346 available_subs[lang] = cap_info
2347
2348 if (not self.params.get('writesubtitles') and not
2349 self.params.get('writeautomaticsub') or not
2350 available_subs):
2351 return None
2352
2353 all_sub_langs = available_subs.keys()
2354 if self.params.get('allsubtitles', False):
2355 requested_langs = all_sub_langs
2356 elif self.params.get('subtitleslangs', False):
2357 requested_langs = set()
2358 for lang in self.params.get('subtitleslangs'):
2359 if lang == 'all':
2360 requested_langs.update(all_sub_langs)
2361 continue
2362 discard = lang[0] == '-'
2363 if discard:
2364 lang = lang[1:]
2365 current_langs = filter(re.compile(lang + '$').match, all_sub_langs)
2366 if discard:
2367 for lang in current_langs:
2368 requested_langs.discard(lang)
2369 else:
2370 requested_langs.update(current_langs)
2371 elif 'en' in available_subs:
2372 requested_langs = ['en']
2373 else:
2374 requested_langs = [list(all_sub_langs)[0]]
2375 if requested_langs:
2376 self.write_debug('Downloading subtitles: %s' % ', '.join(requested_langs))
2377
2378 formats_query = self.params.get('subtitlesformat', 'best')
2379 formats_preference = formats_query.split('/') if formats_query else []
2380 subs = {}
2381 for lang in requested_langs:
2382 formats = available_subs.get(lang)
2383 if formats is None:
2384 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
2385 continue
2386 for ext in formats_preference:
2387 if ext == 'best':
2388 f = formats[-1]
2389 break
2390 matches = list(filter(lambda f: f['ext'] == ext, formats))
2391 if matches:
2392 f = matches[-1]
2393 break
2394 else:
2395 f = formats[-1]
2396 self.report_warning(
2397 'No subtitle format found matching "%s" for language %s, '
2398 'using %s' % (formats_query, lang, f['ext']))
2399 subs[lang] = f
2400 return subs
2401
2402 def __forced_printings(self, info_dict, filename, incomplete):
2403 def print_mandatory(field, actual_field=None):
2404 if actual_field is None:
2405 actual_field = field
2406 if (self.params.get('force%s' % field, False)
2407 and (not incomplete or info_dict.get(actual_field) is not None)):
2408 self.to_stdout(info_dict[actual_field])
2409
2410 def print_optional(field):
2411 if (self.params.get('force%s' % field, False)
2412 and info_dict.get(field) is not None):
2413 self.to_stdout(info_dict[field])
2414
2415 info_dict = info_dict.copy()
2416 if filename is not None:
2417 info_dict['filename'] = filename
2418 if info_dict.get('requested_formats') is not None:
2419 # For RTMP URLs, also include the playpath
2420 info_dict['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats'])
2421 elif 'url' in info_dict:
2422 info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '')
2423
2424 if self.params.get('forceprint') or self.params.get('forcejson'):
2425 self.post_extract(info_dict)
2426 for tmpl in self.params.get('forceprint', []):
2427 if re.match(r'\w+$', tmpl):
2428 tmpl = '%({})s'.format(tmpl)
2429 tmpl, info_copy = self.prepare_outtmpl(tmpl, info_dict)
2430 self.to_stdout(self.escape_outtmpl(tmpl) % info_copy)
2431
2432 print_mandatory('title')
2433 print_mandatory('id')
2434 print_mandatory('url', 'urls')
2435 print_optional('thumbnail')
2436 print_optional('description')
2437 print_optional('filename')
2438 if self.params.get('forceduration') and info_dict.get('duration') is not None:
2439 self.to_stdout(formatSeconds(info_dict['duration']))
2440 print_mandatory('format')
2441
2442 if self.params.get('forcejson'):
2443 self.to_stdout(json.dumps(self.sanitize_info(info_dict)))
2444
2445 def dl(self, name, info, subtitle=False, test=False):
2446 if not info.get('url'):
2447 self.raise_no_formats(info, True)
2448
2449 if test:
2450 verbose = self.params.get('verbose')
2451 params = {
2452 'test': True,
2453 'quiet': not verbose,
2454 'verbose': verbose,
2455 'noprogress': not verbose,
2456 'nopart': True,
2457 'skip_unavailable_fragments': False,
2458 'keep_fragments': False,
2459 'overwrites': True,
2460 '_no_ytdl_file': True,
2461 }
2462 else:
2463 params = self.params
2464 fd = get_suitable_downloader(info, params, to_stdout=(name == '-'))(self, params)
2465 if not test:
2466 for ph in self._progress_hooks:
2467 fd.add_progress_hook(ph)
2468 urls = '", "'.join([f['url'] for f in info.get('requested_formats', [])] or [info['url']])
2469 self.write_debug('Invoking downloader on "%s"' % urls)
2470 new_info = dict(info)
2471 if new_info.get('http_headers') is None:
2472 new_info['http_headers'] = self._calc_headers(new_info)
2473 return fd.download(name, new_info, subtitle)
2474
    def process_info(self, info_dict):
        """Process a single resolved IE result.

        Handles forced printing, writing of sidecar files (description,
        annotations, subtitles, info-json, thumbnails, internet shortcuts),
        the download itself (including multi-format merging), ffmpeg fixups,
        postprocessing and download-archive recording.
        """

        assert info_dict.get('_type', 'video') == 'video'

        # Bail out before doing any work if the download limit is already hit
        max_downloads = self.params.get('max_downloads')
        if max_downloads is not None:
            if self._num_downloads >= int(max_downloads):
                raise MaxDownloadsReached()

        # TODO: backward compatibility, to be removed
        info_dict['fulltitle'] = info_dict['title']

        if 'format' not in info_dict and 'ext' in info_dict:
            info_dict['format'] = info_dict['ext']

        # Skip the video entirely when a match-filter/archive check rejects it
        if self._match_entry(info_dict) is not None:
            return

        self.post_extract(info_dict)
        self._num_downloads += 1

        # info_dict['_filename'] needs to be set for backward compatibility
        info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
        temp_filename = self.prepare_filename(info_dict, 'temp')
        # Maps temp paths -> final paths; consumed by MoveFilesAfterDownloadPP
        files_to_move = {}

        # Forced printings
        self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))

        if self.params.get('simulate'):
            if self.params.get('force_write_download_archive', False):
                self.record_download_archive(info_dict)

            # Do nothing else if in simulate mode
            return

        if full_filename is None:
            return

        if not self._ensure_dir_exists(encodeFilename(full_filename)):
            return
        if not self._ensure_dir_exists(encodeFilename(temp_filename)):
            return

        # Write the video description sidecar file, if requested
        if self.params.get('writedescription', False):
            descfn = self.prepare_filename(info_dict, 'description')
            if not self._ensure_dir_exists(encodeFilename(descfn)):
                return
            if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(descfn)):
                self.to_screen('[info] Video description is already present')
            elif info_dict.get('description') is None:
                self.report_warning('There\'s no description to write.')
            else:
                try:
                    self.to_screen('[info] Writing video description to: ' + descfn)
                    with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
                        descfile.write(info_dict['description'])
                except (OSError, IOError):
                    self.report_error('Cannot write description file ' + descfn)
                    return

        # Write the annotations sidecar file, if requested
        if self.params.get('writeannotations', False):
            annofn = self.prepare_filename(info_dict, 'annotation')
            if not self._ensure_dir_exists(encodeFilename(annofn)):
                return
            if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(annofn)):
                self.to_screen('[info] Video annotations are already present')
            elif not info_dict.get('annotations'):
                self.report_warning('There are no annotations to write.')
            else:
                try:
                    self.to_screen('[info] Writing video annotations to: ' + annofn)
                    with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
                        annofile.write(info_dict['annotations'])
                except (KeyError, TypeError):
                    self.report_warning('There are no annotations to write.')
                except (OSError, IOError):
                    self.report_error('Cannot write annotations file: ' + annofn)
                    return

        subtitles_are_requested = any([self.params.get('writesubtitles', False),
                                       self.params.get('writeautomaticsub')])

        if subtitles_are_requested and info_dict.get('requested_subtitles'):
            # subtitles download errors are already managed as troubles in relevant IE
            # that way it will silently go on when used with unsupporting IE
            subtitles = info_dict['requested_subtitles']
            # ie = self.get_info_extractor(info_dict['extractor_key'])
            for sub_lang, sub_info in subtitles.items():
                sub_format = sub_info['ext']
                # Write to a temp-style name first; final name recorded in files_to_move
                sub_filename = subtitles_filename(temp_filename, sub_lang, sub_format, info_dict.get('ext'))
                sub_filename_final = subtitles_filename(
                    self.prepare_filename(info_dict, 'subtitle'), sub_lang, sub_format, info_dict.get('ext'))
                if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(sub_filename)):
                    self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
                    sub_info['filepath'] = sub_filename
                    files_to_move[sub_filename] = sub_filename_final
                else:
                    self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
                    if sub_info.get('data') is not None:
                        try:
                            # Use newline='' to prevent conversion of newline characters
                            # See https://github.com/ytdl-org/youtube-dl/issues/10268
                            with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
                                subfile.write(sub_info['data'])
                            sub_info['filepath'] = sub_filename
                            files_to_move[sub_filename] = sub_filename_final
                        except (OSError, IOError):
                            self.report_error('Cannot write subtitles file ' + sub_filename)
                            return
                    else:
                        try:
                            # No inline data: the subtitle must be downloaded from its URL
                            self.dl(sub_filename, sub_info.copy(), subtitle=True)
                            sub_info['filepath'] = sub_filename
                            files_to_move[sub_filename] = sub_filename_final
                        except (ExtractorError, IOError, OSError, ValueError) + network_exceptions as err:
                            self.report_warning('Unable to download subtitle for "%s": %s' %
                                                (sub_lang, error_to_compat_str(err)))
                            continue

        # Write the info-json sidecar file, if requested
        if self.params.get('writeinfojson', False):
            infofn = self.prepare_filename(info_dict, 'infojson')
            if not self._ensure_dir_exists(encodeFilename(infofn)):
                return
            if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(infofn)):
                self.to_screen('[info] Video metadata is already present')
            else:
                self.to_screen('[info] Writing video metadata as JSON to: ' + infofn)
                try:
                    write_json_file(self.sanitize_info(info_dict, self.params.get('clean_infojson', True)), infofn)
                except (OSError, IOError):
                    self.report_error('Cannot write video metadata to JSON file ' + infofn)
                    return
            info_dict['__infojson_filename'] = infofn

        # Thumbnails are written next to the temp file and moved later
        for thumb_ext in self._write_thumbnails(info_dict, temp_filename):
            thumb_filename_temp = replace_extension(temp_filename, thumb_ext, info_dict.get('ext'))
            thumb_filename = replace_extension(
                self.prepare_filename(info_dict, 'thumbnail'), thumb_ext, info_dict.get('ext'))
            files_to_move[thumb_filename_temp] = thumb_filename

        # Write internet shortcut files
        url_link = webloc_link = desktop_link = False
        if self.params.get('writelink', False):
            if sys.platform == "darwin":  # macOS.
                webloc_link = True
            elif sys.platform.startswith("linux"):
                desktop_link = True
            else:  # if sys.platform in ['win32', 'cygwin']:
                url_link = True
        if self.params.get('writeurllink', False):
            url_link = True
        if self.params.get('writewebloclink', False):
            webloc_link = True
        if self.params.get('writedesktoplink', False):
            desktop_link = True

        if url_link or webloc_link or desktop_link:
            if 'webpage_url' not in info_dict:
                self.report_error('Cannot write internet shortcut file because the "webpage_url" field is missing in the media information')
                return
            ascii_url = iri_to_uri(info_dict['webpage_url'])

        def _write_link_file(extension, template, newline, embed_filename):
            # Write one shortcut file; returns False on write failure.
            # NOTE(review): the overwrite check below looks inverted compared
            # to the other overwrite checks in this method, which all use
            # `not self.params.get('overwrites', True)` — verify intent
            linkfn = replace_extension(full_filename, extension, info_dict.get('ext'))
            if self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)):
                self.to_screen('[info] Internet shortcut is already present')
            else:
                try:
                    self.to_screen('[info] Writing internet shortcut to: ' + linkfn)
                    with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', newline=newline) as linkfile:
                        template_vars = {'url': ascii_url}
                        if embed_filename:
                            template_vars['filename'] = linkfn[:-(len(extension) + 1)]
                        linkfile.write(template % template_vars)
                except (OSError, IOError):
                    self.report_error('Cannot write internet shortcut ' + linkfn)
                    return False
            return True

        if url_link:
            if not _write_link_file('url', DOT_URL_LINK_TEMPLATE, '\r\n', embed_filename=False):
                return
        if webloc_link:
            if not _write_link_file('webloc', DOT_WEBLOC_LINK_TEMPLATE, '\n', embed_filename=False):
                return
        if desktop_link:
            if not _write_link_file('desktop', DOT_DESKTOP_LINK_TEMPLATE, '\n', embed_filename=True):
                return

        # Run 'before_dl' postprocessors before starting the download
        try:
            info_dict, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
        except PostProcessingError as err:
            self.report_error('Preprocessing: %s' % str(err))
            return

        must_record_download_archive = False
        if self.params.get('skip_download', False):
            info_dict['filepath'] = temp_filename
            info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
            info_dict['__files_to_move'] = files_to_move
            info_dict = self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict)
        else:
            # Download
            info_dict.setdefault('__postprocessors', [])
            try:

                def existing_file(*filepaths):
                    # Return a previously downloaded file to reuse (possibly in
                    # the post-conversion 'final_ext'), or None after deleting
                    # the candidates when overwriting is enabled
                    ext = info_dict.get('ext')
                    final_ext = self.params.get('final_ext', ext)
                    existing_files = []
                    for file in orderedSet(filepaths):
                        if final_ext != ext:
                            converted = replace_extension(file, final_ext, ext)
                            if os.path.exists(encodeFilename(converted)):
                                existing_files.append(converted)
                        if os.path.exists(encodeFilename(file)):
                            existing_files.append(file)

                    if not existing_files or self.params.get('overwrites', False):
                        for file in orderedSet(existing_files):
                            self.report_file_delete(file)
                            os.remove(encodeFilename(file))
                        return None

                    info_dict['ext'] = os.path.splitext(existing_files[0])[1][1:]
                    return existing_files[0]

                success = True
                if info_dict.get('requested_formats') is not None:

                    def compatible_formats(formats):
                        # TODO: some formats actually allow this (mkv, webm, ogg, mp4), but not all of them.
                        video_formats = [format for format in formats if format.get('vcodec') != 'none']
                        audio_formats = [format for format in formats if format.get('acodec') != 'none']
                        if len(video_formats) > 2 or len(audio_formats) > 2:
                            return False

                        # Check extension
                        exts = set(format.get('ext') for format in formats)
                        COMPATIBLE_EXTS = (
                            set(('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma')),
                            set(('webm',)),
                        )
                        for ext_sets in COMPATIBLE_EXTS:
                            if ext_sets.issuperset(exts):
                                return True
                        # TODO: Check acodec/vcodec
                        return False

                    requested_formats = info_dict['requested_formats']
                    old_ext = info_dict['ext']
                    if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
                        info_dict['ext'] = 'mkv'
                        self.report_warning(
                            'Requested formats are incompatible for merge and will be merged into mkv.')
                    new_ext = info_dict['ext']

                    def correct_ext(filename, ext=new_ext):
                        # Swap the filename's extension to `ext` (no-op for stdout)
                        if filename == '-':
                            return filename
                        filename_real_ext = os.path.splitext(filename)[1][1:]
                        filename_wo_ext = (
                            os.path.splitext(filename)[0]
                            if filename_real_ext in (old_ext, new_ext)
                            else filename)
                        return '%s.%s' % (filename_wo_ext, ext)

                    # Ensure filename always has a correct extension for successful merge
                    full_filename = correct_ext(full_filename)
                    temp_filename = correct_ext(temp_filename)
                    dl_filename = existing_file(full_filename, temp_filename)
                    info_dict['__real_download'] = False

                    _protocols = set(determine_protocol(f) for f in requested_formats)
                    if len(_protocols) == 1:  # All requested formats have same protocol
                        info_dict['protocol'] = _protocols.pop()
                    directly_mergable = FFmpegFD.can_merge_formats(info_dict)
                    if dl_filename is not None:
                        self.report_file_already_downloaded(dl_filename)
                    elif (directly_mergable and get_suitable_downloader(
                            info_dict, self.params, to_stdout=(temp_filename == '-')) == FFmpegFD):
                        # ffmpeg can fetch and mux all formats in one invocation
                        info_dict['url'] = '\n'.join(f['url'] for f in requested_formats)
                        success, real_download = self.dl(temp_filename, info_dict)
                        info_dict['__real_download'] = real_download
                    else:
                        # Download each format separately, then merge afterwards
                        downloaded = []
                        merger = FFmpegMergerPP(self)
                        if self.params.get('allow_unplayable_formats'):
                            self.report_warning(
                                'You have requested merging of multiple formats '
                                'while also allowing unplayable formats to be downloaded. '
                                'The formats won\'t be merged to prevent data corruption.')
                        elif not merger.available:
                            self.report_warning(
                                'You have requested merging of multiple formats but ffmpeg is not installed. '
                                'The formats won\'t be merged.')

                        if temp_filename == '-':
                            reason = ('using a downloader other than ffmpeg' if directly_mergable
                                      else 'but the formats are incompatible for simultaneous download' if merger.available
                                      else 'but ffmpeg is not installed')
                            self.report_warning(
                                f'You have requested downloading multiple formats to stdout {reason}. '
                                'The formats will be streamed one after the other')
                        fname = temp_filename
                        for f in requested_formats:
                            new_info = dict(info_dict)
                            del new_info['requested_formats']
                            new_info.update(f)
                            if temp_filename != '-':
                                fname = prepend_extension(
                                    correct_ext(temp_filename, new_info['ext']),
                                    'f%s' % f['format_id'], new_info['ext'])
                                if not self._ensure_dir_exists(fname):
                                    return
                                downloaded.append(fname)
                            partial_success, real_download = self.dl(fname, new_info)
                            info_dict['__real_download'] = info_dict['__real_download'] or real_download
                            success = success and partial_success
                        if merger.available and not self.params.get('allow_unplayable_formats'):
                            info_dict['__postprocessors'].append(merger)
                            info_dict['__files_to_merge'] = downloaded
                            # Even if there were no downloads, it is being merged only now
                            info_dict['__real_download'] = True
                        else:
                            for file in downloaded:
                                files_to_move[file] = None
                else:
                    # Just a single file
                    dl_filename = existing_file(full_filename, temp_filename)
                    if dl_filename is None or dl_filename == temp_filename:
                        # dl_filename == temp_filename could mean that the file was partially downloaded with --no-part.
                        # So we should try to resume the download
                        success, real_download = self.dl(temp_filename, info_dict)
                        info_dict['__real_download'] = real_download
                    else:
                        self.report_file_already_downloaded(dl_filename)

                dl_filename = dl_filename or temp_filename
                info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))

            except network_exceptions as err:
                self.report_error('unable to download video data: %s' % error_to_compat_str(err))
                return
            except (OSError, IOError) as err:
                raise UnavailableVideoError(err)
            except (ContentTooShortError, ) as err:
                self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                return

        if success and full_filename != '-':

            def fixup():
                # Queue ffmpeg-based fixups (or warn about them) per --fixup policy
                do_fixup = True
                fixup_policy = self.params.get('fixup')
                vid = info_dict['id']

                if fixup_policy in ('ignore', 'never'):
                    return
                elif fixup_policy == 'warn':
                    do_fixup = False
                elif fixup_policy != 'force':
                    assert fixup_policy in ('detect_or_warn', None)
                    # Only fix files that were actually (re-)downloaded
                    if not info_dict.get('__real_download'):
                        do_fixup = False

                def ffmpeg_fixup(cndn, msg, cls):
                    if not cndn:
                        return
                    if not do_fixup:
                        self.report_warning(f'{vid}: {msg}')
                        return
                    pp = cls(self)
                    if pp.available:
                        info_dict['__postprocessors'].append(pp)
                    else:
                        self.report_warning(f'{vid}: {msg}. Install ffmpeg to fix this automatically')

                stretched_ratio = info_dict.get('stretched_ratio')
                ffmpeg_fixup(
                    stretched_ratio not in (1, None),
                    f'Non-uniform pixel ratio {stretched_ratio}',
                    FFmpegFixupStretchedPP)

                ffmpeg_fixup(
                    (info_dict.get('requested_formats') is None
                     and info_dict.get('container') == 'm4a_dash'
                     and info_dict.get('ext') == 'm4a'),
                    'writing DASH m4a. Only some players support this container',
                    FFmpegFixupM4aPP)

                downloader = (get_suitable_downloader(info_dict, self.params).__name__
                              if 'protocol' in info_dict else None)
                ffmpeg_fixup(downloader == 'HlsFD', 'malformed AAC bitstream detected', FFmpegFixupM3u8PP)
                ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed timestamps detected', FFmpegFixupTimestampPP)
                ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed duration detected', FFmpegFixupDurationPP)

            fixup()
            try:
                info_dict = self.post_process(dl_filename, info_dict, files_to_move)
            except PostProcessingError as err:
                self.report_error('Postprocessing: %s' % str(err))
                return
            try:
                for ph in self._post_hooks:
                    ph(info_dict['filepath'])
            except Exception as err:
                self.report_error('post hooks: %s' % str(err))
                return
            must_record_download_archive = True

        if must_record_download_archive or self.params.get('force_write_download_archive', False):
            self.record_download_archive(info_dict)
        max_downloads = self.params.get('max_downloads')
        if max_downloads is not None and self._num_downloads >= int(max_downloads):
            raise MaxDownloadsReached()
2893
2894 def download(self, url_list):
2895 """Download a given list of URLs."""
2896 outtmpl = self.outtmpl_dict['default']
2897 if (len(url_list) > 1
2898 and outtmpl != '-'
2899 and '%' not in outtmpl
2900 and self.params.get('max_downloads') != 1):
2901 raise SameFileError(outtmpl)
2902
2903 for url in url_list:
2904 try:
2905 # It also downloads the videos
2906 res = self.extract_info(
2907 url, force_generic_extractor=self.params.get('force_generic_extractor', False))
2908 except UnavailableVideoError:
2909 self.report_error('unable to download video')
2910 except MaxDownloadsReached:
2911 self.to_screen('[info] Maximum number of downloads reached')
2912 raise
2913 except ExistingVideoReached:
2914 self.to_screen('[info] Encountered a video that is already in the archive, stopping due to --break-on-existing')
2915 raise
2916 except RejectedVideoReached:
2917 self.to_screen('[info] Encountered a video that did not match filter, stopping due to --break-on-reject')
2918 raise
2919 else:
2920 if self.params.get('dump_single_json', False):
2921 self.post_extract(res)
2922 self.to_stdout(json.dumps(self.sanitize_info(res)))
2923
2924 return self._download_retcode
2925
2926 def download_with_info_file(self, info_filename):
2927 with contextlib.closing(fileinput.FileInput(
2928 [info_filename], mode='r',
2929 openhook=fileinput.hook_encoded('utf-8'))) as f:
2930 # FileInput doesn't have a read method, we can't call json.load
2931 info = self.sanitize_info(json.loads('\n'.join(f)), self.params.get('clean_infojson', True))
2932 try:
2933 self.process_ie_result(info, download=True)
2934 except (DownloadError, EntryNotInPlaylist, ThrottledDownload):
2935 webpage_url = info.get('webpage_url')
2936 if webpage_url is not None:
2937 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
2938 return self.download([webpage_url])
2939 else:
2940 raise
2941 return self._download_retcode
2942
2943 @staticmethod
2944 def sanitize_info(info_dict, remove_private_keys=False):
2945 ''' Sanitize the infodict for converting to json '''
2946 if info_dict is None:
2947 return info_dict
2948 info_dict.setdefault('epoch', int(time.time()))
2949 remove_keys = {'__original_infodict'} # Always remove this since this may contain a copy of the entire dict
2950 keep_keys = ['_type'], # Always keep this to facilitate load-info-json
2951 if remove_private_keys:
2952 remove_keys |= {
2953 'requested_formats', 'requested_subtitles', 'requested_entries',
2954 'filepath', 'entries', 'original_url', 'playlist_autonumber',
2955 }
2956 empty_values = (None, {}, [], set(), tuple())
2957 reject = lambda k, v: k not in keep_keys and (
2958 k.startswith('_') or k in remove_keys or v in empty_values)
2959 else:
2960 reject = lambda k, v: k in remove_keys
2961 filter_fn = lambda obj: (
2962 list(map(filter_fn, obj)) if isinstance(obj, (LazyList, list, tuple, set))
2963 else obj if not isinstance(obj, dict)
2964 else dict((k, filter_fn(v)) for k, v in obj.items() if not reject(k, v)))
2965 return filter_fn(info_dict)
2966
2967 @staticmethod
2968 def filter_requested_info(info_dict, actually_filter=True):
2969 ''' Alias of sanitize_info for backward compatibility '''
2970 return YoutubeDL.sanitize_info(info_dict, actually_filter)
2971
2972 def run_pp(self, pp, infodict):
2973 files_to_delete = []
2974 if '__files_to_move' not in infodict:
2975 infodict['__files_to_move'] = {}
2976 files_to_delete, infodict = pp.run(infodict)
2977 if not files_to_delete:
2978 return infodict
2979
2980 if self.params.get('keepvideo', False):
2981 for f in files_to_delete:
2982 infodict['__files_to_move'].setdefault(f, '')
2983 else:
2984 for old_filename in set(files_to_delete):
2985 self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
2986 try:
2987 os.remove(encodeFilename(old_filename))
2988 except (IOError, OSError):
2989 self.report_warning('Unable to remove downloaded original file')
2990 if old_filename in infodict['__files_to_move']:
2991 del infodict['__files_to_move'][old_filename]
2992 return infodict
2993
2994 @staticmethod
2995 def post_extract(info_dict):
2996 def actual_post_extract(info_dict):
2997 if info_dict.get('_type') in ('playlist', 'multi_video'):
2998 for video_dict in info_dict.get('entries', {}):
2999 actual_post_extract(video_dict or {})
3000 return
3001
3002 post_extractor = info_dict.get('__post_extractor') or (lambda: {})
3003 extra = post_extractor().items()
3004 info_dict.update(extra)
3005 info_dict.pop('__post_extractor', None)
3006
3007 original_infodict = info_dict.get('__original_infodict') or {}
3008 original_infodict.update(extra)
3009 original_infodict.pop('__post_extractor', None)
3010
3011 actual_post_extract(info_dict or {})
3012
3013 def pre_process(self, ie_info, key='pre_process', files_to_move=None):
3014 info = dict(ie_info)
3015 info['__files_to_move'] = files_to_move or {}
3016 for pp in self._pps[key]:
3017 info = self.run_pp(pp, info)
3018 return info, info.pop('__files_to_move', None)
3019
3020 def post_process(self, filename, ie_info, files_to_move=None):
3021 """Run all the postprocessors on the given file."""
3022 info = dict(ie_info)
3023 info['filepath'] = filename
3024 info['__files_to_move'] = files_to_move or {}
3025
3026 for pp in ie_info.get('__postprocessors', []) + self._pps['post_process']:
3027 info = self.run_pp(pp, info)
3028 info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
3029 del info['__files_to_move']
3030 for pp in self._pps['after_move']:
3031 info = self.run_pp(pp, info)
3032 return info
3033
3034 def _make_archive_id(self, info_dict):
3035 video_id = info_dict.get('id')
3036 if not video_id:
3037 return
3038 # Future-proof against any change in case
3039 # and backwards compatibility with prior versions
3040 extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist
3041 if extractor is None:
3042 url = str_or_none(info_dict.get('url'))
3043 if not url:
3044 return
3045 # Try to find matching extractor for the URL and take its ie_key
3046 for ie_key, ie in self._ies.items():
3047 if ie.suitable(url):
3048 extractor = ie_key
3049 break
3050 else:
3051 return
3052 return '%s %s' % (extractor.lower(), video_id)
3053
3054 def in_download_archive(self, info_dict):
3055 fn = self.params.get('download_archive')
3056 if fn is None:
3057 return False
3058
3059 vid_id = self._make_archive_id(info_dict)
3060 if not vid_id:
3061 return False # Incomplete video information
3062
3063 return vid_id in self.archive
3064
3065 def record_download_archive(self, info_dict):
3066 fn = self.params.get('download_archive')
3067 if fn is None:
3068 return
3069 vid_id = self._make_archive_id(info_dict)
3070 assert vid_id
3071 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
3072 archive_file.write(vid_id + '\n')
3073 self.archive.add(vid_id)
3074
3075 @staticmethod
3076 def format_resolution(format, default='unknown'):
3077 if format.get('vcodec') == 'none':
3078 if format.get('acodec') == 'none':
3079 return 'images'
3080 return 'audio only'
3081 if format.get('resolution') is not None:
3082 return format['resolution']
3083 if format.get('width') and format.get('height'):
3084 res = '%dx%d' % (format['width'], format['height'])
3085 elif format.get('height'):
3086 res = '%sp' % format['height']
3087 elif format.get('width'):
3088 res = '%dx?' % format['width']
3089 else:
3090 res = default
3091 return res
3092
    def _format_note(self, fdict):
        """Build the free-text 'note' column for the old-style format table.

        Fields are appended in a fixed order; the repeated `if res:` checks
        insert a separator only when something has already been written.
        """
        res = ''
        if fdict.get('ext') in ['f4f', 'f4m']:
            res += '(unsupported) '
        if fdict.get('language'):
            if res:
                res += ' '
            res += '[%s] ' % fdict['language']
        if fdict.get('format_note') is not None:
            res += fdict['format_note'] + ' '
        if fdict.get('tbr') is not None:
            res += '%4dk ' % fdict['tbr']
        if fdict.get('container') is not None:
            if res:
                res += ', '
            res += '%s container' % fdict['container']
        # Video codec / bitrate section
        if (fdict.get('vcodec') is not None
                and fdict.get('vcodec') != 'none'):
            if res:
                res += ', '
            res += fdict['vcodec']
            if fdict.get('vbr') is not None:
                res += '@'
        elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
            # Bitrate known but codec unknown — label the video bitrate explicitly
            res += 'video@'
        if fdict.get('vbr') is not None:
            res += '%4dk' % fdict['vbr']
        if fdict.get('fps') is not None:
            if res:
                res += ', '
            res += '%sfps' % fdict['fps']
        # Audio codec / bitrate section
        if fdict.get('acodec') is not None:
            if res:
                res += ', '
            if fdict['acodec'] == 'none':
                res += 'video only'
            else:
                res += '%-5s' % fdict['acodec']
        elif fdict.get('abr') is not None:
            if res:
                res += ', '
            res += 'audio'
        if fdict.get('abr') is not None:
            res += '@%3dk' % fdict['abr']
        if fdict.get('asr') is not None:
            res += ' (%5dHz)' % fdict['asr']
        # Filesize (exact preferred over approximate)
        if fdict.get('filesize') is not None:
            if res:
                res += ', '
            res += format_bytes(fdict['filesize'])
        elif fdict.get('filesize_approx') is not None:
            if res:
                res += ', '
            res += '~' + format_bytes(fdict['filesize_approx'])
        return res
3148
    def list_formats(self, info_dict):
        """Print the table of available formats for a video.

        Uses the new multi-column table unless disabled via the
        'list-formats' compat option or `listformats_table=False`.
        Formats with preference below -1000 are hidden in both layouts.
        """
        formats = info_dict.get('formats', [info_dict])
        new_format = (
            'list-formats' not in self.params.get('compat_opts', [])
            and self.params.get('listformats_table', True) is not False)
        if new_format:
            table = [
                [
                    format_field(f, 'format_id'),
                    format_field(f, 'ext'),
                    self.format_resolution(f),
                    format_field(f, 'fps', '%d'),
                    '|',
                    format_field(f, 'filesize', ' %s', func=format_bytes) + format_field(f, 'filesize_approx', '~%s', func=format_bytes),
                    format_field(f, 'tbr', '%4dk'),
                    shorten_protocol_name(f.get('protocol', '').replace("native", "n")),
                    '|',
                    format_field(f, 'vcodec', default='unknown').replace('none', ''),
                    format_field(f, 'vbr', '%4dk'),
                    format_field(f, 'acodec', default='unknown').replace('none', ''),
                    format_field(f, 'abr', '%3dk'),
                    format_field(f, 'asr', '%5dHz'),
                    ', '.join(filter(None, (
                        'UNSUPPORTED' if f.get('ext') in ('f4f', 'f4m') else '',
                        format_field(f, 'language', '[%s]'),
                        format_field(f, 'format_note'),
                        format_field(f, 'container', ignore=(None, f.get('ext'))),
                    ))),
                ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
            header_line = ['ID', 'EXT', 'RESOLUTION', 'FPS', '|', ' FILESIZE', ' TBR', 'PROTO',
                           '|', 'VCODEC', ' VBR', 'ACODEC', ' ABR', ' ASR', 'MORE INFO']
        else:
            # Legacy 4-column layout; the note column comes from _format_note
            table = [
                [
                    format_field(f, 'format_id'),
                    format_field(f, 'ext'),
                    self.format_resolution(f),
                    self._format_note(f)]
                for f in formats
                if f.get('preference') is None or f['preference'] >= -1000]
            header_line = ['format code', 'extension', 'resolution', 'note']

        self.to_screen(
            '[info] Available formats for %s:' % info_dict['id'])
        self.to_stdout(render_table(
            header_line, table, delim=new_format, extraGap=(0 if new_format else 1), hideEmpty=new_format))
3195
3196 def list_thumbnails(self, info_dict):
3197 thumbnails = list(info_dict.get('thumbnails'))
3198 if not thumbnails:
3199 self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
3200 return
3201
3202 self.to_screen(
3203 '[info] Thumbnails for %s:' % info_dict['id'])
3204 self.to_stdout(render_table(
3205 ['ID', 'width', 'height', 'URL'],
3206 [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
3207
3208 def list_subtitles(self, video_id, subtitles, name='subtitles'):
3209 if not subtitles:
3210 self.to_screen('%s has no %s' % (video_id, name))
3211 return
3212 self.to_screen(
3213 'Available %s for %s:' % (name, video_id))
3214
3215 def _row(lang, formats):
3216 exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats)))
3217 if len(set(names)) == 1:
3218 names = [] if names[0] == 'unknown' else names[:1]
3219 return [lang, ', '.join(names), ', '.join(exts)]
3220
3221 self.to_stdout(render_table(
3222 ['Language', 'Name', 'Formats'],
3223 [_row(lang, formats) for lang, formats in subtitles.items()],
3224 hideEmpty=True))
3225
3226 def urlopen(self, req):
3227 """ Start an HTTP download """
3228 if isinstance(req, compat_basestring):
3229 req = sanitized_Request(req)
3230 return self._opener.open(req, timeout=self._socket_timeout)
3231
    def print_debug_header(self):
        """Emit '[debug] ...' diagnostics (versions, encodings, libraries,
        proxies) when --verbose is enabled; otherwise do nothing."""
        if not self.params.get('verbose'):
            return

        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        write_string(encoding_str, encoding=None)

        # How this copy is being run: frozen exe, zipimport, source checkout, or installed
        source = (
            '(exe)' if hasattr(sys, 'frozen')
            else '(zip)' if isinstance(globals().get('__loader__'), zipimporter)
            else '(source)' if os.path.basename(sys.argv[0]) == '__main__.py'
            else '')
        self._write_string('[debug] yt-dlp version %s %s\n' % (__version__, source))
        if _LAZY_LOADER:
            self._write_string('[debug] Lazy loading extractors enabled\n')
        if _PLUGIN_CLASSES:
            self._write_string(
                '[debug] Plugin Extractors: %s\n' % [ie.ie_key() for ie in _PLUGIN_CLASSES])
        if self.params.get('compat_opts'):
            self._write_string(
                '[debug] Compatibility options: %s\n' % ', '.join(self.params.get('compat_opts')))
        # Best-effort: report the git commit when running from a checkout
        try:
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = process_communicate_or_kill(sp)
            out = out.decode().strip()
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: %s\n' % out)
        except Exception:
            # sys.exc_clear only exists on Python 2; ignore it elsewhere
            try:
                sys.exc_clear()
            except Exception:
                pass

        def python_implementation():
            # e.g. 'CPython' or 'PyPy version x.y.z'
            impl_name = platform.python_implementation()
            if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'):
                return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3]
            return impl_name

        self._write_string('[debug] Python version %s (%s %s) - %s\n' % (
            platform.python_version(),
            python_implementation(),
            platform.architecture()[0],
            platform_name()))

        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        exe_versions['phantomjs'] = PhantomJSwrapper._version()
        exe_str = ', '.join(
            f'{exe} {v}' for exe, v in sorted(exe_versions.items()) if v
        ) or 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        from .downloader.fragment import can_decrypt_frag
        from .downloader.websocket import has_websockets
        from .postprocessor.embedthumbnail import has_mutagen
        from .cookies import SQLITE_AVAILABLE, KEYRING_AVAILABLE

        lib_str = ', '.join(sorted(filter(None, (
            can_decrypt_frag and 'pycryptodome',
            has_websockets and 'websockets',
            has_mutagen and 'mutagen',
            SQLITE_AVAILABLE and 'sqlite',
            KEYRING_AVAILABLE and 'keyring',
        )))) or 'none'
        self._write_string('[debug] Optional libraries: %s\n' % lib_str)

        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        if self.params.get('call_home', False):
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            return
            # NOTE(review): the version check below is unreachable because of
            # the `return` above — dead code retained from youtube-dl; verify
            # whether it should be removed or re-enabled
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
3326
3327 def _setup_opener(self):
3328 timeout_val = self.params.get('socket_timeout')
3329 self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
3330
3331 opts_cookiesfrombrowser = self.params.get('cookiesfrombrowser')
3332 opts_cookiefile = self.params.get('cookiefile')
3333 opts_proxy = self.params.get('proxy')
3334
3335 self.cookiejar = load_cookies(opts_cookiefile, opts_cookiesfrombrowser, self)
3336
3337 cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
3338 if opts_proxy is not None:
3339 if opts_proxy == '':
3340 proxies = {}
3341 else:
3342 proxies = {'http': opts_proxy, 'https': opts_proxy}
3343 else:
3344 proxies = compat_urllib_request.getproxies()
3345 # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805)
3346 if 'http' in proxies and 'https' not in proxies:
3347 proxies['https'] = proxies['http']
3348 proxy_handler = PerRequestProxyHandler(proxies)
3349
3350 debuglevel = 1 if self.params.get('debug_printtraffic') else 0
3351 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
3352 ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
3353 redirect_handler = YoutubeDLRedirectHandler()
3354 data_handler = compat_urllib_request_DataHandler()
3355
3356 # When passing our own FileHandler instance, build_opener won't add the
3357 # default FileHandler and allows us to disable the file protocol, which
3358 # can be used for malicious purposes (see
3359 # https://github.com/ytdl-org/youtube-dl/issues/8227)
3360 file_handler = compat_urllib_request.FileHandler()
3361
3362 def file_open(*args, **kwargs):
3363 raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in yt-dlp for security reasons')
3364 file_handler.file_open = file_open
3365
3366 opener = compat_urllib_request.build_opener(
3367 proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)
3368
3369 # Delete the default user-agent header, which would otherwise apply in
3370 # cases where our custom HTTP handler doesn't come into play
3371 # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
3372 opener.addheaders = []
3373 self._opener = opener
3374
3375 def encode(self, s):
3376 if isinstance(s, bytes):
3377 return s # Already encoded
3378
3379 try:
3380 return s.encode(self.get_encoding())
3381 except UnicodeEncodeError as err:
3382 err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
3383 raise
3384
3385 def get_encoding(self):
3386 encoding = self.params.get('encoding')
3387 if encoding is None:
3388 encoding = preferredencoding()
3389 return encoding
3390
    def _write_thumbnails(self, info_dict, filename):  # return the extensions
        """Download thumbnail(s) for the video to files alongside `filename`.

        With 'write_all_thumbnails' every available thumbnail is written
        (suffixed with its id when there is more than one); with just
        'writethumbnail' only the first thumbnail that downloads successfully
        is written. Each written thumbnail's path is recorded in
        t['filepath']. Returns the list of extensions written (each including
        the '<id>.' suffix when multiple thumbnails are written).
        """
        write_all = self.params.get('write_all_thumbnails', False)
        thumbnails = []
        if write_all or self.params.get('writethumbnail', False):
            thumbnails = info_dict.get('thumbnails') or []
        multiple = write_all and len(thumbnails) > 1

        ret = []
        # Iterate in reverse - presumably the list is ordered worst-to-best so
        # the preferred thumbnail is tried first; TODO confirm against the
        # extractor-side thumbnail sorting
        for t in thumbnails[::-1]:
            thumb_ext = determine_ext(t['url'], 'jpg')
            suffix = '%s.' % t['id'] if multiple else ''
            thumb_display_id = '%s ' % t['id'] if multiple else ''
            thumb_filename = replace_extension(filename, suffix + thumb_ext, info_dict.get('ext'))

            # An existing file is reused (and still reported in the return
            # value) unless overwrites are enabled
            if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(thumb_filename)):
                ret.append(suffix + thumb_ext)
                t['filepath'] = thumb_filename
                self.to_screen('[%s] %s: Thumbnail %sis already present' %
                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
            else:
                self.to_screen('[%s] %s: Downloading thumbnail %s ...' %
                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
                try:
                    uf = self.urlopen(t['url'])
                    with open(encodeFilename(thumb_filename), 'wb') as thumbf:
                        shutil.copyfileobj(uf, thumbf)
                    ret.append(suffix + thumb_ext)
                    self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
                                   (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
                    t['filepath'] = thumb_filename
                except network_exceptions as err:
                    # Best effort: a failed download only warns, then the next
                    # candidate thumbnail is tried
                    self.report_warning('Unable to download thumbnail "%s": %s' %
                                        (t['url'], error_to_compat_str(err)))
            # Unless all thumbnails were requested, stop after the first success
            if ret and not write_all:
                break
        return ret