#!/usr/bin/env python
# coding: utf-8

from __future__ import absolute_import, unicode_literals

import collections
import contextlib
import copy
import datetime
import errno
import fileinput
import io
import itertools
import json
import locale
import operator
import os
import platform
import re
import shutil
import subprocess
import socket
import sys
import time
import tokenize
import traceback
import random

from string import ascii_letters
from zipimport import zipimporter

from .compat import (
    compat_basestring,
    compat_cookiejar,
    compat_get_terminal_size,
    compat_http_client,
    compat_kwargs,
    compat_numeric_types,
    compat_os_name,
    compat_str,
    compat_tokenize_tokenize,
    compat_urllib_error,
    compat_urllib_request,
    compat_urllib_request_DataHandler,
)
from .utils import (
    age_restricted,
    args_to_str,
    ContentTooShortError,
    date_from_str,
    DateRange,
    DEFAULT_OUTTMPL,
    OUTTMPL_TYPES,
    determine_ext,
    determine_protocol,
    DOT_DESKTOP_LINK_TEMPLATE,
    DOT_URL_LINK_TEMPLATE,
    DOT_WEBLOC_LINK_TEMPLATE,
    DownloadError,
    encode_compat_str,
    encodeFilename,
    error_to_compat_str,
    ExistingVideoReached,
    expand_path,
    ExtractorError,
    float_or_none,
    format_bytes,
    format_field,
    formatSeconds,
    GeoRestrictedError,
    int_or_none,
    iri_to_uri,
    ISO3166Utils,
    locked_file,
    make_dir,
    make_HTTPS_handler,
    MaxDownloadsReached,
    orderedSet,
    PagedList,
    parse_filesize,
    PerRequestProxyHandler,
    platform_name,
    PostProcessingError,
    preferredencoding,
    prepend_extension,
    register_socks_protocols,
    render_table,
    replace_extension,
    RejectedVideoReached,
    SameFileError,
    sanitize_filename,
    sanitize_path,
    sanitize_url,
    sanitized_Request,
    std_headers,
    str_or_none,
    strftime_or_none,
    subtitles_filename,
    to_high_limit_path,
    UnavailableVideoError,
    url_basename,
    version_tuple,
    write_json_file,
    write_string,
    YoutubeDLCookieJar,
    YoutubeDLCookieProcessor,
    YoutubeDLHandler,
    YoutubeDLRedirectHandler,
    process_communicate_or_kill,
)
from .cache import Cache
from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER, _PLUGIN_CLASSES
from .extractor.openload import PhantomJSwrapper
from .downloader import get_suitable_downloader
from .downloader.rtmp import rtmpdump_version
from .postprocessor import (
    FFmpegFixupM3u8PP,
    FFmpegFixupM4aPP,
    FFmpegFixupStretchedPP,
    FFmpegMergerPP,
    FFmpegPostProcessor,
    # FFmpegSubtitlesConvertorPP,
    get_postprocessor,
    MoveFilesAfterDownloadPP,
)
from .version import __version__

if compat_os_name == 'nt':
    import ctypes


class YoutubeDL(object):
    """YoutubeDL class.

    YoutubeDL objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. Since the downloader, given a video URL, does not know how
    to extract all the needed information (that is the task of the
    InfoExtractors), it has to pass the URL to one of them.

    For this, YoutubeDL objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object hands it over to the first InfoExtractor
    it finds that reports being able to handle it. The InfoExtractor
    extracts all the information about the video or videos the URL
    refers to, and YoutubeDL processes the extracted information,
    possibly using a File Downloader to download the video.

    YoutubeDL objects accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The YoutubeDL also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

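    A minimal usage sketch (the URL below is a placeholder; the option
    keys used here are documented under "Available options"):

        from yt_dlp import YoutubeDL

        with YoutubeDL({'format': 'bestvideo+bestaudio/best'}) as ydl:
            ydl.extract_info('https://example.com/some/video', download=True)
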
    Available options:

    username:          Username for authentication purposes.
    password:          Password for authentication purposes.
    videopassword:     Password for accessing a video.
    ap_mso:            Adobe Pass multiple-system operator identifier.
    ap_username:       Multiple-system operator account username.
    ap_password:       Multiple-system operator account password.
    usenetrc:          Use netrc for authentication instead.
    verbose:           Print additional info to stdout.
    quiet:             Do not print messages to stdout.
    no_warnings:       Do not print out anything for warnings.
    forceurl:          Force printing final URL.
    forcetitle:        Force printing title.
    forceid:           Force printing ID.
    forcethumbnail:    Force printing thumbnail URL.
    forcedescription:  Force printing description.
    forcefilename:     Force printing final filename.
    forceduration:     Force printing duration.
    forcejson:         Force printing info_dict as JSON.
    dump_single_json:  Force printing the info_dict of the whole playlist
                       (or video) as a single JSON line.
    force_write_download_archive: Force writing download archive regardless
                       of 'skip_download' or 'simulate'.
    simulate:          Do not download the video files.
    format:            Video format code. See "FORMAT SELECTION" for more details.
    allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded.
    format_sort:       How to sort the video formats. See "Sorting Formats"
                       for more details.
    format_sort_force: Force the given format_sort. See "Sorting Formats"
                       for more details.
    allow_multiple_video_streams: Allow multiple video streams to be merged
                       into a single file
    allow_multiple_audio_streams: Allow multiple audio streams to be merged
                       into a single file
    paths:             Dictionary of output paths. The allowed keys are 'home',
                       'temp' and the keys of OUTTMPL_TYPES (in utils.py)
    outtmpl:           Dictionary of templates for output names. Allowed keys
                       are 'default' and the keys of OUTTMPL_TYPES (in utils.py).
                       A string is also accepted for backward compatibility
    outtmpl_na_placeholder: Placeholder for unavailable meta fields.
    restrictfilenames: Do not allow "&" and spaces in file names
    trim_file_name:    Limit length of filename (extension excluded)
    windowsfilenames:  Force the filenames to be windows compatible
    ignoreerrors:      Do not stop on download errors
                       (Default True when running yt-dlp,
                       but False when directly accessing YoutubeDL class)
    force_generic_extractor: Force downloader to use the generic extractor
    overwrites:        Overwrite all video and metadata files if True,
                       overwrite only non-video files if None
                       and don't overwrite any file if False
    playliststart:     Playlist item to start at.
    playlistend:       Playlist item to end at.
    playlist_items:    Specific indices of playlist to download.
    playlistreverse:   Download playlist items in reverse order.
    playlistrandom:    Download playlist items in random order.
    matchtitle:        Download only matching titles.
    rejecttitle:       Reject downloads for matching titles.
    logger:            Log messages to a logging.Logger instance.
    logtostderr:       Log messages to stderr instead of stdout.
    writedescription:  Write the video description to a .description file
    writeinfojson:     Write the video metadata to a .info.json file
    clean_infojson:    Remove private fields from the infojson
    writecomments:     Extract video comments. These will not be written to disk
                       unless writeinfojson is also given
    writeannotations:  Write the video annotations to a .annotations.xml file
    writethumbnail:    Write the thumbnail image to a file
    allow_playlist_files: Whether to write playlists' description, infojson etc
                       also to disk when using the 'write*' options
    write_all_thumbnails: Write all thumbnail formats to files
    writelink:         Write an internet shortcut file, depending on the
                       current platform (.url/.webloc/.desktop)
    writeurllink:      Write a Windows internet shortcut file (.url)
    writewebloclink:   Write a macOS internet shortcut file (.webloc)
    writedesktoplink:  Write a Linux internet shortcut file (.desktop)
    writesubtitles:    Write the video subtitles to a file
    writeautomaticsub: Write the automatically generated subtitles to a file
    allsubtitles:      Downloads all the subtitles of the video
                       (requires writesubtitles or writeautomaticsub)
    listsubtitles:     Lists all available subtitles for the video
    subtitlesformat:   The format code for subtitles
    subtitleslangs:    List of languages of the subtitles to download
    keepvideo:         Keep the video file after post-processing
    daterange:         A DateRange object; download only if the upload_date is in the range.
    skip_download:     Skip the actual download of the video file
    cachedir:          Location of the cache files in the filesystem.
                       False to disable filesystem cache.
    noplaylist:        Download single video instead of a playlist if in doubt.
    age_limit:         An integer representing the user's age in years.
                       Unsuitable videos for the given age are skipped.
    min_views:         An integer representing the minimum view count the video
                       must have in order to not be skipped.
                       Videos without view count information are always
                       downloaded. None for no limit.
    max_views:         An integer representing the maximum view count.
                       Videos that are more popular than that are not
                       downloaded.
                       Videos without view count information are always
                       downloaded. None for no limit.
    download_archive:  File name of a file where all downloads are recorded.
                       Videos already present in the file are not downloaded
                       again.
    break_on_existing: Stop the download process after attempting to download a
                       file that is in the archive.
    break_on_reject:   Stop the download process when encountering a video that
                       has been filtered out.
    cookiefile:        File name where cookies should be read from and dumped to
    nocheckcertificate: Do not verify SSL certificates
    prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
                       At the moment, this is only supported by YouTube.
    proxy:             URL of the proxy server to use
    geo_verification_proxy: URL of the proxy to use for IP address verification
                       on geo-restricted sites.
    socket_timeout:    Time to wait for unresponsive hosts, in seconds
    bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
    debug_printtraffic: Print out sent and received HTTP traffic
    include_ads:       Download ads as well
    default_search:    Prepend this string if an input url is not valid.
                       'auto' for elaborate guessing
    encoding:          Use this encoding instead of the system-specified.
    extract_flat:      Do not resolve URLs, return the immediate result.
                       Pass in 'in_playlist' to only show this behavior for
                       playlist items.
    postprocessors:    A list of dictionaries, each with an entry
                       * key: The name of the postprocessor. See
                         yt_dlp/postprocessor/__init__.py for a list.
                       * _after_move: Optional. If True, run this post_processor
                         after 'MoveFilesAfterDownload'
                       as well as any further keyword arguments for the
                       postprocessor.
    post_hooks:        A list of functions that get called as the final step
                       for each video file, after all postprocessors have been
                       called. The filename will be passed as the only argument.
    progress_hooks:    A list of functions that get called on download
                       progress, with a dictionary with the entries
                       * status: One of "downloading", "error", or "finished".
                         Check this first and ignore unknown values.

                       If status is one of "downloading" or "finished", the
                       following properties may also be present:
                       * filename: The final filename (always present)
                       * tmpfilename: The filename we're currently writing to
                       * downloaded_bytes: Bytes on disk
                       * total_bytes: Size of the whole file, None if unknown
                       * total_bytes_estimate: Guess of the eventual file size,
                         None if unavailable.
                       * elapsed: The number of seconds since download started.
                       * eta: The estimated time in seconds, None if unknown
                       * speed: The download speed in bytes/second, None if
                         unknown
                       * fragment_index: The counter of the currently
                         downloaded video fragment.
                       * fragment_count: The number of fragments (= individual
                         files that will be merged)

                       Progress hooks are guaranteed to be called at least once
                       (with status "finished") if the download is successful.
                       A small example hook is sketched at the end of this
                       docstring.
    merge_output_format: Extension to use when merging formats.
    final_ext:         Expected final extension; used to detect when the file was
                       already downloaded and converted. "merge_output_format" is
                       replaced by this extension when given
    fixup:             Automatically correct known faults of the file.
                       One of:
                       - "never": do nothing
                       - "warn": only emit a warning
                       - "detect_or_warn": check whether we can do anything
                         about it, warn otherwise (default)
    source_address:    Client-side IP address to bind to.
    call_home:         Boolean, true iff we are allowed to contact the
                       yt-dlp servers for debugging. (BROKEN)
    sleep_interval_requests: Number of seconds to sleep between requests
                       during extraction
    sleep_interval:    Number of seconds to sleep before each download when
                       used alone or a lower bound of a range for randomized
                       sleep before each download (minimum possible number
                       of seconds to sleep) when used along with
                       max_sleep_interval.
    max_sleep_interval: Upper bound of a range for randomized sleep before each
                       download (maximum possible number of seconds to sleep).
                       Must only be used along with sleep_interval.
                       Actual sleep time will be a random float from range
                       [sleep_interval; max_sleep_interval].
    sleep_interval_subtitles: Number of seconds to sleep before each subtitle download
    listformats:       Print an overview of available video formats and exit.
    list_thumbnails:   Print a table of all thumbnails and exit.
    match_filter:      A function that gets called with the info_dict of
                       every video.
                       If it returns a message, the video is ignored.
                       If it returns None, the video is downloaded.
                       match_filter_func in utils.py is one example for this;
                       another small sketch appears at the end of this
                       docstring.
    no_color:          Do not emit color codes in output.
    geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
                       HTTP header
    geo_bypass_country:
                       Two-letter ISO 3166-1 alpha-2 country code that will be
                       used for explicit geographic restriction bypassing via
                       faking the X-Forwarded-For HTTP header
    geo_bypass_ip_block:
                       IP range in CIDR notation that will be used similarly to
                       geo_bypass_country

    The following options determine which downloader is picked:
    external_downloader: Executable of the external downloader to call.
                       None or unset for standard (built-in) downloader.
    hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
                       if True; use ffmpeg/avconv if False; if None, use the
                       downloader suggested by the extractor.

    The following parameters are not used by YoutubeDL itself; they are used by
    the downloader (see yt_dlp/downloader/common.py):
    nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
    noresizebuffer, retries, continuedl, noprogress, consoletitle,
    xattr_set_filesize, external_downloader_args, hls_use_mpegts,
    http_chunk_size.

    The following options are used by the post processors:
    prefer_ffmpeg:     If False, use avconv instead of ffmpeg if both are available,
                       otherwise prefer ffmpeg. (avconv support is deprecated)
    ffmpeg_location:   Location of the ffmpeg/avconv binary; either the path
                       to the binary or its containing directory.
    postprocessor_args: A dictionary of postprocessor/executable keys (in lower case)
                       and a list of additional command-line arguments for the
                       postprocessor/executable. The dict can also have "PP+EXE" keys
                       which are used when the given exe is used by the given PP.
                       Use 'default' as the name for arguments to be passed to all PP

    The following options are used by the extractors:
    extractor_retries: Number of times to retry for known errors
    dynamic_mpd:       Whether to process dynamic DASH manifests (default: True)
    hls_split_discontinuity: Split HLS playlists to different formats at
                       discontinuities such as ad breaks (default: False)
    youtube_include_dash_manifest: If True (default), DASH manifests and related
                       data will be downloaded and processed by extractor.
                       You can reduce network I/O by disabling it if you don't
                       care about DASH. (only for youtube)
    youtube_include_hls_manifest: If True (default), HLS manifests and related
                       data will be downloaded and processed by extractor.
                       You can reduce network I/O by disabling it if you don't
                       care about HLS. (only for youtube)
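
    For example (a sketch only; both callables are hypothetical helpers,
    wired up through the documented 'progress_hooks' and 'match_filter'
    options):

        def my_hook(d):
            if d['status'] == 'finished':
                print('Finished downloading %s' % d['filename'])

        def my_filter(info_dict):
            duration = info_dict.get('duration')
            if duration and duration > 3600:
                return 'Skipping %s: longer than an hour' % info_dict.get('title')
            return None  # None means the video is downloaded

        ydl = YoutubeDL({'progress_hooks': [my_hook], 'match_filter': my_filter})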
397 """
398
399 _NUMERIC_FIELDS = set((
400 'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
401 'timestamp', 'upload_year', 'upload_month', 'upload_day',
402 'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
403 'average_rating', 'comment_count', 'age_limit',
404 'start_time', 'end_time',
405 'chapter_number', 'season_number', 'episode_number',
406 'track_number', 'disc_number', 'release_year',
407 'playlist_index',
408 ))
409
410 params = None
411 _ies = []
412 _pps = {'beforedl': [], 'aftermove': [], 'normal': []}
413 __prepare_filename_warned = False
414 _first_webpage_request = True
415 _download_retcode = None
416 _num_downloads = None
417 _playlist_level = 0
418 _playlist_urls = set()
419 _screen_file = None
420
    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options."""
        if params is None:
            params = {}
        self._ies = []
        self._ies_instances = {}
        self._pps = {'beforedl': [], 'aftermove': [], 'normal': []}
        self.__prepare_filename_warned = False
        self._first_webpage_request = True
        self._post_hooks = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = {
            # Default parameters
            'nocheckcertificate': False,
        }
        self.params.update(params)
        self.cache = Cache(self)
        self.archive = set()

444 """Preload the archive, if any is specified"""
445 def preload_download_archive(self):
446 fn = self.params.get('download_archive')
447 if fn is None:
448 return False
449 try:
450 with locked_file(fn, 'r', encoding='utf-8') as archive_file:
451 for line in archive_file:
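                        # Each stripped line is one archive entry; in practice
                        # entries are '<extractor> <video id>' pairs (what
                        # in_download_archive checks against), though any
                        # line is accepted verbatim here.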
                        self.archive.add(line.strip())
            except IOError as ioe:
                if ioe.errno != errno.ENOENT:
                    raise
                return False
            return True

        def check_deprecated(param, option, suggestion):
            if self.params.get(param) is not None:
                self.report_warning(
                    '%s is deprecated. Use %s instead.' % (option, suggestion))
                return True
            return False

        if self.params.get('verbose'):
            self.to_stdout('[debug] Loading archive file %r' % self.params.get('download_archive'))

        preload_download_archive(self)

        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        if self.params.get('final_ext'):
            if self.params.get('merge_output_format'):
                self.report_warning('--merge-output-format will be ignored since --remux-video or --recode-video is given')
            self.params['merge_output_format'] = self.params['final_ext']

        if 'overwrites' in self.params and self.params['overwrites'] is None:
            del self.params['overwrites']

        check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N is the number of digits')
        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')

        if params.get('bidi_workaround', False):
            try:
                import pty
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround. Make sure that fribidi is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.platform != 'win32'
                and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
                and not params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        self.outtmpl_dict = self.parse_outtmpl()

        self._setup_opener()

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        for pp_def_raw in self.params.get('postprocessors', []):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
            if 'when' in pp_def:
                when = pp_def['when']
                del pp_def['when']
            else:
                when = 'normal'
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp, when=when)

        for ph in self.params.get('post_hooks', []):
            self.add_post_hook(ph)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)

        register_socks_protocols()

    def warn_if_short_id(self, argv):
        # short YouTube ID starting with dash?
        idxs = [
            i for i, a in enumerate(argv)
            if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
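        # e.g. a stray '-abcdefghij' (a placeholder 11-char ID starting
        # with '-') is collected above and moved after a '--' separator below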
        if idxs:
            correct_argv = (
                ['yt-dlp']
                + [a for i, a in enumerate(argv) if i not in idxs]
                + ['--'] + [argv[i] for i in idxs]
            )
            self.report_warning(
                'Long argument string detected. '
                'Use -- to separate parameters and URLs, like this:\n%s\n' %
                args_to_str(correct_argv))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        if not isinstance(ie, type):
            self._ies_instances[ie.ie_key()] = ie
            ie.set_downloader(self)

    def get_info_extractor(self, ie_key):
        """
        Get an instance of an IE with name ie_key; it will try to get one from
        the _ies list, and if there is no instance it will create a new one and
        add it to the extractor list.
        """
        ie = self._ies_instances.get(ie_key)
        if ie is None:
            ie = get_info_extractor(ie_key)()
            self.add_info_extractor(ie)
        return ie

    def add_default_info_extractors(self):
        """
        Add the InfoExtractors returned by gen_extractors to the end of the list
        """
        for ie in gen_extractor_classes():
            self.add_info_extractor(ie)

    def add_post_processor(self, pp, when='normal'):
        """Add a PostProcessor object to the end of the chain."""
        self._pps[when].append(pp)
        pp.set_downloader(self)

    def add_post_hook(self, ph):
        """Add the post hook"""
        self._post_hooks.append(ph)

    def add_progress_hook(self, ph):
        """Add the progress hook (currently only for the file downloader)"""
        self._progress_hooks.append(ph)

    def _bidi_workaround(self, message):
        if not hasattr(self, '_output_channel'):
            return message

        assert hasattr(self, '_output_process')
        assert isinstance(message, compat_str)
        line_count = message.count('\n') + 1
        self._output_process.stdin.write((message + '\n').encode('utf-8'))
        self._output_process.stdin.flush()
        res = ''.join(self._output_channel.readline().decode('utf-8')
                      for _ in range(line_count))
        return res[:-len('\n')]

    def to_screen(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        return self.to_stdout(message, skip_eol, check_quiet=True)

    def _write_string(self, s, out=None):
        write_string(s, out=out, encoding=self.params.get('encoding'))

    def to_stdout(self, message, skip_eol=False, check_quiet=False):
        """Print message to stdout if not in quiet mode."""
        if self.params.get('logger'):
            self.params['logger'].debug(message)
        elif not check_quiet or not self.params.get('quiet', False):
            message = self._bidi_workaround(message)
            terminator = ['\n', ''][skip_eol]
            output = message + terminator

            self._write_string(output, self._screen_file)

    def to_stderr(self, message):
        """Print message to stderr."""
        assert isinstance(message, compat_str)
        if self.params.get('logger'):
            self.params['logger'].error(message)
        else:
            message = self._bidi_workaround(message)
            output = message + '\n'
            self._write_string(output, self._err_file)

    def to_console_title(self, message):
        if not self.params.get('consoletitle', False):
            return
        if compat_os_name == 'nt':
            if ctypes.windll.kernel32.GetConsoleWindow():
                # c_wchar_p() might not be necessary if `message` is
                # already of type unicode()
                ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            self._write_string('\033]0;%s\007' % message, self._screen_file)

    def save_console_title(self):
        if not self.params.get('consoletitle', False):
            return
        if self.params.get('simulate', False):
            return
        if compat_os_name != 'nt' and 'TERM' in os.environ:
            # Save the title on stack
            self._write_string('\033[22;0t', self._screen_file)

    def restore_console_title(self):
        if not self.params.get('consoletitle', False):
            return
        if self.params.get('simulate', False):
            return
        if compat_os_name != 'nt' and 'TERM' in os.environ:
            # Restore the title from stack
            self._write_string('\033[23;0t', self._screen_file)

    def __enter__(self):
        self.save_console_title()
        return self

    def __exit__(self, *args):
        self.restore_console_title()

        if self.params.get('cookiefile') is not None:
            self.cookiejar.save(ignore_discard=True, ignore_expires=True)

    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on whether the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        self._download_retcode = 1

    def report_warning(self, message):
        '''
        Print the message to stderr; it will be prefixed with 'WARNING:'.
        If stderr is a tty file the 'WARNING:' will be colored.
        '''
        if self.params.get('logger') is not None:
            self.params['logger'].warning(message)
        else:
            if self.params.get('no_warnings'):
                return
            if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
                _msg_header = '\033[0;33mWARNING:\033[0m'
            else:
                _msg_header = 'WARNING:'
            warning_message = '%s %s' % (_msg_header, message)
            self.to_stderr(warning_message)

    def report_error(self, message, tb=None):
        '''
        Do the same as trouble, but prefixes the message with 'ERROR:', colored
        in red if stderr is a tty file.
        '''
        if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
            _msg_header = '\033[0;31mERROR:\033[0m'
        else:
            _msg_header = 'ERROR:'
        error_message = '%s %s' % (_msg_header, message)
        self.trouble(error_message, tb)

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen('[download] %s has already been downloaded' % file_name)
        except UnicodeEncodeError:
            self.to_screen('[download] The file has already been downloaded')

    def report_file_delete(self, file_name):
        """Report that existing file will be deleted."""
        try:
            self.to_screen('Deleting existing file %s' % file_name)
        except UnicodeEncodeError:
            self.to_screen('Deleting existing file')

    def parse_outtmpl(self):
        outtmpl_dict = self.params.get('outtmpl', {})
        if not isinstance(outtmpl_dict, dict):
            outtmpl_dict = {'default': outtmpl_dict}
        outtmpl_dict.update({
            k: v for k, v in DEFAULT_OUTTMPL.items()
            if not outtmpl_dict.get(k)})
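        # e.g. a plain-string outtmpl '%(title)s.%(ext)s' has been wrapped
        # into {'default': '%(title)s.%(ext)s'} above, and any keys the user
        # did not set fall back to the corresponding DEFAULT_OUTTMPL values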
        for key, val in outtmpl_dict.items():
            if isinstance(val, bytes):
                self.report_warning(
                    'Parameter outtmpl is bytes, but should be a unicode string. '
                    'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.')
        return outtmpl_dict

    def _prepare_filename(self, info_dict, tmpl_type='default'):
        try:
            template_dict = dict(info_dict)

            template_dict['duration_string'] = (  # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
                formatSeconds(info_dict['duration'], '-')
                if info_dict.get('duration', None) is not None
                else None)

            template_dict['epoch'] = int(time.time())
            autonumber_size = self.params.get('autonumber_size')
            if autonumber_size is None:
                autonumber_size = 5
            template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
            if template_dict.get('resolution') is None:
                if template_dict.get('width') and template_dict.get('height'):
                    template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
                elif template_dict.get('height'):
                    template_dict['resolution'] = '%sp' % template_dict['height']
                elif template_dict.get('width'):
                    template_dict['resolution'] = '%dx?' % template_dict['width']

            sanitize = lambda k, v: sanitize_filename(
                compat_str(v),
                restricted=self.params.get('restrictfilenames'),
                is_id=(k == 'id' or k.endswith('_id')))
            template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
                                 for k, v in template_dict.items()
                                 if v is not None and not isinstance(v, (list, tuple, dict)))
            na = self.params.get('outtmpl_na_placeholder', 'NA')
            template_dict = collections.defaultdict(lambda: na, template_dict)

            outtmpl = self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default'])
            force_ext = OUTTMPL_TYPES.get(tmpl_type)

            # For fields playlist_index and autonumber convert all occurrences
            # of %(field)s to %(field)0Nd for backward compatibility
            field_size_compat_map = {
                'playlist_index': len(str(template_dict['n_entries'])),
                'autonumber': autonumber_size,
            }
            FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
            mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
            if mobj:
                outtmpl = re.sub(
                    FIELD_SIZE_COMPAT_RE,
                    r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
                    outtmpl)
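            # e.g. a bare '%(autonumber)s' has now become '%(autonumber)05d'
            # (using the default autonumber_size of 5 set above)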

            # As of [1] format syntax is:
            #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
            # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
            FORMAT_RE = r'''(?x)
                (?<!%)
                %
                \({0}\)  # mapping key
                (?:[#0\-+ ]+)?  # conversion flags (optional)
                (?:\d+)?  # minimum field width (optional)
                (?:\.\d+)?  # precision (optional)
                [hlL]?  # length modifier (optional)
                (?P<type>[diouxXeEfFgGcrs%])  # conversion type
            '''
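            # e.g. FORMAT_RE.format('title') matches '%(title).50s' but not
            # '%%(title)s' (the lookbehind above skips escaped percent signs)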

            numeric_fields = list(self._NUMERIC_FIELDS)

            # Format date
            FORMAT_DATE_RE = FORMAT_RE.format(r'(?P<key>(?P<field>\w+)>(?P<format>.+?))')
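            # This implements the '%(field>strftime_format)s' syntax: e.g. a
            # template containing '%(upload_date>%Y-%m-%d)s' is resolved
            # through strftime_or_none below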
            for mobj in re.finditer(FORMAT_DATE_RE, outtmpl):
                conv_type, field, frmt, key = mobj.group('type', 'field', 'format', 'key')
                if key in template_dict:
                    continue
                value = strftime_or_none(template_dict.get(field), frmt, na)
                if conv_type in 'crs':  # string
                    value = sanitize(field, value)
                else:  # number
                    numeric_fields.append(key)
                    value = float_or_none(value, default=None)
                if value is not None:
                    template_dict[key] = value

            # Missing numeric fields used together with integer presentation types
            # in format specification will break the argument substitution since
            # string NA placeholder is returned for missing fields. We will patch
            # output template for missing fields to meet string presentation type.
            for numeric_field in numeric_fields:
                if numeric_field not in template_dict:
                    outtmpl = re.sub(
                        FORMAT_RE.format(re.escape(numeric_field)),
                        r'%({0})s'.format(numeric_field), outtmpl)

            # expand_path translates '%%' into '%' and '$$' into '$'
            # correspondingly that is not what we want since we need to keep
            # '%%' intact for template dict substitution step. Working around
            # with boundary-alike separator hack.
            sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
            outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))

            # outtmpl should be expand_path'ed before template dict substitution
            # because meta fields may contain env variables we don't want to
            # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
            # title "Hello $PATH", we don't want `$PATH` to be expanded.
            filename = expand_path(outtmpl).replace(sep, '') % template_dict

            if force_ext is not None:
                filename = replace_extension(filename, force_ext, template_dict.get('ext'))

            # https://github.com/blackjack4494/youtube-dlc/issues/85
            trim_file_name = self.params.get('trim_file_name', False)
            if trim_file_name:
                fn_groups = filename.rsplit('.')
                ext = fn_groups[-1]
                sub_ext = ''
                if len(fn_groups) > 2:
                    sub_ext = fn_groups[-2]
                filename = '.'.join(filter(None, [fn_groups[0][:trim_file_name], sub_ext, ext]))
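                # e.g. with trim_file_name=10, 'a very long title.en.vtt'
                # becomes 'a very lon.en.vtt' (base name truncated, the last
                # two dot-suffixes kept)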

            return filename
        except ValueError as err:
            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
            return None

    def prepare_filename(self, info_dict, dir_type='', warn=False):
        """Generate the output filename."""
        paths = self.params.get('paths', {})
        assert isinstance(paths, dict)
        filename = self._prepare_filename(info_dict, dir_type or 'default')

        if warn and not self.__prepare_filename_warned:
            if not paths:
                pass
            elif filename == '-':
                self.report_warning('--paths is ignored when outputting to stdout')
            elif os.path.isabs(filename):
                self.report_warning('--paths is ignored since an absolute path is given in output template')
            self.__prepare_filename_warned = True
        if filename == '-' or not filename:
            return filename

        homepath = expand_path(paths.get('home', '').strip())
        assert isinstance(homepath, compat_str)
        subdir = expand_path(paths.get(dir_type, '').strip()) if dir_type else ''
        assert isinstance(subdir, compat_str)
        path = os.path.join(homepath, subdir, filename)

        # Temporary fix for #4787
        # 'Treat' all problem characters by passing filename through preferredencoding
        # to workaround encoding issues with subprocess on python2 @ Windows
        if sys.version_info < (3, 0) and sys.platform == 'win32':
            path = encodeFilename(path, True).decode(preferredencoding())
        return sanitize_path(path, force=self.params.get('windowsfilenames'))

    def _match_entry(self, info_dict, incomplete):
        """ Returns None if the file should be downloaded """

        def check_filter():
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            if 'title' in info_dict:
                # This can happen when we're just evaluating the playlist
                title = info_dict['title']
                matchtitle = self.params.get('matchtitle', False)
                if matchtitle:
                    if not re.search(matchtitle, title, re.IGNORECASE):
                        return '"' + title + '" title did not match pattern "' + matchtitle + '"'
                rejecttitle = self.params.get('rejecttitle', False)
                if rejecttitle:
                    if re.search(rejecttitle, title, re.IGNORECASE):
                        return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
            date = info_dict.get('upload_date')
            if date is not None:
                dateRange = self.params.get('daterange', DateRange())
                if date not in dateRange:
                    return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
            view_count = info_dict.get('view_count')
            if view_count is not None:
                min_views = self.params.get('min_views')
                if min_views is not None and view_count < min_views:
                    return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
                max_views = self.params.get('max_views')
                if max_views is not None and view_count > max_views:
                    return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
            if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
                return 'Skipping "%s" because it is age restricted' % video_title
            if self.in_download_archive(info_dict):
                return '%s has already been recorded in the archive' % video_title

            if not incomplete:
                match_filter = self.params.get('match_filter')
                if match_filter is not None:
                    ret = match_filter(info_dict)
                    if ret is not None:
                        return ret
            return None

        reason = check_filter()
        if reason is not None:
            self.to_screen('[download] ' + reason)
            if reason.endswith('has already been recorded in the archive') and self.params.get('break_on_existing', False):
                raise ExistingVideoReached()
            elif self.params.get('break_on_reject', False):
                raise RejectedVideoReached()
        return reason

    @staticmethod
    def add_extra_info(info_dict, extra_info):
        '''Set the keys from extra_info in info dict if they are missing'''
        for key, value in extra_info.items():
            info_dict.setdefault(key, value)

    def extract_info(self, url, download=True, ie_key=None, info_dict=None, extra_info={},
                     process=True, force_generic_extractor=False):
        '''
        Returns a list with a dictionary for each video we find.
        If 'download', also downloads the videos.
        extra_info is a dict containing the extra values to add to each result
        '''

        if not ie_key and force_generic_extractor:
            ie_key = 'Generic'

        if ie_key:
            ies = [self.get_info_extractor(ie_key)]
        else:
            ies = self._ies

        for ie in ies:
            if not ie.suitable(url):
                continue

            ie_key = ie.ie_key()
            ie = self.get_info_extractor(ie_key)
            if not ie.working():
                self.report_warning('The program functionality for this site has been marked as broken, '
                                    'and will probably not work.')

            try:
                temp_id = str_or_none(
                    ie.extract_id(url) if callable(getattr(ie, 'extract_id', None))
                    else ie._match_id(url))
            except (AssertionError, IndexError, AttributeError):
                temp_id = None
            if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
                self.to_screen("[%s] %s: has already been recorded in archive" % (
                    ie_key, temp_id))
                break
            return self.__extract_info(url, ie, download, extra_info, process, info_dict)
        else:
            self.report_error('no suitable InfoExtractor for URL %s' % url)

    def __handle_extraction_exceptions(func):
        def wrapper(self, *args, **kwargs):
            try:
                return func(self, *args, **kwargs)
            except GeoRestrictedError as e:
                msg = e.msg
                if e.countries:
                    msg += '\nThis video is available in %s.' % ', '.join(
                        map(ISO3166Utils.short2full, e.countries))
                msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to work around this.'
                self.report_error(msg)
            except ExtractorError as e:  # An error we somewhat expected
                self.report_error(compat_str(e), e.format_traceback())
            except (MaxDownloadsReached, ExistingVideoReached, RejectedVideoReached):
                raise
            except Exception as e:
                if self.params.get('ignoreerrors', False):
                    self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
                else:
                    raise
        return wrapper

    @__handle_extraction_exceptions
    def __extract_info(self, url, ie, download, extra_info, process, info_dict):
        ie_result = ie.extract(url)
        if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
            return
        if isinstance(ie_result, list):
            # Backwards compatibility: old IE result format
            ie_result = {
                '_type': 'compat_list',
                'entries': ie_result,
            }
        if info_dict:
            if info_dict.get('id'):
                ie_result['id'] = info_dict['id']
            if info_dict.get('title'):
                ie_result['title'] = info_dict['title']
        self.add_default_extra_info(ie_result, ie, url)
        if process:
            return self.process_ie_result(ie_result, download, extra_info)
        else:
            return ie_result

    def add_default_extra_info(self, ie_result, ie, url):
        self.add_extra_info(ie_result, {
            'extractor': ie.IE_NAME,
            'webpage_url': url,
            'webpage_url_basename': url_basename(url),
            'extractor_key': ie.ie_key(),
        })

    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie (may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.
        """
        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            ie_result['url'] = sanitize_url(ie_result['url'])
            extract_flat = self.params.get('extract_flat', False)
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
                    or extract_flat is True):
                self.__forced_printings(ie_result, self.prepare_filename(ie_result), incomplete=True)
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download, info_dict=ie_result,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # extract_info may return None when ignoreerrors is enabled and
            # extraction failed with an error, don't crash and return early
            # in this case
            if not info:
                return info

            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            # Extracted info may not be a video result (i.e.
            # info.get('_type', 'video') != video) but rather an url or
            # url_transparent. In such cases outer metadata (from ie_result)
            # should be propagated to inner one (info). For this to happen
            # _type of info should be overridden with url_transparent. This
            # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
            if new_result.get('_type') == 'url':
                new_result['_type'] = 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type in ('playlist', 'multi_video'):
            # Protect from infinite recursion due to recursively nested playlists
            # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
            webpage_url = ie_result['webpage_url']
            if webpage_url in self._playlist_urls:
                self.to_screen(
                    '[download] Skipping already downloaded playlist: %s'
                    % (ie_result.get('title') or ie_result.get('id')))
                return

            self._playlist_level += 1
            self._playlist_urls.add(webpage_url)
            try:
                return self.__process_playlist(ie_result, download)
            finally:
                self._playlist_level -= 1
                if not self._playlist_level:
                    self._playlist_urls.clear()
        elif result_type == 'compat_list':
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            def _fixup(r):
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)

    def _ensure_dir_exists(self, path):
        return make_dir(path, self.report_error)

    def __process_playlist(self, ie_result, download):
        # We process each entry in the playlist
        playlist = ie_result.get('title') or ie_result.get('id')
        self.to_screen('[download] Downloading playlist: %s' % playlist)

        if self.params.get('allow_playlist_files', True):
            ie_copy = {
                'playlist': playlist,
                'playlist_id': ie_result.get('id'),
                'playlist_title': ie_result.get('title'),
                'playlist_uploader': ie_result.get('uploader'),
                'playlist_uploader_id': ie_result.get('uploader_id'),
                'playlist_index': 0
            }
            ie_copy.update(dict(ie_result))

            if self.params.get('writeinfojson', False):
                infofn = self.prepare_filename(ie_copy, 'pl_infojson')
                if not self._ensure_dir_exists(encodeFilename(infofn)):
                    return
                if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(infofn)):
                    self.to_screen('[info] Playlist metadata is already present')
                else:
                    playlist_info = dict(ie_result)
                    # playlist_info['entries'] = list(playlist_info['entries'])  # Entries is a generator which should not be resolved here
                    self.to_screen('[info] Writing playlist metadata as JSON to: ' + infofn)
                    try:
                        write_json_file(self.filter_requested_info(playlist_info, self.params.get('clean_infojson', True)), infofn)
                    except (OSError, IOError):
                        self.report_error('Cannot write playlist metadata to JSON file ' + infofn)

            if self.params.get('writedescription', False):
                descfn = self.prepare_filename(ie_copy, 'pl_description')
                if not self._ensure_dir_exists(encodeFilename(descfn)):
                    return
                if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(descfn)):
                    self.to_screen('[info] Playlist description is already present')
                elif ie_result.get('description') is None:
                    self.report_warning('There\'s no playlist description to write.')
                else:
                    try:
                        self.to_screen('[info] Writing playlist description to: ' + descfn)
                        with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
                            descfile.write(ie_result['description'])
                    except (OSError, IOError):
                        self.report_error('Cannot write playlist description file ' + descfn)
                        return

        playlist_results = []

        playliststart = self.params.get('playliststart', 1) - 1
        playlistend = self.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlistend == -1:
            playlistend = None

        playlistitems_str = self.params.get('playlist_items')
        playlistitems = None
        if playlistitems_str is not None:
            def iter_playlistitems(format):
                for string_segment in format.split(','):
                    if '-' in string_segment:
                        start, end = string_segment.split('-')
                        for item in range(int(start), int(end) + 1):
                            yield int(item)
                    else:
                        yield int(string_segment)
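            # e.g. iter_playlistitems('1-3,7') yields 1, 2, 3, 7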
            playlistitems = orderedSet(iter_playlistitems(playlistitems_str))

        ie_entries = ie_result['entries']

        def make_playlistitems_entries(list_ie_entries):
            num_entries = len(list_ie_entries)
            return [
                list_ie_entries[i - 1] for i in playlistitems
                if -num_entries <= i - 1 < num_entries]

        def report_download(num_entries):
            self.to_screen(
                '[%s] playlist %s: Downloading %d videos' %
                (ie_result['extractor'], playlist, num_entries))

        if isinstance(ie_entries, list):
            n_all_entries = len(ie_entries)
            if playlistitems:
                entries = make_playlistitems_entries(ie_entries)
            else:
                entries = ie_entries[playliststart:playlistend]
            n_entries = len(entries)
            self.to_screen(
                '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
                (ie_result['extractor'], playlist, n_all_entries, n_entries))
        elif isinstance(ie_entries, PagedList):
            if playlistitems:
                entries = []
                for item in playlistitems:
                    entries.extend(ie_entries.getslice(
                        item - 1, item
                    ))
            else:
                entries = ie_entries.getslice(
                    playliststart, playlistend)
            n_entries = len(entries)
            report_download(n_entries)
        else:  # iterable
            if playlistitems:
                entries = make_playlistitems_entries(list(itertools.islice(
                    ie_entries, 0, max(playlistitems))))
            else:
                entries = list(itertools.islice(
                    ie_entries, playliststart, playlistend))
            n_entries = len(entries)
            report_download(n_entries)

        if self.params.get('playlistreverse', False):
            entries = entries[::-1]

        if self.params.get('playlistrandom', False):
            random.shuffle(entries)

        x_forwarded_for = ie_result.get('__x_forwarded_for_ip')

        for i, entry in enumerate(entries, 1):
            self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
            # This __x_forwarded_for_ip thing is a bit ugly but requires
            # minimal changes
            if x_forwarded_for:
                entry['__x_forwarded_for_ip'] = x_forwarded_for
            extra = {
                'n_entries': n_entries,
                'playlist': playlist,
                'playlist_id': ie_result.get('id'),
                'playlist_title': ie_result.get('title'),
                'playlist_uploader': ie_result.get('uploader'),
                'playlist_uploader_id': ie_result.get('uploader_id'),
                'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart,
                'extractor': ie_result['extractor'],
                'webpage_url': ie_result['webpage_url'],
                'webpage_url_basename': url_basename(ie_result['webpage_url']),
                'extractor_key': ie_result['extractor_key'],
            }

            if self._match_entry(entry, incomplete=True) is not None:
                continue

            entry_result = self.__process_iterable_entry(entry, download, extra)
            # TODO: skip failed (empty) entries?
            playlist_results.append(entry_result)
        ie_result['entries'] = playlist_results
        self.to_screen('[download] Finished downloading playlist: %s' % playlist)
        return ie_result

    @__handle_extraction_exceptions
    def __process_iterable_entry(self, entry, download, extra_info):
        return self.process_ie_result(
            entry, download=download, extra_info=extra_info)

    def _build_format_filter(self, filter_spec):
        " Returns a function to filter the formats according to the filter_spec "

        OPERATORS = {
            '<': operator.lt,
            '<=': operator.le,
            '>': operator.gt,
            '>=': operator.ge,
            '=': operator.eq,
            '!=': operator.ne,
        }
        operator_rex = re.compile(r'''(?x)\s*
            (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)
            \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
            (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
            $
            ''' % '|'.join(map(re.escape, OPERATORS.keys())))
        m = operator_rex.search(filter_spec)
        if m:
            try:
                comparison_value = int(m.group('value'))
            except ValueError:
                comparison_value = parse_filesize(m.group('value'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('value') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid value %r in format specification %r' % (
                            m.group('value'), filter_spec))
            op = OPERATORS[m.group('op')]

        if not m:
            STR_OPERATORS = {
                '=': operator.eq,
                '^=': lambda attr, value: attr.startswith(value),
                '$=': lambda attr, value: attr.endswith(value),
                '*=': lambda attr, value: value in attr,
            }
            str_operator_rex = re.compile(r'''(?x)
                \s*(?P<key>[a-zA-Z0-9._-]+)
                \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?
                \s*(?P<value>[a-zA-Z0-9._-]+)
                \s*$
                ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
            m = str_operator_rex.search(filter_spec)
            if m:
                comparison_value = m.group('value')
                str_op = STR_OPERATORS[m.group('op')]
                if m.group('negation'):
                    op = lambda attr, value: not str_op(attr, value)
                else:
                    op = str_op

        if not m:
            raise ValueError('Invalid filter specification %r' % filter_spec)

        def _filter(f):
            actual_value = f.get(m.group('key'))
            if actual_value is None:
                return m.group('none_inclusive')
            return op(actual_value, comparison_value)
        return _filter

    def _default_format_spec(self, info_dict, download=True):

        def can_merge():
            merger = FFmpegMergerPP(self)
            return merger.available and merger.can_merge()

        prefer_best = (
            not self.params.get('simulate', False)
            and download
            and (
                not can_merge()
                or info_dict.get('is_live', False)
                or self.outtmpl_dict['default'] == '-'))

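        # i.e. fall back to a pre-merged 'best' when merging is impossible
        # (no working ffmpeg, a live stream, or streaming to stdout)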
        return (
            'best/bestvideo+bestaudio'
            if prefer_best
            else 'bestvideo*+bestaudio/best'
            if not self.params.get('allow_multiple_audio_streams', False)
            else 'bestvideo+bestaudio/best')

    def build_format_selector(self, format_spec):
        def syntax_error(note, start):
            message = (
                'Invalid format specification: '
                '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
            return SyntaxError(message)

        PICKFIRST = 'PICKFIRST'
        MERGE = 'MERGE'
        SINGLE = 'SINGLE'
        GROUP = 'GROUP'
        FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])

        allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False),
                                  'video': self.params.get('allow_multiple_video_streams', False)}

        def _parse_filter(tokens):
            filter_parts = []
            for type, string, start, _, _ in tokens:
                if type == tokenize.OP and string == ']':
                    return ''.join(filter_parts)
                else:
                    filter_parts.append(string)

        def _remove_unused_ops(tokens):
            # Remove operators that we don't use and join them with the surrounding strings
            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
            ALLOWED_OPS = ('/', '+', ',', '(', ')')
            last_string, last_start, last_end, last_line = None, None, None, None
            for type, string, start, end, line in tokens:
                if type == tokenize.OP and string == '[':
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                    # everything inside brackets will be handled by _parse_filter
                    for type, string, start, end, line in tokens:
                        yield type, string, start, end, line
                        if type == tokenize.OP and string == ']':
                            break
                elif type == tokenize.OP and string in ALLOWED_OPS:
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
                    if not last_string:
                        last_string = string
                        last_start = start
                        last_end = end
                    else:
                        last_string += string
            if last_string:
                yield tokenize.NAME, last_string, last_start, last_end, last_line

1474 def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1475 selectors = []
1476 current_selector = None
1477 for type, string, start, _, _ in tokens:
1478 # ENCODING is only defined in python 3.x
1479 if type == getattr(tokenize, 'ENCODING', None):
1480 continue
1481 elif type in [tokenize.NAME, tokenize.NUMBER]:
1482 current_selector = FormatSelector(SINGLE, string, [])
1483 elif type == tokenize.OP:
1484 if string == ')':
1485 if not inside_group:
1486 # ')' will be handled by the parentheses group
1487 tokens.restore_last_token()
1488 break
1489 elif inside_merge and string in ['/', ',']:
1490 tokens.restore_last_token()
1491 break
1492 elif inside_choice and string == ',':
1493 tokens.restore_last_token()
1494 break
1495 elif string == ',':
1496 if not current_selector:
1497 raise syntax_error('"," must follow a format selector', start)
1498 selectors.append(current_selector)
1499 current_selector = None
1500 elif string == '/':
1501 if not current_selector:
1502 raise syntax_error('"/" must follow a format selector', start)
1503 first_choice = current_selector
1504 second_choice = _parse_format_selection(tokens, inside_choice=True)
1505 current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1506 elif string == '[':
1507 if not current_selector:
1508 current_selector = FormatSelector(SINGLE, 'best', [])
1509 format_filter = _parse_filter(tokens)
1510 current_selector.filters.append(format_filter)
1511 elif string == '(':
1512 if current_selector:
1513 raise syntax_error('Unexpected "("', start)
1514 group = _parse_format_selection(tokens, inside_group=True)
1515 current_selector = FormatSelector(GROUP, group, [])
1516 elif string == '+':
1517 if not current_selector:
1518 raise syntax_error('Unexpected "+"', start)
1519 selector_1 = current_selector
1520 selector_2 = _parse_format_selection(tokens, inside_merge=True)
1521 if not selector_2:
1522 raise syntax_error('Expected a selector', start)
1523 current_selector = FormatSelector(MERGE, (selector_1, selector_2), [])
1524 else:
1525 raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
1526 elif type == tokenize.ENDMARKER:
1527 break
1528 if current_selector:
1529 selectors.append(current_selector)
1530 return selectors
1531
1532 def _build_selector_function(selector):
1533 if isinstance(selector, list): # ,
1534 fs = [_build_selector_function(s) for s in selector]
1535
1536 def selector_function(ctx):
1537 for f in fs:
1538 for format in f(ctx):
1539 yield format
1540 return selector_function
1541
1542 elif selector.type == GROUP: # ()
1543 selector_function = _build_selector_function(selector.selector)
1544
1545 elif selector.type == PICKFIRST: # /
1546 fs = [_build_selector_function(s) for s in selector.selector]
1547
1548 def selector_function(ctx):
1549 for f in fs:
1550 picked_formats = list(f(ctx))
1551 if picked_formats:
1552 return picked_formats
1553 return []
1554
1555 elif selector.type == SINGLE: # atom
1556 format_spec = selector.selector if selector.selector is not None else 'best'
1557
1558 if format_spec == 'all':
1559 def selector_function(ctx):
1560 formats = list(ctx['formats'])
1561 if formats:
1562 for f in formats:
1563 yield f
1564
1565 else:
1566 format_fallback = False
1567 format_spec_obj = re.match(r'(best|worst|b|w)(video|audio|v|a)?(\*)?$', format_spec)
1568 if format_spec_obj is not None:
1569 format_idx = 0 if format_spec_obj.group(1)[0] == 'w' else -1
1570 format_type = format_spec_obj.group(2)[0] if format_spec_obj.group(2) else False
1571 not_format_type = 'v' if format_type == 'a' else 'a'
1572 format_modified = format_spec_obj.group(3) is not None
1573
1574 format_fallback = not format_type and not format_modified # for b, w
1575 filter_f = ((lambda f: f.get(format_type + 'codec') != 'none')
1576 if format_type and format_modified # bv*, ba*, wv*, wa*
1577 else (lambda f: f.get(not_format_type + 'codec') == 'none')
1578 if format_type # bv, ba, wv, wa
1579 else (lambda f: f.get('vcodec') != 'none' and f.get('acodec') != 'none')
1580 if not format_modified # b, w
1581 else None) # b*, w*
1582 else:
1583 format_idx = -1
1584 filter_f = ((lambda f: f.get('ext') == format_spec)
1585 if format_spec in ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] # extension
1586 else (lambda f: f.get('format_id') == format_spec)) # id
1587
1588 def selector_function(ctx):
1589 formats = list(ctx['formats'])
1590 if not formats:
1591 return
1592 matches = list(filter(filter_f, formats)) if filter_f is not None else formats
1593 if matches:
1594 yield matches[format_idx]
1595 elif format_fallback == 'force' or (format_fallback and ctx['incomplete_formats']):
                            # For extractors with incomplete formats (audio-only extractors
                            # like soundcloud, or video-only ones like imgur), best/worst
                            # falls back to the best/worst {video,audio}-only format
1599 yield formats[format_idx]
1600
1601 elif selector.type == MERGE: # +
1602 def _merge(formats_pair):
1603 format_1, format_2 = formats_pair
1604
1605 formats_info = []
1606 formats_info.extend(format_1.get('requested_formats', (format_1,)))
1607 formats_info.extend(format_2.get('requested_formats', (format_2,)))
1608
                    if not allow_multiple_streams['video'] or not allow_multiple_streams['audio']:
                        get_no_more = {'video': False, 'audio': False}
                        remove_idx = set()
                        for (i, fmt_info) in enumerate(formats_info):
                            for aud_vid in ['audio', 'video']:
                                if not allow_multiple_streams[aud_vid] and fmt_info.get(aud_vid[0] + 'codec') != 'none':
                                    if get_no_more[aud_vid]:
                                        remove_idx.add(i)
                                    get_no_more[aud_vid] = True
                        # Collect indices first and pop in reverse: popping while
                        # enumerating would skip elements and shift later indices
                        for i in sorted(remove_idx, reverse=True):
                            formats_info.pop(i)
1617
1618 if len(formats_info) == 1:
1619 return formats_info[0]
1620
1621 video_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('vcodec') != 'none']
1622 audio_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('acodec') != 'none']
1623
1624 the_only_video = video_fmts[0] if len(video_fmts) == 1 else None
1625 the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None
1626
1627 output_ext = self.params.get('merge_output_format')
1628 if not output_ext:
1629 if the_only_video:
1630 output_ext = the_only_video['ext']
1631 elif the_only_audio and not video_fmts:
1632 output_ext = the_only_audio['ext']
1633 else:
1634 output_ext = 'mkv'
1635
1636 new_dict = {
1637 'requested_formats': formats_info,
1638 'format': '+'.join(fmt_info.get('format') for fmt_info in formats_info),
1639 'format_id': '+'.join(fmt_info.get('format_id') for fmt_info in formats_info),
1640 'ext': output_ext,
1641 }
1642
1643 if the_only_video:
1644 new_dict.update({
1645 'width': the_only_video.get('width'),
1646 'height': the_only_video.get('height'),
1647 'resolution': the_only_video.get('resolution'),
1648 'fps': the_only_video.get('fps'),
1649 'vcodec': the_only_video.get('vcodec'),
1650 'vbr': the_only_video.get('vbr'),
1651 'stretched_ratio': the_only_video.get('stretched_ratio'),
1652 })
1653
1654 if the_only_audio:
1655 new_dict.update({
1656 'acodec': the_only_audio.get('acodec'),
1657 'abr': the_only_audio.get('abr'),
1658 })
1659
1660 return new_dict
1661
1662 selector_1, selector_2 = map(_build_selector_function, selector.selector)
1663
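                # Yield one merged format for every pair produced by the two
                # sub-selectors; ctx is deep-copied so each side can filter
                # its format list independently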
1664 def selector_function(ctx):
1665 for pair in itertools.product(
1666 selector_1(copy.deepcopy(ctx)), selector_2(copy.deepcopy(ctx))):
1667 yield _merge(pair)
1668
1669 filters = [self._build_format_filter(f) for f in selector.filters]
1670
1671 def final_selector(ctx):
1672 ctx_copy = copy.deepcopy(ctx)
1673 for _filter in filters:
1674 ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
1675 return selector_function(ctx_copy)
1676 return final_selector
1677
1678 stream = io.BytesIO(format_spec.encode('utf-8'))
1679 try:
1680 tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
1681 except tokenize.TokenError:
1682 raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
1683
1684 class TokenIterator(object):
1685 def __init__(self, tokens):
1686 self.tokens = tokens
1687 self.counter = 0
1688
1689 def __iter__(self):
1690 return self
1691
1692 def __next__(self):
1693 if self.counter >= len(self.tokens):
1694 raise StopIteration()
1695 value = self.tokens[self.counter]
1696 self.counter += 1
1697 return value
1698
1699 next = __next__
1700
1701 def restore_last_token(self):
1702 self.counter -= 1
1703
1704 parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
1705 return _build_selector_function(parsed_selector)
1706
1707 def _calc_headers(self, info_dict):
1708 res = std_headers.copy()
1709
1710 add_headers = info_dict.get('http_headers')
1711 if add_headers:
1712 res.update(add_headers)
1713
1714 cookies = self._calc_cookies(info_dict)
1715 if cookies:
1716 res['Cookie'] = cookies
1717
1718 if 'X-Forwarded-For' not in res:
1719 x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
1720 if x_forwarded_for_ip:
1721 res['X-Forwarded-For'] = x_forwarded_for_ip
1722
1723 return res
1724
1725 def _calc_cookies(self, info_dict):
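        # Build a throwaway request for the format URL and let the cookiejar
        # attach its Cookie header, then read the computed header back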
1726 pr = sanitized_Request(info_dict['url'])
1727 self.cookiejar.add_cookie_header(pr)
1728 return pr.get_header('Cookie')
1729
1730 def process_video_result(self, info_dict, download=True):
1731 assert info_dict.get('_type', 'video') == 'video'
1732
1733 if 'id' not in info_dict:
1734 raise ExtractorError('Missing "id" field in extractor result')
1735 if 'title' not in info_dict:
1736 raise ExtractorError('Missing "title" field in extractor result')
1737
1738 def report_force_conversion(field, field_not, conversion):
1739 self.report_warning(
1740 '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
1741 % (field, field_not, conversion))
1742
1743 def sanitize_string_field(info, string_field):
1744 field = info.get(string_field)
1745 if field is None or isinstance(field, compat_str):
1746 return
1747 report_force_conversion(string_field, 'a string', 'string')
1748 info[string_field] = compat_str(field)
1749
1750 def sanitize_numeric_fields(info):
1751 for numeric_field in self._NUMERIC_FIELDS:
1752 field = info.get(numeric_field)
1753 if field is None or isinstance(field, compat_numeric_types):
1754 continue
1755 report_force_conversion(numeric_field, 'numeric', 'int')
1756 info[numeric_field] = int_or_none(field)
1757
1758 sanitize_string_field(info_dict, 'id')
1759 sanitize_numeric_fields(info_dict)
1760
1761 if 'playlist' not in info_dict:
1762 # It isn't part of a playlist
1763 info_dict['playlist'] = None
1764 info_dict['playlist_index'] = None
1765
1766 thumbnails = info_dict.get('thumbnails')
1767 if thumbnails is None:
1768 thumbnail = info_dict.get('thumbnail')
1769 if thumbnail:
1770 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
1771 if thumbnails:
1772 thumbnails.sort(key=lambda t: (
1773 t.get('preference') if t.get('preference') is not None else -1,
1774 t.get('width') if t.get('width') is not None else -1,
1775 t.get('height') if t.get('height') is not None else -1,
1776 t.get('id') if t.get('id') is not None else '', t.get('url')))
1777 for i, t in enumerate(thumbnails):
1778 t['url'] = sanitize_url(t['url'])
1779 if t.get('width') and t.get('height'):
1780 t['resolution'] = '%dx%d' % (t['width'], t['height'])
1781 if t.get('id') is None:
1782 t['id'] = '%d' % i
1783
1784 if self.params.get('list_thumbnails'):
1785 self.list_thumbnails(info_dict)
1786 return
1787
1788 thumbnail = info_dict.get('thumbnail')
1789 if thumbnail:
1790 info_dict['thumbnail'] = sanitize_url(thumbnail)
1791 elif thumbnails:
1792 info_dict['thumbnail'] = thumbnails[-1]['url']
1793
1794 if 'display_id' not in info_dict and 'id' in info_dict:
1795 info_dict['display_id'] = info_dict['id']
1796
1797 for ts_key, date_key in (
1798 ('timestamp', 'upload_date'),
1799 ('release_timestamp', 'release_date'),
1800 ):
1801 if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
1802 # Working around out-of-range timestamp values (e.g. negative ones on Windows,
1803 # see http://bugs.python.org/issue1646728)
1804 try:
1805 upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
1806 info_dict[date_key] = upload_date.strftime('%Y%m%d')
1807 except (ValueError, OverflowError, OSError):
1808 pass
1809
1810 # Auto generate title fields corresponding to the *_number fields when missing
1811 # in order to always have clean titles. This is very common for TV series.
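        # (e.g. episode_number=3 with no episode set yields episode='Episode 3')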
1812 for field in ('chapter', 'season', 'episode'):
1813 if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
1814 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
1815
1816 for cc_kind in ('subtitles', 'automatic_captions'):
1817 cc = info_dict.get(cc_kind)
1818 if cc:
1819 for _, subtitle in cc.items():
1820 for subtitle_format in subtitle:
1821 if subtitle_format.get('url'):
1822 subtitle_format['url'] = sanitize_url(subtitle_format['url'])
1823 if subtitle_format.get('ext') is None:
1824 subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
1825
1826 automatic_captions = info_dict.get('automatic_captions')
1827 subtitles = info_dict.get('subtitles')
1828
1829 if self.params.get('listsubtitles', False):
1830 if 'automatic_captions' in info_dict:
1831 self.list_subtitles(
1832 info_dict['id'], automatic_captions, 'automatic captions')
1833 self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
1834 return
1835
1836 info_dict['requested_subtitles'] = self.process_subtitles(
1837 info_dict['id'], subtitles, automatic_captions)
1838
1839 # We now pick which formats have to be downloaded
1840 if info_dict.get('formats') is None:
1841 # There's only one format available
1842 formats = [info_dict]
1843 else:
1844 formats = info_dict['formats']
1845
1846 if not formats:
1847 raise ExtractorError('No video formats found!')
1848
1849 def is_wellformed(f):
1850 url = f.get('url')
1851 if not url:
1852 self.report_warning(
1853 '"url" field is missing or empty - skipping format, '
1854 'there is an error in extractor')
1855 return False
1856 if isinstance(url, bytes):
1857 sanitize_string_field(f, 'url')
1858 return True
1859
1860 # Filter out malformed formats for better extraction robustness
1861 formats = list(filter(is_wellformed, formats))
1862
1863 formats_dict = {}
1864
1865 # We check that all the formats have the format and format_id fields
1866 for i, format in enumerate(formats):
1867 sanitize_string_field(format, 'format_id')
1868 sanitize_numeric_fields(format)
1869 format['url'] = sanitize_url(format['url'])
1870 if not format.get('format_id'):
1871 format['format_id'] = compat_str(i)
1872 else:
1873 # Sanitize format_id from characters used in format selector expression
1874 format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
1875 format_id = format['format_id']
1876 if format_id not in formats_dict:
1877 formats_dict[format_id] = []
1878 formats_dict[format_id].append(format)
1879
1880 # Make sure all formats have unique format_id
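        # (e.g. two formats both reporting id 'hls' become 'hls-0' and 'hls-1')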
1881 for format_id, ambiguous_formats in formats_dict.items():
1882 if len(ambiguous_formats) > 1:
1883 for i, format in enumerate(ambiguous_formats):
1884 format['format_id'] = '%s-%d' % (format_id, i)
1885
1886 for i, format in enumerate(formats):
1887 if format.get('format') is None:
1888 format['format'] = '{id} - {res}{note}'.format(
1889 id=format['format_id'],
1890 res=self.format_resolution(format),
1891 note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
1892 )
1893 # Automatically determine file extension if missing
1894 if format.get('ext') is None:
1895 format['ext'] = determine_ext(format['url']).lower()
1896 # Automatically determine protocol if missing (useful for format
1897 # selection purposes)
1898 if format.get('protocol') is None:
1899 format['protocol'] = determine_protocol(format)
1900 # Add HTTP headers, so that external programs can use them from the
1901 # json output
1902 full_format_info = info_dict.copy()
1903 full_format_info.update(format)
1904 format['http_headers'] = self._calc_headers(full_format_info)
1905 # Remove private housekeeping stuff
1906 if '__x_forwarded_for_ip' in info_dict:
1907 del info_dict['__x_forwarded_for_ip']
1908
1909 # TODO Central sorting goes here
1910
1911 if formats[0] is not info_dict:
            # Only set the 'formats' field if the original info_dict lists them;
            # otherwise we would end up with a circular reference: the first (and
            # only) element of info_dict['formats'] would be info_dict itself,
            # which can't be exported to json
1916 info_dict['formats'] = formats
1917 if self.params.get('listformats'):
1918 self.list_formats(info_dict)
1919 return
1920
1921 req_format = self.params.get('format')
1922 if req_format is None:
1923 req_format = self._default_format_spec(info_dict, download=download)
1924 if self.params.get('verbose'):
1925 self.to_screen('[debug] Default format spec: %s' % req_format)
1926
1927 format_selector = self.build_format_selector(req_format)
1928
        # During format selection we may need access to the original format set
        # in order to calculate some metrics or do some processing.
        # For now we need to be able to guess whether the formats provided by the
        # extractor are incomplete (i.e. whether the extractor provides only
        # video-only or audio-only formats) so that selection works properly for
        # extractors with such incomplete formats (see
        # https://github.com/ytdl-org/youtube-dl/pull/5556).
        # Since formats may be filtered during selection and may no longer match
        # the original set, the results may be incorrect. Thus the original formats
        # (or pre-calculated metrics) should be passed to the selection routines as
        # well. We therefore pass a context object containing all the necessary
        # additional data instead of just the formats.
        # This fixes the incorrect format selection issue (see
        # https://github.com/ytdl-org/youtube-dl/issues/10083).
1944 incomplete_formats = (
1945 # All formats are video-only or
1946 all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
1947 # all formats are audio-only
1948 or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))
1949
1950 ctx = {
1951 'formats': formats,
1952 'incomplete_formats': incomplete_formats,
1953 }
1954
1955 formats_to_download = list(format_selector(ctx))
1956 if not formats_to_download:
1957 raise ExtractorError('requested format not available',
1958 expected=True)
1959
1960 if download:
1961 self.to_screen('[info] Downloading format(s) %s' % ", ".join([f['format_id'] for f in formats_to_download]))
1962 if len(formats_to_download) > 1:
1963 self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
1964 for format in formats_to_download:
1965 new_info = dict(info_dict)
1966 new_info.update(format)
1967 self.process_info(new_info)
1968 # We update the info dict with the best quality format (backwards compatibility)
1969 info_dict.update(formats_to_download[-1])
1970 return info_dict
1971
1972 def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1973 """Select the requested subtitles and their format"""
1974 available_subs = {}
1975 if normal_subtitles and self.params.get('writesubtitles'):
1976 available_subs.update(normal_subtitles)
1977 if automatic_captions and self.params.get('writeautomaticsub'):
1978 for lang, cap_info in automatic_captions.items():
1979 if lang not in available_subs:
1980 available_subs[lang] = cap_info
1981
        if (not self.params.get('writesubtitles')
                and not self.params.get('writeautomaticsub')
                or not available_subs):
1985 return None
1986
1987 if self.params.get('allsubtitles', False):
1988 requested_langs = available_subs.keys()
1989 else:
1990 if self.params.get('subtitleslangs', False):
1991 requested_langs = self.params.get('subtitleslangs')
1992 elif 'en' in available_subs:
1993 requested_langs = ['en']
1994 else:
1995 requested_langs = [list(available_subs.keys())[0]]
1996
1997 formats_query = self.params.get('subtitlesformat', 'best')
1998 formats_preference = formats_query.split('/') if formats_query else []
1999 subs = {}
2000 for lang in requested_langs:
2001 formats = available_subs.get(lang)
2002 if formats is None:
2003 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
2004 continue
2005 for ext in formats_preference:
2006 if ext == 'best':
2007 f = formats[-1]
2008 break
2009 matches = list(filter(lambda f: f['ext'] == ext, formats))
2010 if matches:
2011 f = matches[-1]
2012 break
2013 else:
2014 f = formats[-1]
2015 self.report_warning(
2016 'No subtitle format found matching "%s" for language %s, '
2017 'using %s' % (formats_query, lang, f['ext']))
2018 subs[lang] = f
2019 return subs
2020
2021 def __forced_printings(self, info_dict, filename, incomplete):
2022 def print_mandatory(field):
2023 if (self.params.get('force%s' % field, False)
2024 and (not incomplete or info_dict.get(field) is not None)):
2025 self.to_stdout(info_dict[field])
2026
2027 def print_optional(field):
2028 if (self.params.get('force%s' % field, False)
2029 and info_dict.get(field) is not None):
2030 self.to_stdout(info_dict[field])
2031
2032 print_mandatory('title')
2033 print_mandatory('id')
2034 if self.params.get('forceurl', False) and not incomplete:
2035 if info_dict.get('requested_formats') is not None:
2036 for f in info_dict['requested_formats']:
2037 self.to_stdout(f['url'] + f.get('play_path', ''))
2038 else:
2039 # For RTMP URLs, also include the playpath
2040 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
2041 print_optional('thumbnail')
2042 print_optional('description')
2043 if self.params.get('forcefilename', False) and filename is not None:
2044 self.to_stdout(filename)
2045 if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
2046 self.to_stdout(formatSeconds(info_dict['duration']))
2047 print_mandatory('format')
2048 if self.params.get('forcejson', False):
2049 self.post_extract(info_dict)
2050 self.to_stdout(json.dumps(info_dict, default=repr))
2051
2052 def process_info(self, info_dict):
2053 """Process a single resolved IE result."""
2054
2055 assert info_dict.get('_type', 'video') == 'video'
2056
2057 info_dict.setdefault('__postprocessors', [])
2058
2059 max_downloads = self.params.get('max_downloads')
2060 if max_downloads is not None:
2061 if self._num_downloads >= int(max_downloads):
2062 raise MaxDownloadsReached()
2063
2064 # TODO: backward compatibility, to be removed
2065 info_dict['fulltitle'] = info_dict['title']
2066
2067 if 'format' not in info_dict:
2068 info_dict['format'] = info_dict['ext']
2069
2070 if self._match_entry(info_dict, incomplete=False) is not None:
2071 return
2072
2073 self.post_extract(info_dict)
2074 self._num_downloads += 1
2075
2076 info_dict = self.pre_process(info_dict)
2077
2078 # info_dict['_filename'] needs to be set for backward compatibility
2079 info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
2080 temp_filename = self.prepare_filename(info_dict, 'temp')
2081 files_to_move = {}
2082 skip_dl = self.params.get('skip_download', False)
2083
2084 # Forced printings
2085 self.__forced_printings(info_dict, full_filename, incomplete=False)
2086
2087 if self.params.get('simulate', False):
2088 if self.params.get('force_write_download_archive', False):
2089 self.record_download_archive(info_dict)
2090
2091 # Do nothing else if in simulate mode
2092 return
2093
2094 if full_filename is None:
2095 return
2096
2097 if not self._ensure_dir_exists(encodeFilename(full_filename)):
2098 return
2099 if not self._ensure_dir_exists(encodeFilename(temp_filename)):
2100 return
2101
2102 if self.params.get('writedescription', False):
2103 descfn = self.prepare_filename(info_dict, 'description')
2104 if not self._ensure_dir_exists(encodeFilename(descfn)):
2105 return
2106 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(descfn)):
2107 self.to_screen('[info] Video description is already present')
2108 elif info_dict.get('description') is None:
2109 self.report_warning('There\'s no description to write.')
2110 else:
2111 try:
2112 self.to_screen('[info] Writing video description to: ' + descfn)
2113 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
2114 descfile.write(info_dict['description'])
2115 except (OSError, IOError):
2116 self.report_error('Cannot write description file ' + descfn)
2117 return
2118
2119 if self.params.get('writeannotations', False):
2120 annofn = self.prepare_filename(info_dict, 'annotation')
2121 if not self._ensure_dir_exists(encodeFilename(annofn)):
2122 return
2123 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(annofn)):
2124 self.to_screen('[info] Video annotations are already present')
2125 elif not info_dict.get('annotations'):
2126 self.report_warning('There are no annotations to write.')
2127 else:
2128 try:
2129 self.to_screen('[info] Writing video annotations to: ' + annofn)
2130 with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
2131 annofile.write(info_dict['annotations'])
2132 except (KeyError, TypeError):
2133 self.report_warning('There are no annotations to write.')
2134 except (OSError, IOError):
2135 self.report_error('Cannot write annotations file: ' + annofn)
2136 return
2137
2138 def dl(name, info, subtitle=False):
2139 fd = get_suitable_downloader(info, self.params)(self, self.params)
2140 for ph in self._progress_hooks:
2141 fd.add_progress_hook(ph)
2142 if self.params.get('verbose'):
2143 self.to_screen('[debug] Invoking downloader on %r' % info.get('url'))
2144 return fd.download(name, info, subtitle)
2145
2146 subtitles_are_requested = any([self.params.get('writesubtitles', False),
2147 self.params.get('writeautomaticsub')])
2148
2149 if subtitles_are_requested and info_dict.get('requested_subtitles'):
            # Subtitle download errors are already handled by the relevant IE,
            # so downloading silently carries on when an IE lacks subtitle support
2152 subtitles = info_dict['requested_subtitles']
2153 # ie = self.get_info_extractor(info_dict['extractor_key'])
2154 for sub_lang, sub_info in subtitles.items():
2155 sub_format = sub_info['ext']
2156 sub_fn = self.prepare_filename(info_dict, 'subtitle')
2157 sub_filename = subtitles_filename(
2158 temp_filename if not skip_dl else sub_fn,
2159 sub_lang, sub_format, info_dict.get('ext'))
2160 sub_filename_final = subtitles_filename(sub_fn, sub_lang, sub_format, info_dict.get('ext'))
2161 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(sub_filename)):
2162 self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
2163 sub_info['filepath'] = sub_filename
2164 files_to_move[sub_filename] = sub_filename_final
2165 else:
2166 self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
2167 if sub_info.get('data') is not None:
2168 try:
2169 # Use newline='' to prevent conversion of newline characters
2170 # See https://github.com/ytdl-org/youtube-dl/issues/10268
2171 with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
2172 subfile.write(sub_info['data'])
2173 sub_info['filepath'] = sub_filename
2174 files_to_move[sub_filename] = sub_filename_final
2175 except (OSError, IOError):
2176 self.report_error('Cannot write subtitles file ' + sub_filename)
2177 return
2178 else:
2179 try:
2180 dl(sub_filename, sub_info.copy(), subtitle=True)
2181 sub_info['filepath'] = sub_filename
2182 files_to_move[sub_filename] = sub_filename_final
2183 except (ExtractorError, IOError, OSError, ValueError, compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2184 self.report_warning('Unable to download subtitle for "%s": %s' %
2185 (sub_lang, error_to_compat_str(err)))
2186 continue
2187
2188 if skip_dl:
2189 if self.params.get('convertsubtitles', False):
2190 # subconv = FFmpegSubtitlesConvertorPP(self, format=self.params.get('convertsubtitles'))
2191 filename_real_ext = os.path.splitext(full_filename)[1][1:]
2192 filename_wo_ext = (
2193 os.path.splitext(full_filename)[0]
2194 if filename_real_ext == info_dict['ext']
2195 else full_filename)
2196 afilename = '%s.%s' % (filename_wo_ext, self.params.get('convertsubtitles'))
2197 # if subconv.available:
2198 # info_dict['__postprocessors'].append(subconv)
2199 if os.path.exists(encodeFilename(afilename)):
2200 self.to_screen(
2201 '[download] %s has already been downloaded and '
2202 'converted' % afilename)
2203 else:
2204 try:
2205 self.post_process(full_filename, info_dict, files_to_move)
2206 except PostProcessingError as err:
2207 self.report_error('Postprocessing: %s' % str(err))
2208 return
2209
2210 if self.params.get('writeinfojson', False):
2211 infofn = self.prepare_filename(info_dict, 'infojson')
2212 if not self._ensure_dir_exists(encodeFilename(infofn)):
2213 return
2214 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(infofn)):
2215 self.to_screen('[info] Video metadata is already present')
2216 else:
2217 self.to_screen('[info] Writing video metadata as JSON to: ' + infofn)
2218 try:
2219 write_json_file(self.filter_requested_info(info_dict, self.params.get('clean_infojson', True)), infofn)
2220 except (OSError, IOError):
2221 self.report_error('Cannot write video metadata to JSON file ' + infofn)
2222 return
2223 info_dict['__infojson_filename'] = infofn
2224
2225 thumbfn = self.prepare_filename(info_dict, 'thumbnail')
2226 thumb_fn_temp = temp_filename if not skip_dl else thumbfn
2227 for thumb_ext in self._write_thumbnails(info_dict, thumb_fn_temp):
2228 thumb_filename_temp = replace_extension(thumb_fn_temp, thumb_ext, info_dict.get('ext'))
2229 thumb_filename = replace_extension(thumbfn, thumb_ext, info_dict.get('ext'))
2230 files_to_move[thumb_filename_temp] = thumb_filename
2231
2232 # Write internet shortcut files
2233 url_link = webloc_link = desktop_link = False
2234 if self.params.get('writelink', False):
2235 if sys.platform == "darwin": # macOS.
2236 webloc_link = True
2237 elif sys.platform.startswith("linux"):
2238 desktop_link = True
2239 else: # if sys.platform in ['win32', 'cygwin']:
2240 url_link = True
2241 if self.params.get('writeurllink', False):
2242 url_link = True
2243 if self.params.get('writewebloclink', False):
2244 webloc_link = True
2245 if self.params.get('writedesktoplink', False):
2246 desktop_link = True
2247
2248 if url_link or webloc_link or desktop_link:
2249 if 'webpage_url' not in info_dict:
2250 self.report_error('Cannot write internet shortcut file because the "webpage_url" field is missing in the media information')
2251 return
2252 ascii_url = iri_to_uri(info_dict['webpage_url'])
2253
2254 def _write_link_file(extension, template, newline, embed_filename):
2255 linkfn = replace_extension(full_filename, extension, info_dict.get('ext'))
            if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)):
2257 self.to_screen('[info] Internet shortcut is already present')
2258 else:
2259 try:
2260 self.to_screen('[info] Writing internet shortcut to: ' + linkfn)
2261 with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', newline=newline) as linkfile:
2262 template_vars = {'url': ascii_url}
2263 if embed_filename:
2264 template_vars['filename'] = linkfn[:-(len(extension) + 1)]
2265 linkfile.write(template % template_vars)
2266 except (OSError, IOError):
2267 self.report_error('Cannot write internet shortcut ' + linkfn)
2268 return False
2269 return True
2270
2271 if url_link:
2272 if not _write_link_file('url', DOT_URL_LINK_TEMPLATE, '\r\n', embed_filename=False):
2273 return
2274 if webloc_link:
2275 if not _write_link_file('webloc', DOT_WEBLOC_LINK_TEMPLATE, '\n', embed_filename=False):
2276 return
2277 if desktop_link:
2278 if not _write_link_file('desktop', DOT_DESKTOP_LINK_TEMPLATE, '\n', embed_filename=True):
2279 return
2280
2281 # Download
2282 must_record_download_archive = False
2283 if not skip_dl:
2284 try:
2285
2286 def existing_file(*filepaths):
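                    # Reuse an already-downloaded (possibly already converted)
                    # file if one exists; otherwise, or when overwrites are
                    # enabled, delete leftovers and return None to force a
                    # fresh download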
2287 ext = info_dict.get('ext')
2288 final_ext = self.params.get('final_ext', ext)
2289 existing_files = []
2290 for file in orderedSet(filepaths):
2291 if final_ext != ext:
2292 converted = replace_extension(file, final_ext, ext)
2293 if os.path.exists(encodeFilename(converted)):
2294 existing_files.append(converted)
2295 if os.path.exists(encodeFilename(file)):
2296 existing_files.append(file)
2297
2298 if not existing_files or self.params.get('overwrites', False):
2299 for file in orderedSet(existing_files):
2300 self.report_file_delete(file)
2301 os.remove(encodeFilename(file))
2302 return None
2303
2304 self.report_file_already_downloaded(existing_files[0])
2305 info_dict['ext'] = os.path.splitext(existing_files[0])[1][1:]
2306 return existing_files[0]
2307
2308 success = True
2309 if info_dict.get('requested_formats') is not None:
2310 downloaded = []
2311 merger = FFmpegMergerPP(self)
2312 if self.params.get('allow_unplayable_formats'):
2313 self.report_warning(
2314 'You have requested merging of multiple formats '
2315 'while also allowing unplayable formats to be downloaded. '
2316 'The formats won\'t be merged to prevent data corruption.')
2317 elif not merger.available:
2318 self.report_warning(
2319 'You have requested merging of multiple formats but ffmpeg is not installed. '
2320 'The formats won\'t be merged.')
2321
2322 def compatible_formats(formats):
2323 # TODO: some formats actually allow this (mkv, webm, ogg, mp4), but not all of them.
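                        # (e.g. mp4 video + m4a audio merge as-is, while mp4
                        #  video + webm audio get remuxed into mkv by the caller)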
2324 video_formats = [format for format in formats if format.get('vcodec') != 'none']
2325 audio_formats = [format for format in formats if format.get('acodec') != 'none']
2326 if len(video_formats) > 2 or len(audio_formats) > 2:
2327 return False
2328
2329 # Check extension
2330 exts = set(format.get('ext') for format in formats)
2331 COMPATIBLE_EXTS = (
2332 set(('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma')),
2333 set(('webm',)),
2334 )
2335 for ext_sets in COMPATIBLE_EXTS:
2336 if ext_sets.issuperset(exts):
2337 return True
2338 # TODO: Check acodec/vcodec
2339 return False
2340
2341 requested_formats = info_dict['requested_formats']
2342 old_ext = info_dict['ext']
2343 if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
2344 info_dict['ext'] = 'mkv'
2345 self.report_warning(
2346 'Requested formats are incompatible for merge and will be merged into mkv.')
2347
2348 def correct_ext(filename):
2349 filename_real_ext = os.path.splitext(filename)[1][1:]
2350 filename_wo_ext = (
2351 os.path.splitext(filename)[0]
2352 if filename_real_ext == old_ext
2353 else filename)
2354 return '%s.%s' % (filename_wo_ext, info_dict['ext'])
2355
2356 # Ensure filename always has a correct extension for successful merge
2357 full_filename = correct_ext(full_filename)
2358 temp_filename = correct_ext(temp_filename)
2359 dl_filename = existing_file(full_filename, temp_filename)
2360 info_dict['__real_download'] = False
2361 if dl_filename is None:
2362 for f in requested_formats:
2363 new_info = dict(info_dict)
2364 new_info.update(f)
2365 fname = prepend_extension(
2366 self.prepare_filename(new_info, 'temp'),
2367 'f%s' % f['format_id'], new_info['ext'])
2368 if not self._ensure_dir_exists(fname):
2369 return
2370 downloaded.append(fname)
2371 partial_success, real_download = dl(fname, new_info)
2372 info_dict['__real_download'] = info_dict['__real_download'] or real_download
2373 success = success and partial_success
2374 if merger.available and not self.params.get('allow_unplayable_formats'):
2375 info_dict['__postprocessors'].append(merger)
2376 info_dict['__files_to_merge'] = downloaded
                            # Even if nothing new was downloaded, the merge itself happens only now
2378 info_dict['__real_download'] = True
2379 else:
2380 for file in downloaded:
2381 files_to_move[file] = None
2382 else:
2383 # Just a single file
2384 dl_filename = existing_file(full_filename, temp_filename)
2385 if dl_filename is None:
2386 success, real_download = dl(temp_filename, info_dict)
2387 info_dict['__real_download'] = real_download
2388
2389 dl_filename = dl_filename or temp_filename
2390 info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
2391
2392 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2393 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
2394 return
2395 except (OSError, IOError) as err:
2396 raise UnavailableVideoError(err)
2397 except (ContentTooShortError, ) as err:
2398 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
2399 return
2400
2401 if success and full_filename != '-':
2402 # Fixup content
2403 fixup_policy = self.params.get('fixup')
2404 if fixup_policy is None:
2405 fixup_policy = 'detect_or_warn'
2406
2407 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg to fix this automatically.'
2408
2409 stretched_ratio = info_dict.get('stretched_ratio')
2410 if stretched_ratio is not None and stretched_ratio != 1:
2411 if fixup_policy == 'warn':
2412 self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
2413 info_dict['id'], stretched_ratio))
2414 elif fixup_policy == 'detect_or_warn':
2415 stretched_pp = FFmpegFixupStretchedPP(self)
2416 if stretched_pp.available:
2417 info_dict['__postprocessors'].append(stretched_pp)
2418 else:
2419 self.report_warning(
2420 '%s: Non-uniform pixel ratio (%s). %s'
2421 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
2422 else:
2423 assert fixup_policy in ('ignore', 'never')
2424
2425 if (info_dict.get('requested_formats') is None
2426 and info_dict.get('container') == 'm4a_dash'
2427 and info_dict.get('ext') == 'm4a'):
2428 if fixup_policy == 'warn':
2429 self.report_warning(
2430 '%s: writing DASH m4a. '
2431 'Only some players support this container.'
2432 % info_dict['id'])
2433 elif fixup_policy == 'detect_or_warn':
2434 fixup_pp = FFmpegFixupM4aPP(self)
2435 if fixup_pp.available:
2436 info_dict['__postprocessors'].append(fixup_pp)
2437 else:
2438 self.report_warning(
2439 '%s: writing DASH m4a. '
2440 'Only some players support this container. %s'
2441 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
2442 else:
2443 assert fixup_policy in ('ignore', 'never')
2444
2445 if ('protocol' in info_dict
2446 and get_suitable_downloader(info_dict, self.params).__name__ == 'HlsFD'):
2447 if fixup_policy == 'warn':
2448 self.report_warning('%s: malformed AAC bitstream detected.' % (
2449 info_dict['id']))
2450 elif fixup_policy == 'detect_or_warn':
2451 fixup_pp = FFmpegFixupM3u8PP(self)
2452 if fixup_pp.available:
2453 info_dict['__postprocessors'].append(fixup_pp)
2454 else:
2455 self.report_warning(
2456 '%s: malformed AAC bitstream detected. %s'
2457 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
2458 else:
2459 assert fixup_policy in ('ignore', 'never')
2460
2461 try:
2462 self.post_process(dl_filename, info_dict, files_to_move)
2463 except PostProcessingError as err:
2464 self.report_error('Postprocessing: %s' % str(err))
2465 return
2466 try:
2467 for ph in self._post_hooks:
2468 ph(full_filename)
2469 except Exception as err:
2470 self.report_error('post hooks: %s' % str(err))
2471 return
2472 must_record_download_archive = True
2473
2474 if must_record_download_archive or self.params.get('force_write_download_archive', False):
2475 self.record_download_archive(info_dict)
2476 max_downloads = self.params.get('max_downloads')
2477 if max_downloads is not None and self._num_downloads >= int(max_downloads):
2478 raise MaxDownloadsReached()
2479
2480 def download(self, url_list):
2481 """Download a given list of URLs."""
2482 outtmpl = self.outtmpl_dict['default']
2483 if (len(url_list) > 1
2484 and outtmpl != '-'
2485 and '%' not in outtmpl
2486 and self.params.get('max_downloads') != 1):
2487 raise SameFileError(outtmpl)
2488
2489 for url in url_list:
2490 try:
2491 # It also downloads the videos
2492 res = self.extract_info(
2493 url, force_generic_extractor=self.params.get('force_generic_extractor', False))
2494 except UnavailableVideoError:
2495 self.report_error('unable to download video')
2496 except MaxDownloadsReached:
2497 self.to_screen('[info] Maximum number of downloaded files reached')
2498 raise
2499 except ExistingVideoReached:
2500 self.to_screen('[info] Encountered a file that is already in the archive, stopping due to --break-on-existing')
2501 raise
2502 except RejectedVideoReached:
2503 self.to_screen('[info] Encountered a file that did not match filter, stopping due to --break-on-reject')
2504 raise
2505 else:
2506 if self.params.get('dump_single_json', False):
2507 self.post_extract(res)
2508 self.to_stdout(json.dumps(res, default=repr))
2509
2510 return self._download_retcode
2511
2512 def download_with_info_file(self, info_filename):
2513 with contextlib.closing(fileinput.FileInput(
2514 [info_filename], mode='r',
2515 openhook=fileinput.hook_encoded('utf-8'))) as f:
            # FileInput doesn't have a read method, so we can't call json.load
2517 info = self.filter_requested_info(json.loads('\n'.join(f)))
2518 try:
2519 self.process_ie_result(info, download=True)
2520 except DownloadError:
2521 webpage_url = info.get('webpage_url')
2522 if webpage_url is not None:
2523 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
2524 return self.download([webpage_url])
2525 else:
2526 raise
2527 return self._download_retcode
2528
2529 @staticmethod
2530 def filter_requested_info(info_dict, actually_filter=True):
2531 if not actually_filter:
2532 return info_dict
2533 exceptions = {
2534 'remove': ['requested_formats', 'requested_subtitles', 'filepath', 'entries'],
2535 'keep': ['_type'],
2536 }
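        # Recursively drop internal keys at every nesting level: anything
        # starting with '_' (except '_type') plus the explicit 'remove' list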
2537 keep_key = lambda k: k in exceptions['keep'] or not (k.startswith('_') or k in exceptions['remove'])
2538 filter_fn = lambda obj: (
2539 list(map(filter_fn, obj)) if isinstance(obj, (list, tuple))
2540 else obj if not isinstance(obj, dict)
2541 else dict((k, filter_fn(v)) for k, v in obj.items() if keep_key(k)))
2542 return filter_fn(info_dict)
2543
2544 def run_pp(self, pp, infodict):
        if '__files_to_move' not in infodict:
            infodict['__files_to_move'] = {}
2548 files_to_delete, infodict = pp.run(infodict)
2549 if not files_to_delete:
2550 return infodict
2551
2552 if self.params.get('keepvideo', False):
2553 for f in files_to_delete:
2554 infodict['__files_to_move'].setdefault(f, '')
2555 else:
2556 for old_filename in set(files_to_delete):
2557 self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
2558 try:
2559 os.remove(encodeFilename(old_filename))
2560 except (IOError, OSError):
2561 self.report_warning('Unable to remove downloaded original file')
2562 if old_filename in infodict['__files_to_move']:
2563 del infodict['__files_to_move'][old_filename]
2564 return infodict
2565
2566 @staticmethod
2567 def post_extract(info_dict):
2568 def actual_post_extract(info_dict):
2569 if info_dict.get('_type') in ('playlist', 'multi_video'):
                for video_dict in info_dict.get('entries') or []:
2571 actual_post_extract(video_dict)
2572 return
2573
2574 if '__post_extractor' not in info_dict:
2575 return
2576 post_extractor = info_dict['__post_extractor']
2577 if post_extractor:
2578 info_dict.update(post_extractor().items())
2579 del info_dict['__post_extractor']
2580 return
2581
2582 actual_post_extract(info_dict)
2583
2584 def pre_process(self, ie_info):
2585 info = dict(ie_info)
2586 for pp in self._pps['beforedl']:
2587 info = self.run_pp(pp, info)
2588 return info
2589
2590 def post_process(self, filename, ie_info, files_to_move=None):
2591 """Run all the postprocessors on the given file."""
2592 info = dict(ie_info)
2593 info['filepath'] = filename
2594 info['__files_to_move'] = files_to_move or {}
2595
2596 for pp in ie_info.get('__postprocessors', []) + self._pps['normal']:
2597 info = self.run_pp(pp, info)
2598 info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
2599 del info['__files_to_move']
2600 for pp in self._pps['aftermove']:
2601 info = self.run_pp(pp, info)
2602
2603 def _make_archive_id(self, info_dict):
2604 video_id = info_dict.get('id')
2605 if not video_id:
2606 return
        # Use lowercase for future-proofing against any change in case
        # and for backwards compatibility with prior versions
2609 extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist
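        # (an archive line then looks like, e.g., 'youtube dQw4w9WgXcQ')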
2610 if extractor is None:
2611 url = str_or_none(info_dict.get('url'))
2612 if not url:
2613 return
2614 # Try to find matching extractor for the URL and take its ie_key
2615 for ie in self._ies:
2616 if ie.suitable(url):
2617 extractor = ie.ie_key()
2618 break
2619 else:
2620 return
2621 return '%s %s' % (extractor.lower(), video_id)
2622
2623 def in_download_archive(self, info_dict):
2624 fn = self.params.get('download_archive')
2625 if fn is None:
2626 return False
2627
2628 vid_id = self._make_archive_id(info_dict)
2629 if not vid_id:
2630 return False # Incomplete video information
2631
2632 return vid_id in self.archive
2633
2634 def record_download_archive(self, info_dict):
2635 fn = self.params.get('download_archive')
2636 if fn is None:
2637 return
2638 vid_id = self._make_archive_id(info_dict)
2639 assert vid_id
2640 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
2641 archive_file.write(vid_id + '\n')
2642 self.archive.add(vid_id)
2643
2644 @staticmethod
2645 def format_resolution(format, default='unknown'):
2646 if format.get('vcodec') == 'none':
2647 return 'audio only'
2648 if format.get('resolution') is not None:
2649 return format['resolution']
2650 if format.get('height') is not None:
2651 if format.get('width') is not None:
2652 res = '%sx%s' % (format['width'], format['height'])
2653 else:
2654 res = '%sp' % format['height']
2655 elif format.get('width') is not None:
2656 res = '%dx?' % format['width']
2657 else:
2658 res = default
2659 return res
2660
2661 def _format_note(self, fdict):
2662 res = ''
2663 if fdict.get('ext') in ['f4f', 'f4m']:
2664 res += '(unsupported) '
2665 if fdict.get('language'):
2666 if res:
2667 res += ' '
2668 res += '[%s] ' % fdict['language']
2669 if fdict.get('format_note') is not None:
2670 res += fdict['format_note'] + ' '
2671 if fdict.get('tbr') is not None:
2672 res += '%4dk ' % fdict['tbr']
2673 if fdict.get('container') is not None:
2674 if res:
2675 res += ', '
2676 res += '%s container' % fdict['container']
2677 if (fdict.get('vcodec') is not None
2678 and fdict.get('vcodec') != 'none'):
2679 if res:
2680 res += ', '
2681 res += fdict['vcodec']
2682 if fdict.get('vbr') is not None:
2683 res += '@'
2684 elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
2685 res += 'video@'
2686 if fdict.get('vbr') is not None:
2687 res += '%4dk' % fdict['vbr']
2688 if fdict.get('fps') is not None:
2689 if res:
2690 res += ', '
2691 res += '%sfps' % fdict['fps']
2692 if fdict.get('acodec') is not None:
2693 if res:
2694 res += ', '
2695 if fdict['acodec'] == 'none':
2696 res += 'video only'
2697 else:
2698 res += '%-5s' % fdict['acodec']
2699 elif fdict.get('abr') is not None:
2700 if res:
2701 res += ', '
2702 res += 'audio'
2703 if fdict.get('abr') is not None:
2704 res += '@%3dk' % fdict['abr']
2705 if fdict.get('asr') is not None:
2706 res += ' (%5dHz)' % fdict['asr']
2707 if fdict.get('filesize') is not None:
2708 if res:
2709 res += ', '
2710 res += format_bytes(fdict['filesize'])
2711 elif fdict.get('filesize_approx') is not None:
2712 if res:
2713 res += ', '
2714 res += '~' + format_bytes(fdict['filesize_approx'])
2715 return res
2716
2717 def _format_note_table(self, f):
2718 def join_fields(*vargs):
2719 return ', '.join((val for val in vargs if val != ''))
2720
2721 return join_fields(
2722 'UNSUPPORTED' if f.get('ext') in ('f4f', 'f4m') else '',
2723 format_field(f, 'language', '[%s]'),
2724 format_field(f, 'format_note'),
2725 format_field(f, 'container', ignore=(None, f.get('ext'))),
2726 format_field(f, 'asr', '%5dHz'))
2727
2728 def list_formats(self, info_dict):
2729 formats = info_dict.get('formats', [info_dict])
2730 new_format = self.params.get('listformats_table', False)
2731 if new_format:
2732 table = [
2733 [
2734 format_field(f, 'format_id'),
2735 format_field(f, 'ext'),
2736 self.format_resolution(f),
2737 format_field(f, 'fps', '%d'),
2738 '|',
2739 format_field(f, 'filesize', ' %s', func=format_bytes) + format_field(f, 'filesize_approx', '~%s', func=format_bytes),
2740 format_field(f, 'tbr', '%4dk'),
2741 f.get('protocol').replace('http_dash_segments', 'dash').replace("native", "n").replace('niconico_', ''),
2742 '|',
2743 format_field(f, 'vcodec', default='unknown').replace('none', ''),
2744 format_field(f, 'vbr', '%4dk'),
2745 format_field(f, 'acodec', default='unknown').replace('none', ''),
2746 format_field(f, 'abr', '%3dk'),
2747 format_field(f, 'asr', '%5dHz'),
2748 self._format_note_table(f)]
2749 for f in formats
2750 if f.get('preference') is None or f['preference'] >= -1000]
2751 header_line = ['ID', 'EXT', 'RESOLUTION', 'FPS', '|', ' FILESIZE', ' TBR', 'PROTO',
2752 '|', 'VCODEC', ' VBR', 'ACODEC', ' ABR', ' ASR', 'NOTE']
2753 else:
2754 table = [
2755 [
2756 format_field(f, 'format_id'),
2757 format_field(f, 'ext'),
2758 self.format_resolution(f),
2759 self._format_note(f)]
2760 for f in formats
2761 if f.get('preference') is None or f['preference'] >= -1000]
2762 header_line = ['format code', 'extension', 'resolution', 'note']
2763
2764 self.to_screen(
2765 '[info] Available formats for %s:\n%s' % (info_dict['id'], render_table(
2766 header_line,
2767 table,
2768 delim=new_format,
2769 extraGap=(0 if new_format else 1),
2770 hideEmpty=new_format)))
2771
2772 def list_thumbnails(self, info_dict):
2773 thumbnails = info_dict.get('thumbnails')
2774 if not thumbnails:
2775 self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
2776 return
2777
2778 self.to_screen(
2779 '[info] Thumbnails for %s:' % info_dict['id'])
2780 self.to_screen(render_table(
2781 ['ID', 'width', 'height', 'URL'],
2782 [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
2783
2784 def list_subtitles(self, video_id, subtitles, name='subtitles'):
2785 if not subtitles:
2786 self.to_screen('%s has no %s' % (video_id, name))
2787 return
2788 self.to_screen(
2789 'Available %s for %s:' % (name, video_id))
2790 self.to_screen(render_table(
2791 ['Language', 'formats'],
2792 [[lang, ', '.join(f['ext'] for f in reversed(formats))]
2793 for lang, formats in subtitles.items()]))
2794
2795 def urlopen(self, req):
2796 """ Start an HTTP download """
2797 if isinstance(req, compat_basestring):
2798 req = sanitized_Request(req)
2799 return self._opener.open(req, timeout=self._socket_timeout)
2800
2801 def print_debug_header(self):
2802 if not self.params.get('verbose'):
2803 return
2804
2805 if type('') is not compat_str:
2806 # Python 2.6 on SLES11 SP1 (https://github.com/ytdl-org/youtube-dl/issues/3326)
2807 self.report_warning(
2808 'Your Python is broken! Update to a newer and supported version')
2809
2810 stdout_encoding = getattr(
2811 sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
2812 encoding_str = (
2813 '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
2814 locale.getpreferredencoding(),
2815 sys.getfilesystemencoding(),
2816 stdout_encoding,
2817 self.get_encoding()))
2818 write_string(encoding_str, encoding=None)
2819
2820 source = (
2821 '(exe)' if hasattr(sys, 'frozen')
2822 else '(zip)' if isinstance(globals().get('__loader__'), zipimporter)
2823 else '(source)' if os.path.basename(sys.argv[0]) == '__main__.py'
2824 else '')
2825 self._write_string('[debug] yt-dlp version %s %s\n' % (__version__, source))
2826 if _LAZY_LOADER:
2827 self._write_string('[debug] Lazy loading extractors enabled\n')
2828 if _PLUGIN_CLASSES:
2829 self._write_string(
2830 '[debug] Plugin Extractors: %s\n' % [ie.ie_key() for ie in _PLUGIN_CLASSES])
2831 try:
2832 sp = subprocess.Popen(
2833 ['git', 'rev-parse', '--short', 'HEAD'],
2834 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
2835 cwd=os.path.dirname(os.path.abspath(__file__)))
2836 out, err = process_communicate_or_kill(sp)
2837 out = out.decode().strip()
2838 if re.match('[0-9a-f]+', out):
2839 self._write_string('[debug] Git HEAD: %s\n' % out)
2840 except Exception:
2841 try:
2842 sys.exc_clear()
2843 except Exception:
2844 pass
2845
2846 def python_implementation():
2847 impl_name = platform.python_implementation()
2848 if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'):
2849 return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3]
2850 return impl_name
2851
2852 self._write_string('[debug] Python version %s (%s %s) - %s\n' % (
2853 platform.python_version(),
2854 python_implementation(),
2855 platform.architecture()[0],
2856 platform_name()))
2857
2858 exe_versions = FFmpegPostProcessor.get_versions(self)
2859 exe_versions['rtmpdump'] = rtmpdump_version()
2860 exe_versions['phantomjs'] = PhantomJSwrapper._version()
2861 exe_str = ', '.join(
2862 '%s %s' % (exe, v)
2863 for exe, v in sorted(exe_versions.items())
2864 if v
2865 )
2866 if not exe_str:
2867 exe_str = 'none'
2868 self._write_string('[debug] exe versions: %s\n' % exe_str)
2869
2870 proxy_map = {}
2871 for handler in self._opener.handlers:
2872 if hasattr(handler, 'proxies'):
2873 proxy_map.update(handler.proxies)
2874 self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
2875
2876 if self.params.get('call_home', False):
2877 ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
2878 self._write_string('[debug] Public IP address: %s\n' % ipaddr)
2879 return
2880 latest_version = self.urlopen(
2881 'https://yt-dl.org/latest/version').read().decode('utf-8')
2882 if version_tuple(latest_version) > version_tuple(__version__):
2883 self.report_warning(
2884 'You are using an outdated version (newest version: %s)! '
2885 'See https://yt-dl.org/update if you need help updating.' %
2886 latest_version)
2887
2888 def _setup_opener(self):
2889 timeout_val = self.params.get('socket_timeout')
2890 self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
2891
2892 opts_cookiefile = self.params.get('cookiefile')
2893 opts_proxy = self.params.get('proxy')
2894
2895 if opts_cookiefile is None:
2896 self.cookiejar = compat_cookiejar.CookieJar()
2897 else:
2898 opts_cookiefile = expand_path(opts_cookiefile)
2899 self.cookiejar = YoutubeDLCookieJar(opts_cookiefile)
2900 if os.access(opts_cookiefile, os.R_OK):
2901 self.cookiejar.load(ignore_discard=True, ignore_expires=True)
2902
2903 cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
2904 if opts_proxy is not None:
2905 if opts_proxy == '':
2906 proxies = {}
2907 else:
2908 proxies = {'http': opts_proxy, 'https': opts_proxy}
2909 else:
2910 proxies = compat_urllib_request.getproxies()
2911 # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805)
2912 if 'http' in proxies and 'https' not in proxies:
2913 proxies['https'] = proxies['http']
2914 proxy_handler = PerRequestProxyHandler(proxies)
2915
2916 debuglevel = 1 if self.params.get('debug_printtraffic') else 0
2917 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
2918 ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
2919 redirect_handler = YoutubeDLRedirectHandler()
2920 data_handler = compat_urllib_request_DataHandler()
2921
2922 # When passing our own FileHandler instance, build_opener won't add the
2923 # default FileHandler and allows us to disable the file protocol, which
2924 # can be used for malicious purposes (see
2925 # https://github.com/ytdl-org/youtube-dl/issues/8227)
2926 file_handler = compat_urllib_request.FileHandler()
2927
2928 def file_open(*args, **kwargs):
2929 raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in yt-dlp for security reasons')
2930 file_handler.file_open = file_open
2931
2932 opener = compat_urllib_request.build_opener(
2933 proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)
2934
2935 # Delete the default user-agent header, which would otherwise apply in
2936 # cases where our custom HTTP handler doesn't come into play
2937 # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
2938 opener.addheaders = []
2939 self._opener = opener
2940
2941 def encode(self, s):
2942 if isinstance(s, bytes):
2943 return s # Already encoded
2944
2945 try:
2946 return s.encode(self.get_encoding())
2947 except UnicodeEncodeError as err:
2948 err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2949 raise
2950
2951 def get_encoding(self):
2952 encoding = self.params.get('encoding')
2953 if encoding is None:
2954 encoding = preferredencoding()
2955 return encoding
2956
2957 def _write_thumbnails(self, info_dict, filename): # return the extensions
2958 write_all = self.params.get('write_all_thumbnails', False)
2959 thumbnails = []
2960 if write_all or self.params.get('writethumbnail', False):
2961 thumbnails = info_dict.get('thumbnails') or []
2962 multiple = write_all and len(thumbnails) > 1
2963
2964 ret = []
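        # Walk thumbnails from the best (last) one unless writing all of them;
        # when writing just one, stop after the first success (see break below)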
2965 for t in thumbnails[::1 if write_all else -1]:
2966 thumb_ext = determine_ext(t['url'], 'jpg')
2967 suffix = '%s.' % t['id'] if multiple else ''
2968 thumb_display_id = '%s ' % t['id'] if multiple else ''
2969 t['filepath'] = thumb_filename = replace_extension(filename, suffix + thumb_ext, info_dict.get('ext'))
2970
2971 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(thumb_filename)):
2972 ret.append(suffix + thumb_ext)
2973 self.to_screen('[%s] %s: Thumbnail %sis already present' %
2974 (info_dict['extractor'], info_dict['id'], thumb_display_id))
2975 else:
2976 self.to_screen('[%s] %s: Downloading thumbnail %s ...' %
2977 (info_dict['extractor'], info_dict['id'], thumb_display_id))
2978 try:
2979 uf = self.urlopen(t['url'])
2980 with open(encodeFilename(thumb_filename), 'wb') as thumbf:
2981 shutil.copyfileobj(uf, thumbf)
2982 ret.append(suffix + thumb_ext)
2983 self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
2984 (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
2985 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2986 self.report_warning('Unable to download thumbnail "%s": %s' %
2987 (t['url'], error_to_compat_str(err)))
2988 if ret and not write_all:
2989 break
2990 return ret