#!/usr/bin/env python
# coding: utf-8

from __future__ import absolute_import, unicode_literals

import collections
import contextlib
import copy
import datetime
import errno
import fileinput
import io
import itertools
import json
import locale
import operator
import os
import platform
import re
import shutil
import subprocess
import socket
import sys
import time
import tokenize
import traceback
import random

from string import ascii_letters

from .compat import (
    compat_basestring,
    compat_cookiejar,
    compat_get_terminal_size,
    compat_http_client,
    compat_kwargs,
    compat_numeric_types,
    compat_os_name,
    compat_str,
    compat_tokenize_tokenize,
    compat_urllib_error,
    compat_urllib_request,
    compat_urllib_request_DataHandler,
)
from .utils import (
    age_restricted,
    args_to_str,
    ContentTooShortError,
    date_from_str,
    DateRange,
    DEFAULT_OUTTMPL,
    determine_ext,
    determine_protocol,
    DownloadError,
    encode_compat_str,
    encodeFilename,
    error_to_compat_str,
    expand_path,
    ExtractorError,
    format_bytes,
    formatSeconds,
    GeoRestrictedError,
    int_or_none,
    ISO3166Utils,
    locked_file,
    make_HTTPS_handler,
    MaxDownloadsReached,
    orderedSet,
    PagedList,
    parse_filesize,
    PerRequestProxyHandler,
    platform_name,
    PostProcessingError,
    preferredencoding,
    prepend_extension,
    register_socks_protocols,
    render_table,
    replace_extension,
    SameFileError,
    sanitize_filename,
    sanitize_path,
    sanitize_url,
    sanitized_Request,
    std_headers,
    str_or_none,
    subtitles_filename,
    UnavailableVideoError,
    url_basename,
    version_tuple,
    write_json_file,
    write_string,
    YoutubeDLCookieJar,
    YoutubeDLCookieProcessor,
    YoutubeDLHandler,
    YoutubeDLRedirectHandler,
)
from .cache import Cache
from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
from .extractor.openload import PhantomJSwrapper
from .downloader import get_suitable_downloader
from .downloader.rtmp import rtmpdump_version
from .postprocessor import (
    FFmpegFixupM3u8PP,
    FFmpegFixupM4aPP,
    FFmpegFixupStretchedPP,
    FFmpegMergerPP,
    FFmpegPostProcessor,
    FFmpegSubtitlesConvertorPP,
    get_postprocessor,
)
from .version import __version__

if compat_os_name == 'nt':
    import ctypes


class YoutubeDL(object):
    """YoutubeDL class.

    YoutubeDL objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. Since, given a video URL, the downloader doesn't know how
    to extract all the needed information (that is the InfoExtractors'
    task), it has to pass the URL to one of them.

    For this, YoutubeDL objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    YoutubeDL processes the extracted information, possibly using a File
    Downloader to download the video.

    YoutubeDL objects accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The YoutubeDL also
    registers itself as the downloader in charge of the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:          Username for authentication purposes.
    password:          Password for authentication purposes.
    videopassword:     Password for accessing a video.
    ap_mso:            Adobe Pass multiple-system operator identifier.
    ap_username:       Multiple-system operator account username.
    ap_password:       Multiple-system operator account password.
    usenetrc:          Use netrc for authentication instead.
    verbose:           Print additional info to stdout.
    quiet:             Do not print messages to stdout.
    no_warnings:       Do not print out anything for warnings.
    forceurl:          Force printing final URL.
    forcetitle:        Force printing title.
    forceid:           Force printing ID.
    forcethumbnail:    Force printing thumbnail URL.
    forcedescription:  Force printing description.
    forcefilename:     Force printing final filename.
    forceduration:     Force printing duration.
    forcejson:         Force printing info_dict as JSON.
    dump_single_json:  Force printing the info_dict of the whole playlist
                       (or video) as a single JSON line.
    simulate:          Do not download the video files.
    format:            Video format code. See "FORMAT SELECTION" for more details.
    format_sort:       How to sort the video formats. See "Sorting Formats"
                       for more details.
    format_sort_force: Force the given format_sort. See "Sorting Formats"
                       for more details.
    allow_multiple_video_streams:  Allow multiple video streams to be merged
                       into a single file
    allow_multiple_audio_streams:  Allow multiple audio streams to be merged
                       into a single file
    outtmpl:           Template for output names.
    restrictfilenames: Do not allow "&" and spaces in file names.
    trim_file_name:    Limit length of filename (extension excluded).
    ignoreerrors:      Do not stop on download errors.
    force_generic_extractor: Force downloader to use the generic extractor
    nooverwrites:      Prevent overwriting files.
    playliststart:     Playlist item to start at.
    playlistend:       Playlist item to end at.
    playlist_items:    Specific indices of playlist to download.
    playlistreverse:   Download playlist items in reverse order.
    playlistrandom:    Download playlist items in random order.
    matchtitle:        Download only matching titles.
    rejecttitle:       Reject downloads for matching titles.
    logger:            Log messages to a logging.Logger instance.
    logtostderr:       Log messages to stderr instead of stdout.
    writedescription:  Write the video description to a .description file
    writeinfojson:     Write the video metadata to a .info.json file
    writeannotations:  Write the video annotations to a .annotations.xml file
    writethumbnail:    Write the thumbnail image to a file
    write_all_thumbnails:  Write all thumbnail formats to files
    writesubtitles:    Write the video subtitles to a file
    writeautomaticsub: Write the automatically generated subtitles to a file
    allsubtitles:      Downloads all the subtitles of the video
                       (requires writesubtitles or writeautomaticsub)
    listsubtitles:     Lists all available subtitles for the video
    subtitlesformat:   The format code for subtitles
    subtitleslangs:    List of languages of the subtitles to download
    keepvideo:         Keep the video file after post-processing
    daterange:         A DateRange object; download only if the upload_date
                       is in the range.
    skip_download:     Skip the actual download of the video file
    cachedir:          Location of the cache files in the filesystem.
                       False to disable filesystem cache.
    noplaylist:        Download single video instead of a playlist if in doubt.
    age_limit:         An integer representing the user's age in years.
                       Unsuitable videos for the given age are skipped.
    min_views:         An integer representing the minimum view count the video
                       must have in order to not be skipped.
                       Videos without view count information are always
                       downloaded. None for no limit.
    max_views:         An integer representing the maximum view count.
                       Videos that are more popular than that are not
                       downloaded.
                       Videos without view count information are always
                       downloaded. None for no limit.
    download_archive:  File name of a file where all downloads are recorded.
                       Videos already present in the file are not downloaded
                       again.
    break_on_existing: Stop the download process after attempting to download
                       a file that's in the archive.
    cookiefile:        File name where cookies should be read from and dumped to.
    nocheckcertificate: Do not verify SSL certificates
    prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
                       At the moment, this is only supported by YouTube.
    proxy:             URL of the proxy server to use
    geo_verification_proxy:  URL of the proxy to use for IP address verification
                       on geo-restricted sites.
    socket_timeout:    Time to wait for unresponsive hosts, in seconds
    bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
    debug_printtraffic: Print out sent and received HTTP traffic
    include_ads:       Download ads as well
    default_search:    Prepend this string if an input URL is not valid.
                       'auto' for elaborate guessing
    encoding:          Use this encoding instead of the system-specified one.
    extract_flat:      Do not resolve URLs, return the immediate result.
                       Pass in 'in_playlist' to only show this behavior for
                       playlist items.
    postprocessors:    A list of dictionaries, each with an entry
                       * key: The name of the postprocessor. See
                         youtube_dlc/postprocessor/__init__.py for a list.
                       as well as any further keyword arguments for the
                       postprocessor.
    progress_hooks:    A list of functions that get called on download
                       progress, with a dictionary with the entries
                       * status: One of "downloading", "error", or "finished".
                                 Check this first and ignore unknown values.

                       If status is one of "downloading" or "finished", the
                       following properties may also be present:
                       * filename: The final filename (always present)
                       * tmpfilename: The filename we're currently writing to
                       * downloaded_bytes: Bytes on disk
                       * total_bytes: Size of the whole file, None if unknown
                       * total_bytes_estimate: Guess of the eventual file size,
                                               None if unavailable.
                       * elapsed: The number of seconds since download started.
                       * eta: The estimated time in seconds, None if unknown
                       * speed: The download speed in bytes/second, None if
                                unknown
                       * fragment_index: The counter of the currently
                                         downloaded video fragment.
                       * fragment_count: The number of fragments (= individual
                                         files that will be merged)

                       Progress hooks are guaranteed to be called at least once
                       (with status "finished") if the download is successful.
    merge_output_format: Extension to use when merging formats.
    fixup:             Automatically correct known faults of the file.
                       One of:
                       - "never": do nothing
                       - "warn": only emit a warning
                       - "detect_or_warn": check whether we can do anything
                                           about it, warn otherwise (default)
    source_address:    Client-side IP address to bind to.
    call_home:         Boolean, true iff we are allowed to contact the
                       youtube-dlc servers for debugging.
    sleep_interval:    Number of seconds to sleep before each download when
                       used alone or a lower bound of a range for randomized
                       sleep before each download (minimum possible number
                       of seconds to sleep) when used along with
                       max_sleep_interval.
    max_sleep_interval: Upper bound of a range for randomized sleep before each
                       download (maximum possible number of seconds to sleep).
                       Must only be used along with sleep_interval.
                       Actual sleep time will be a random float from range
                       [sleep_interval; max_sleep_interval].
    listformats:       Print an overview of available video formats and exit.
    list_thumbnails:   Print a table of all thumbnails and exit.
    match_filter:      A function that gets called with the info_dict of
                       every video.
                       If it returns a message, the video is ignored.
                       If it returns None, the video is downloaded.
                       match_filter_func in utils.py is one example for this.
    no_color:          Do not emit color codes in output.
    geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
                       HTTP header
    geo_bypass_country:
                       Two-letter ISO 3166-1 alpha-2 country code that will be
                       used for explicit geographic restriction bypassing via
                       faking X-Forwarded-For HTTP header
    geo_bypass_ip_block:
                       IP range in CIDR notation that will be used similarly to
                       geo_bypass_country

    The following options determine which downloader is picked:
    external_downloader: Executable of the external downloader to call.
                       None or unset for standard (built-in) downloader.
    hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
                       if True; use ffmpeg/avconv if False; use the downloader
                       suggested by the extractor if None.

    The following parameters are not used by YoutubeDL itself, they are used by
    the downloader (see youtube_dlc/downloader/common.py):
    nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
    noresizebuffer, retries, continuedl, noprogress, consoletitle,
    xattr_set_filesize, external_downloader_args, hls_use_mpegts,
    http_chunk_size.

    The following options are used by the post processors:
    prefer_ffmpeg:     If False, use avconv instead of ffmpeg if both are
                       available; otherwise prefer ffmpeg.
    ffmpeg_location:   Location of the ffmpeg/avconv binary; either the path
                       to the binary or its containing directory.
    postprocessor_args: A list of additional command-line arguments for the
                       postprocessor.

    The following options are used by the Youtube extractor:
    youtube_include_dash_manifest: If True (default), DASH manifests and related
                       data will be downloaded and processed by extractor.
                       You can reduce network I/O by disabling it if you don't
                       care about DASH.
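
    A minimal usage sketch (for illustration only; assumes the download()
    method defined further down in this class):

        with YoutubeDL({'format': 'bestvideo+bestaudio/best'}) as ydl:
            ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])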
331 """
332
333 _NUMERIC_FIELDS = set((
334 'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
335 'timestamp', 'upload_year', 'upload_month', 'upload_day',
336 'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
337 'average_rating', 'comment_count', 'age_limit',
338 'start_time', 'end_time',
339 'chapter_number', 'season_number', 'episode_number',
340 'track_number', 'disc_number', 'release_year',
341 'playlist_index',
342 ))
343
344 params = None
345 _ies = []
346 _pps = []
347 _download_retcode = None
348 _num_downloads = None
349 _screen_file = None
350
351 def __init__(self, params=None, auto_init=True):
352 """Create a FileDownloader object with the given options."""
353 if params is None:
354 params = {}
355 self._ies = []
356 self._ies_instances = {}
357 self._pps = []
358 self._progress_hooks = []
359 self._download_retcode = 0
360 self._num_downloads = 0
361 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
362 self._err_file = sys.stderr
363 self.params = {
364 # Default parameters
365 'nocheckcertificate': False,
366 }
367 self.params.update(params)
368 self.cache = Cache(self)
369 self.archive = set()
370
371 """Preload the archive, if any is specified"""
372 def preload_download_archive(self):
373 fn = self.params.get('download_archive')
374 if fn is None:
375 return False
376 try:
377 with locked_file(fn, 'r', encoding='utf-8') as archive_file:
378 for line in archive_file:
379 self.archive.add(line.strip())
380 except IOError as ioe:
381 if ioe.errno != errno.ENOENT:
382 raise
383 return False
384 return True
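
        # Note: each archive line is assumed to follow the usual
        # "<extractor_key> <video_id>" layout (e.g. "youtube BaW_jenozKc");
        # every stripped line becomes one membership entry in self.archive,
        # which in_download_archive() checks against later.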

        def check_deprecated(param, option, suggestion):
            if self.params.get(param) is not None:
                self.report_warning(
                    '%s is deprecated. Use %s instead.' % (option, suggestion))
                return True
            return False

        if self.params.get('verbose'):
            self.to_stdout('[debug] Loading archive file %r' % self.params.get('download_archive'))

        preload_download_archive(self)

        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N is the number of digits')
        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')

        if params.get('bidi_workaround', False):
            try:
                import pty
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround. Make sure that fribidi is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.platform != 'win32'
                and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
                and not params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        if isinstance(params.get('outtmpl'), bytes):
            self.report_warning(
                'Parameter outtmpl is bytes, but should be a unicode string. '
                'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.')

        self._setup_opener()

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        for pp_def_raw in self.params.get('postprocessors', []):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)

        register_socks_protocols()

    def warn_if_short_id(self, argv):
        # short YouTube ID starting with dash?
        idxs = [
            i for i, a in enumerate(argv)
            if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
        if idxs:
            correct_argv = (
                ['youtube-dlc']
                + [a for i, a in enumerate(argv) if i not in idxs]
                + ['--'] + [argv[i] for i in idxs]
            )
            self.report_warning(
                'Long argument string detected. '
                'Use -- to separate parameters and URLs, like this:\n%s\n' %
                args_to_str(correct_argv))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        if not isinstance(ie, type):
            self._ies_instances[ie.ie_key()] = ie
            ie.set_downloader(self)

    def get_info_extractor(self, ie_key):
        """
        Get an instance of an IE with name ie_key. It will try to get one from
        the _ies list; if there is no instance, it will create a new one and
        add it to the extractor list.
        """
        ie = self._ies_instances.get(ie_key)
        if ie is None:
            ie = get_info_extractor(ie_key)()
            self.add_info_extractor(ie)
        return ie

    def add_default_info_extractors(self):
        """
        Add the InfoExtractors returned by gen_extractors to the end of the list
        """
        for ie in gen_extractor_classes():
            self.add_info_extractor(ie)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        pp.set_downloader(self)

    def add_progress_hook(self, ph):
        """Add the progress hook (currently only for the file downloader)"""
        self._progress_hooks.append(ph)

    def _bidi_workaround(self, message):
        if not hasattr(self, '_output_channel'):
            return message

        assert hasattr(self, '_output_process')
        assert isinstance(message, compat_str)
        line_count = message.count('\n') + 1
        self._output_process.stdin.write((message + '\n').encode('utf-8'))
        self._output_process.stdin.flush()
        res = ''.join(self._output_channel.readline().decode('utf-8')
                      for _ in range(line_count))
        return res[:-len('\n')]

    def to_screen(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        return self.to_stdout(message, skip_eol, check_quiet=True)

    def _write_string(self, s, out=None):
        write_string(s, out=out, encoding=self.params.get('encoding'))

    def to_stdout(self, message, skip_eol=False, check_quiet=False):
        """Print message to stdout if not in quiet mode."""
        if self.params.get('logger'):
            self.params['logger'].debug(message)
        elif not check_quiet or not self.params.get('quiet', False):
            message = self._bidi_workaround(message)
            terminator = ['\n', ''][skip_eol]
            output = message + terminator

            self._write_string(output, self._screen_file)

    def to_stderr(self, message):
        """Print message to stderr."""
        assert isinstance(message, compat_str)
        if self.params.get('logger'):
            self.params['logger'].error(message)
        else:
            message = self._bidi_workaround(message)
            output = message + '\n'
            self._write_string(output, self._err_file)

    def to_console_title(self, message):
        if not self.params.get('consoletitle', False):
            return
        if compat_os_name == 'nt':
            if ctypes.windll.kernel32.GetConsoleWindow():
                # c_wchar_p() might not be necessary if `message` is
                # already of type unicode()
                ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            self._write_string('\033]0;%s\007' % message, self._screen_file)

    def save_console_title(self):
        if not self.params.get('consoletitle', False):
            return
        if self.params.get('simulate', False):
            return
        if compat_os_name != 'nt' and 'TERM' in os.environ:
            # Save the title on stack
            self._write_string('\033[22;0t', self._screen_file)

    def restore_console_title(self):
        if not self.params.get('consoletitle', False):
            return
        if self.params.get('simulate', False):
            return
        if compat_os_name != 'nt' and 'TERM' in os.environ:
            # Restore the title from stack
            self._write_string('\033[23;0t', self._screen_file)

    def __enter__(self):
        self.save_console_title()
        return self

    def __exit__(self, *args):
        self.restore_console_title()

        if self.params.get('cookiefile') is not None:
            self.cookiejar.save(ignore_discard=True, ignore_expires=True)

    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on whether the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        self._download_retcode = 1

    def report_warning(self, message):
        '''
        Print the message to stderr; it will be prefixed with 'WARNING:'.
        If stderr is a tty file, the 'WARNING:' will be colored.
        '''
        if self.params.get('logger') is not None:
            self.params['logger'].warning(message)
        else:
            if self.params.get('no_warnings'):
                return
            if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
                _msg_header = '\033[0;33mWARNING:\033[0m'
            else:
                _msg_header = 'WARNING:'
            warning_message = '%s %s' % (_msg_header, message)
            self.to_stderr(warning_message)

    def report_error(self, message, tb=None):
        '''
        Do the same as trouble, but prefix the message with 'ERROR:', colored
        in red if stderr is a tty file.
        '''
        if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
            _msg_header = '\033[0;31mERROR:\033[0m'
        else:
            _msg_header = 'ERROR:'
        error_message = '%s %s' % (_msg_header, message)
        self.trouble(error_message, tb)

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen('[download] %s has already been downloaded' % file_name)
        except UnicodeEncodeError:
            self.to_screen('[download] The file has already been downloaded')

    def prepare_filename(self, info_dict):
        """Generate the output filename."""
        try:
            template_dict = dict(info_dict)

            template_dict['epoch'] = int(time.time())
            autonumber_size = self.params.get('autonumber_size')
            if autonumber_size is None:
                autonumber_size = 5
            template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
            if template_dict.get('resolution') is None:
                if template_dict.get('width') and template_dict.get('height'):
                    template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
                elif template_dict.get('height'):
                    template_dict['resolution'] = '%sp' % template_dict['height']
                elif template_dict.get('width'):
                    template_dict['resolution'] = '%dx?' % template_dict['width']

            sanitize = lambda k, v: sanitize_filename(
                compat_str(v),
                restricted=self.params.get('restrictfilenames'),
                is_id=(k == 'id' or k.endswith('_id')))
            template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
                                 for k, v in template_dict.items()
                                 if v is not None and not isinstance(v, (list, tuple, dict)))
            template_dict = collections.defaultdict(lambda: 'NA', template_dict)

            outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)

            # For fields playlist_index and autonumber convert all occurrences
            # of %(field)s to %(field)0Nd for backward compatibility
            field_size_compat_map = {
                'playlist_index': len(str(template_dict['n_entries'])),
                'autonumber': autonumber_size,
            }
            FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
            mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
            if mobj:
                outtmpl = re.sub(
                    FIELD_SIZE_COMPAT_RE,
                    r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
                    outtmpl)
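
            # For illustration: with a 100-entry playlist,
            # '%(playlist_index)s' in the template becomes
            # '%(playlist_index)03d', so indices render as 001..100.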

            # Missing numeric fields used together with integer presentation types
            # in format specification will break the argument substitution since
            # string 'NA' is returned for missing fields. We will patch output
            # template for missing fields to meet string presentation type.
            for numeric_field in self._NUMERIC_FIELDS:
                if numeric_field not in template_dict:
                    # As of [1] format syntax is:
                    # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
                    # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
                    FORMAT_RE = r'''(?x)
                        (?<!%)
                        %
                        \({0}\)  # mapping key
                        (?:[#0\-+ ]+)?  # conversion flags (optional)
                        (?:\d+)?  # minimum field width (optional)
                        (?:\.\d+)?  # precision (optional)
                        [hlL]?  # length modifier (optional)
                        [diouxXeEfFgGcrs%]  # conversion type
                    '''
                    outtmpl = re.sub(
                        FORMAT_RE.format(numeric_field),
                        r'%({0})s'.format(numeric_field), outtmpl)

            # expand_path translates '%%' into '%' and '$$' into '$',
            # which is not what we want, since we need to keep '%%' intact for
            # the template dict substitution step. Work around this with a
            # boundary-like separator hack.
            sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
            outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
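
            # For illustration: in '100%% of %(title)s', '%%' becomes '%<sep>%'
            # so expand_path() below cannot collapse it; replace(sep, '') then
            # restores '%%' in time for the '%' substitution into template_dict.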

            # outtmpl should be expand_path'ed before template dict substitution
            # because meta fields may contain env variables we don't want to
            # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
            # title "Hello $PATH", we don't want `$PATH` to be expanded.
            filename = expand_path(outtmpl).replace(sep, '') % template_dict

            # https://github.com/blackjack4494/youtube-dlc/issues/85
            trim_file_name = self.params.get('trim_file_name', False)
            if trim_file_name:
                fn_groups = filename.rsplit('.')
                ext = fn_groups[-1]
                sub_ext = ''
                if len(fn_groups) > 2:
                    sub_ext = fn_groups[-2]
                filename = '.'.join(filter(None, [fn_groups[0][:trim_file_name], sub_ext, ext]))

            # Temporary fix for #4787
            # 'Treat' all problem characters by passing filename through preferredencoding
            # to work around encoding issues with subprocess on python2 @ Windows
            if sys.version_info < (3, 0) and sys.platform == 'win32':
                filename = encodeFilename(filename, True).decode(preferredencoding())
            return sanitize_path(filename)
        except ValueError as err:
            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
            return None

    def _match_entry(self, info_dict, incomplete):
        """ Returns None if the file should be downloaded """

        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        if 'title' in info_dict:
            # This can happen when we're just evaluating the playlist
            title = info_dict['title']
            matchtitle = self.params.get('matchtitle', False)
            if matchtitle:
                if not re.search(matchtitle, title, re.IGNORECASE):
                    return '"' + title + '" title did not match pattern "' + matchtitle + '"'
            rejecttitle = self.params.get('rejecttitle', False)
            if rejecttitle:
                if re.search(rejecttitle, title, re.IGNORECASE):
                    return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
        date = info_dict.get('upload_date')
        if date is not None:
            dateRange = self.params.get('daterange', DateRange())
            if date not in dateRange:
                return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
        view_count = info_dict.get('view_count')
        if view_count is not None:
            min_views = self.params.get('min_views')
            if min_views is not None and view_count < min_views:
                return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
            max_views = self.params.get('max_views')
            if max_views is not None and view_count > max_views:
                return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
        if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
            return 'Skipping "%s" because it is age restricted' % video_title
        if self.in_download_archive(info_dict):
            return '%s has already been recorded in archive' % video_title

        if not incomplete:
            match_filter = self.params.get('match_filter')
            if match_filter is not None:
                ret = match_filter(info_dict)
                if ret is not None:
                    return ret

        return None

    @staticmethod
    def add_extra_info(info_dict, extra_info):
        '''Set the keys from extra_info in info dict if they are missing'''
        for key, value in extra_info.items():
            info_dict.setdefault(key, value)

    def extract_info(self, url, download=True, ie_key=None, info_dict=None, extra_info={},
                     process=True, force_generic_extractor=False):
        '''
        Returns a list with a dictionary for each video we find.
        If 'download', also downloads the videos.
        extra_info is a dict containing the extra values to add to each result
        '''

        if not ie_key and force_generic_extractor:
            ie_key = 'Generic'

        if ie_key:
            ies = [self.get_info_extractor(ie_key)]
        else:
            ies = self._ies

        for ie in ies:
            if not ie.suitable(url):
                continue

            ie_key = ie.ie_key()
            ie = self.get_info_extractor(ie_key)
            if not ie.working():
                self.report_warning('The program functionality for this site has been marked as broken, '
                                    'and will probably not work.')

            try:
                temp_id = ie.extract_id(url) if callable(getattr(ie, 'extract_id', None)) else ie._match_id(url)
            except (AssertionError, IndexError, AttributeError):
                temp_id = None
            if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
                self.to_screen("[%s] %s: has already been recorded in archive" % (
                    ie_key, temp_id))
                break

            return self.__extract_info(url, ie, download, extra_info, process, info_dict)

        else:
            self.report_error('no suitable InfoExtractor for URL %s' % url)

    def __handle_extraction_exceptions(func):
        def wrapper(self, *args, **kwargs):
            try:
                return func(self, *args, **kwargs)
            except GeoRestrictedError as e:
                msg = e.msg
                if e.countries:
                    msg += '\nThis video is available in %s.' % ', '.join(
                        map(ISO3166Utils.short2full, e.countries))
                msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to work around this.'
                self.report_error(msg)
            except ExtractorError as e:  # An error we somewhat expected
                self.report_error(compat_str(e), e.format_traceback())
            except MaxDownloadsReached:
                raise
            except Exception as e:
                if self.params.get('ignoreerrors', False):
                    self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
                else:
                    raise
        return wrapper

    @__handle_extraction_exceptions
    def __extract_info(self, url, ie, download, extra_info, process, info_dict):
        ie_result = ie.extract(url)
        if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
            return
        if isinstance(ie_result, list):
            # Backwards compatibility: old IE result format
            ie_result = {
                '_type': 'compat_list',
                'entries': ie_result,
            }
        if info_dict:
            if info_dict.get('id'):
                ie_result['id'] = info_dict['id']
            if info_dict.get('title'):
                ie_result['title'] = info_dict['title']
        self.add_default_extra_info(ie_result, ie, url)
        if process:
            return self.process_ie_result(ie_result, download, extra_info)
        else:
            return ie_result

    def add_default_extra_info(self, ie_result, ie, url):
        self.add_extra_info(ie_result, {
            'extractor': ie.IE_NAME,
            'webpage_url': url,
            'webpage_url_basename': url_basename(url),
            'extractor_key': ie.ie_key(),
        })

    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie (may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.
        """
        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            ie_result['url'] = sanitize_url(ie_result['url'])
            extract_flat = self.params.get('extract_flat', False)
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
                    or extract_flat is True):
                self.__forced_printings(
                    ie_result, self.prepare_filename(ie_result),
                    incomplete=True)
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download, info_dict=ie_result,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # extract_info may return None when ignoreerrors is enabled and
            # extraction failed with an error; don't crash and return early
            # in this case
            if not info:
                return info

            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            # Extracted info may not be a video result (i.e.
            # info.get('_type', 'video') != video) but rather a URL or
            # url_transparent. In such cases outer metadata (from ie_result)
            # should be propagated to the inner one (info). For this to happen,
            # _type of info should be overridden with url_transparent. This
            # fixes the issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
            if new_result.get('_type') == 'url':
                new_result['_type'] = 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type in ('playlist', 'multi_video'):
            # We process each entry in the playlist
            playlist = ie_result.get('title') or ie_result.get('id')
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend')
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            playlistitems_str = self.params.get('playlist_items')
            playlistitems = None
            if playlistitems_str is not None:
                def iter_playlistitems(format):
                    for string_segment in format.split(','):
                        if '-' in string_segment:
                            start, end = string_segment.split('-')
                            for item in range(int(start), int(end) + 1):
                                yield int(item)
                        else:
                            yield int(string_segment)
                playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
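
                # For illustration: playlist_items '1-3,7' yields 1, 2, 3, 7;
                # orderedSet() drops later duplicates while preserving order.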

            ie_entries = ie_result['entries']

            def make_playlistitems_entries(list_ie_entries):
                num_entries = len(list_ie_entries)
                return [
                    list_ie_entries[i - 1] for i in playlistitems
                    if -num_entries <= i - 1 < num_entries]

            def report_download(num_entries):
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, num_entries))

            if isinstance(ie_entries, list):
                n_all_entries = len(ie_entries)
                if playlistitems:
                    entries = make_playlistitems_entries(ie_entries)
                else:
                    entries = ie_entries[playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            elif isinstance(ie_entries, PagedList):
                if playlistitems:
                    entries = []
                    for item in playlistitems:
                        entries.extend(ie_entries.getslice(
                            item - 1, item
                        ))
                else:
                    entries = ie_entries.getslice(
                        playliststart, playlistend)
                n_entries = len(entries)
                report_download(n_entries)
            else:  # iterable
                if playlistitems:
                    entries = make_playlistitems_entries(list(itertools.islice(
                        ie_entries, 0, max(playlistitems))))
                else:
                    entries = list(itertools.islice(
                        ie_entries, playliststart, playlistend))
                n_entries = len(entries)
                report_download(n_entries)

            if self.params.get('playlistreverse', False):
                entries = entries[::-1]

            if self.params.get('playlistrandom', False):
                random.shuffle(entries)

            x_forwarded_for = ie_result.get('__x_forwarded_for_ip')

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                # This __x_forwarded_for_ip thing is a bit ugly but requires
                # minimal changes
                if x_forwarded_for:
                    entry['__x_forwarded_for_ip'] = x_forwarded_for
                extra = {
                    'n_entries': n_entries,
                    'playlist': playlist,
                    'playlist_id': ie_result.get('id'),
                    'playlist_title': ie_result.get('title'),
                    'playlist_uploader': ie_result.get('uploader'),
                    'playlist_uploader_id': ie_result.get('uploader_id'),
                    'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                reason = self._match_entry(entry, incomplete=True)
                if reason is not None:
                    if reason.endswith('has already been recorded in archive') and self.params.get('break_on_existing'):
                        self.to_screen(
                            '[download] Tried downloading a file that is already in the archive; '
                            'stopping since --break-on-existing is set.')
                        break
                    else:
                        self.to_screen('[download] ' + reason)
                        continue

                entry_result = self.__process_iterable_entry(entry, download, extra)
                # TODO: skip failed (empty) entries?
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
            return ie_result
        elif result_type == 'compat_list':
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            def _fixup(r):
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)

    @__handle_extraction_exceptions
    def __process_iterable_entry(self, entry, download, extra_info):
        return self.process_ie_result(
            entry, download=download, extra_info=extra_info)

    def _build_format_filter(self, filter_spec):
        """Returns a function to filter the formats according to the filter_spec"""

        OPERATORS = {
            '<': operator.lt,
            '<=': operator.le,
            '>': operator.gt,
            '>=': operator.ge,
            '=': operator.eq,
            '!=': operator.ne,
        }
        operator_rex = re.compile(r'''(?x)\s*
            (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)
            \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
            (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
            $
            ''' % '|'.join(map(re.escape, OPERATORS.keys())))
        m = operator_rex.search(filter_spec)
        if m:
            try:
                comparison_value = int(m.group('value'))
            except ValueError:
                comparison_value = parse_filesize(m.group('value'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('value') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid value %r in format specification %r' % (
                            m.group('value'), filter_spec))
            op = OPERATORS[m.group('op')]

        if not m:
            STR_OPERATORS = {
                '=': operator.eq,
                '^=': lambda attr, value: attr.startswith(value),
                '$=': lambda attr, value: attr.endswith(value),
                '*=': lambda attr, value: value in attr,
            }
            str_operator_rex = re.compile(r'''(?x)
                \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
                \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?
                \s*(?P<value>[a-zA-Z0-9._-]+)
                \s*$
                ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
            m = str_operator_rex.search(filter_spec)
            if m:
                comparison_value = m.group('value')
                str_op = STR_OPERATORS[m.group('op')]
                if m.group('negation'):
                    op = lambda attr, value: not str_op(attr, value)
                else:
                    op = str_op

        if not m:
            raise ValueError('Invalid filter specification %r' % filter_spec)

        def _filter(f):
            actual_value = f.get(m.group('key'))
            if actual_value is None:
                return m.group('none_inclusive')
            return op(actual_value, comparison_value)
        return _filter
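
        # For illustration: _build_format_filter('height<=720') keeps formats
        # whose height is at most 720; a trailing '?', as in 'height<=720?',
        # also keeps formats that do not report a height at all.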

    def _default_format_spec(self, info_dict, download=True):

        def can_merge():
            merger = FFmpegMergerPP(self)
            return merger.available and merger.can_merge()

        def prefer_best():
            if self.params.get('simulate', False):
                return False
            if not download:
                return False
            if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-':
                return True
            if info_dict.get('is_live'):
                return True
            if not can_merge():
                return True
            return False

        req_format_list = ['bestvideo+bestaudio', 'best']
        if prefer_best():
            req_format_list.reverse()
        return '/'.join(req_format_list)
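
        # So the default is 'bestvideo+bestaudio/best' when merging is
        # possible, and 'best/bestvideo+bestaudio' when it is not (output to
        # stdout with '-', live streams, or no working ffmpeg/avconv merger).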

    def build_format_selector(self, format_spec):
        def syntax_error(note, start):
            message = (
                'Invalid format specification: '
                '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
            return SyntaxError(message)

        PICKFIRST = 'PICKFIRST'
        MERGE = 'MERGE'
        SINGLE = 'SINGLE'
        GROUP = 'GROUP'
        FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])

        allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', True),
                                  'video': self.params.get('allow_multiple_video_streams', True)}

        def _parse_filter(tokens):
            filter_parts = []
            for type, string, start, _, _ in tokens:
                if type == tokenize.OP and string == ']':
                    return ''.join(filter_parts)
                else:
                    filter_parts.append(string)

        def _remove_unused_ops(tokens):
            # Remove operators that we don't use and join them with the surrounding strings
            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
            ALLOWED_OPS = ('/', '+', ',', '(', ')')
            last_string, last_start, last_end, last_line = None, None, None, None
            for type, string, start, end, line in tokens:
                if type == tokenize.OP and string == '[':
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                    # everything inside brackets will be handled by _parse_filter
                    for type, string, start, end, line in tokens:
                        yield type, string, start, end, line
                        if type == tokenize.OP and string == ']':
                            break
                elif type == tokenize.OP and string in ALLOWED_OPS:
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
                    if not last_string:
                        last_string = string
                        last_start = start
                        last_end = end
                    else:
                        last_string += string
            if last_string:
                yield tokenize.NAME, last_string, last_start, last_end, last_line

        def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
            selectors = []
            current_selector = None
            for type, string, start, _, _ in tokens:
                # ENCODING is only defined in python 3.x
                if type == getattr(tokenize, 'ENCODING', None):
                    continue
                elif type in [tokenize.NAME, tokenize.NUMBER]:
                    current_selector = FormatSelector(SINGLE, string, [])
                elif type == tokenize.OP:
                    if string == ')':
                        if not inside_group:
                            # ')' will be handled by the parentheses group
                            tokens.restore_last_token()
                        break
                    elif inside_merge and string in ['/', ',']:
                        tokens.restore_last_token()
                        break
                    elif inside_choice and string == ',':
                        tokens.restore_last_token()
                        break
                    elif string == ',':
                        if not current_selector:
                            raise syntax_error('"," must follow a format selector', start)
                        selectors.append(current_selector)
                        current_selector = None
                    elif string == '/':
                        if not current_selector:
                            raise syntax_error('"/" must follow a format selector', start)
                        first_choice = current_selector
                        second_choice = _parse_format_selection(tokens, inside_choice=True)
                        current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
                    elif string == '[':
                        if not current_selector:
                            current_selector = FormatSelector(SINGLE, 'best', [])
                        format_filter = _parse_filter(tokens)
                        current_selector.filters.append(format_filter)
                    elif string == '(':
                        if current_selector:
                            raise syntax_error('Unexpected "("', start)
                        group = _parse_format_selection(tokens, inside_group=True)
                        current_selector = FormatSelector(GROUP, group, [])
                    elif string == '+':
                        if not current_selector:
                            raise syntax_error('Unexpected "+"', start)
                        selector_1 = current_selector
                        selector_2 = _parse_format_selection(tokens, inside_merge=True)
                        if not selector_2:
                            raise syntax_error('Expected a selector', start)
                        current_selector = FormatSelector(MERGE, (selector_1, selector_2), [])
                    else:
                        raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
                elif type == tokenize.ENDMARKER:
                    break
            if current_selector:
                selectors.append(current_selector)
            return selectors

        def _build_selector_function(selector):
            if isinstance(selector, list):  # ,
                fs = [_build_selector_function(s) for s in selector]

                def selector_function(ctx):
                    for f in fs:
                        for format in f(ctx):
                            yield format
                return selector_function

            elif selector.type == GROUP:  # ()
                selector_function = _build_selector_function(selector.selector)

            elif selector.type == PICKFIRST:  # /
                fs = [_build_selector_function(s) for s in selector.selector]

                def selector_function(ctx):
                    for f in fs:
                        picked_formats = list(f(ctx))
                        if picked_formats:
                            return picked_formats
                    return []

            elif selector.type == SINGLE:  # atom
                format_spec = selector.selector if selector.selector is not None else 'best'

                if format_spec == 'all':
                    def selector_function(ctx):
                        formats = list(ctx['formats'])
                        if formats:
                            for f in formats:
                                yield f

                else:
                    format_fallback = False
                    format_spec_obj = re.match(r'(best|worst|b|w)(video|audio|v|a)?(\*)?$', format_spec)
                    if format_spec_obj is not None:
                        format_idx = 0 if format_spec_obj.group(1)[0] == 'w' else -1
                        format_type = format_spec_obj.group(2)[0] if format_spec_obj.group(2) else False
                        not_format_type = 'v' if format_type == 'a' else 'a'
                        format_modified = format_spec_obj.group(3) is not None

                        format_fallback = not format_type and not format_modified  # for b, w
                        filter_f = ((lambda f: f.get(format_type + 'codec') != 'none')
                                    if format_type and format_modified  # bv*, ba*, wv*, wa*
                                    else (lambda f: f.get(not_format_type + 'codec') == 'none')
                                    if format_type  # bv, ba, wv, wa
                                    else (lambda f: f.get('vcodec') != 'none' and f.get('acodec') != 'none')
                                    if not format_modified  # b, w
                                    else None)  # b*, w*
                    else:
                        format_idx = -1
                        filter_f = ((lambda f: f.get('ext') == format_spec)
                                    if format_spec in ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']  # extension
                                    else (lambda f: f.get('format_id') == format_spec))  # id
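
                    # For illustration: 'bv' keeps video-only formats, 'ba*'
                    # keeps any format with an audio stream, plain 'b'/'best'
                    # requires both streams, and an unrecognized atom such as
                    # '22' or 'm4a' is matched by extension or format_id above.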

                def selector_function(ctx):
                    formats = list(ctx['formats'])
                    if not formats:
                        return
                    matches = list(filter(filter_f, formats)) if filter_f is not None else formats
                    if matches:
                        yield matches[format_idx]
                    elif format_fallback == 'force' or (format_fallback and ctx['incomplete_formats']):
                        # for extractors with incomplete formats (audio only (soundcloud)
                        # or video only (imgur)) best/worst will fallback to
                        # best/worst {video,audio}-only format
                        yield formats[format_idx]

            elif selector.type == MERGE:  # +
                def _merge(formats_pair):
                    format_1, format_2 = formats_pair

                    formats_info = []
                    formats_info.extend(format_1.get('requested_formats', (format_1,)))
                    formats_info.extend(format_2.get('requested_formats', (format_2,)))

                    if not allow_multiple_streams['video'] or not allow_multiple_streams['audio']:
                        get_no_more = {"video": False, "audio": False}
                        for (i, fmt_info) in enumerate(formats_info):
                            for aud_vid in ["audio", "video"]:
                                if not allow_multiple_streams[aud_vid] and fmt_info.get(aud_vid[0] + 'codec') != 'none':
                                    if get_no_more[aud_vid]:
                                        formats_info.pop(i)
                                    get_no_more[aud_vid] = True

                    if len(formats_info) == 1:
                        return formats_info[0]

                    video_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('vcodec') != 'none']
                    audio_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('acodec') != 'none']

                    the_only_video = video_fmts[0] if len(video_fmts) == 1 else None
                    the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None

                    output_ext = self.params.get('merge_output_format')
                    if not output_ext:
                        if the_only_video:
                            output_ext = the_only_video['ext']
                        elif the_only_audio and not video_fmts:
                            output_ext = the_only_audio['ext']
                        else:
                            output_ext = 'mkv'

                    new_dict = {
                        'requested_formats': formats_info,
                        'format': '+'.join(fmt_info.get('format') for fmt_info in formats_info),
                        'format_id': '+'.join(fmt_info.get('format_id') for fmt_info in formats_info),
                        'ext': output_ext,
                    }

                    if the_only_video:
                        new_dict.update({
                            'width': the_only_video.get('width'),
                            'height': the_only_video.get('height'),
                            'resolution': the_only_video.get('resolution'),
                            'fps': the_only_video.get('fps'),
                            'vcodec': the_only_video.get('vcodec'),
                            'vbr': the_only_video.get('vbr'),
                            'stretched_ratio': the_only_video.get('stretched_ratio'),
                        })

                    if the_only_audio:
                        new_dict.update({
                            'acodec': the_only_audio.get('acodec'),
                            'abr': the_only_audio.get('abr'),
                        })

                    return new_dict

                selector_1, selector_2 = map(_build_selector_function, selector.selector)

                def selector_function(ctx):
                    for pair in itertools.product(
                            selector_1(copy.deepcopy(ctx)), selector_2(copy.deepcopy(ctx))):
                        yield _merge(pair)
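
                # For illustration: with 'bestvideo+bestaudio' each picked
                # video/audio pair is merged into one info dict whose format_id
                # joins the parts (e.g. '137+140') and whose ext honours
                # merge_output_format, falling back to the video ext or 'mkv'.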

            filters = [self._build_format_filter(f) for f in selector.filters]

            def final_selector(ctx):
                ctx_copy = copy.deepcopy(ctx)
                for _filter in filters:
                    ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
                return selector_function(ctx_copy)
            return final_selector

        stream = io.BytesIO(format_spec.encode('utf-8'))
        try:
            tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
        except tokenize.TokenError:
            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

        class TokenIterator(object):
            def __init__(self, tokens):
                self.tokens = tokens
                self.counter = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.counter >= len(self.tokens):
                    raise StopIteration()
                value = self.tokens[self.counter]
                self.counter += 1
                return value

            next = __next__

            def restore_last_token(self):
                self.counter -= 1

        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
        return _build_selector_function(parsed_selector)
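
        # A rough usage sketch (for illustration): the returned callable takes
        # a ctx dict like {'formats': formats, 'incomplete_formats': False}
        # and yields the selected format dicts, e.g.
        #   selector = self.build_format_selector('bestvideo[height<=720]+bestaudio/best')
        #   chosen = list(selector(ctx))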
1480
1481 def _calc_headers(self, info_dict):
1482 res = std_headers.copy()
1483
1484 add_headers = info_dict.get('http_headers')
1485 if add_headers:
1486 res.update(add_headers)
1487
1488 cookies = self._calc_cookies(info_dict)
1489 if cookies:
1490 res['Cookie'] = cookies
1491
1492 if 'X-Forwarded-For' not in res:
1493 x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
1494 if x_forwarded_for_ip:
1495 res['X-Forwarded-For'] = x_forwarded_for_ip
1496
1497 return res
1498
1499 def _calc_cookies(self, info_dict):
1500 pr = sanitized_Request(info_dict['url'])
1501 self.cookiejar.add_cookie_header(pr)
1502 return pr.get_header('Cookie')
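# Editor's sketch (hypothetical values): _calc_headers layers the format's
# own http_headers, any matching cookies from the jar and the geo-bypass IP
# on top of std_headers, so external tools see the headers the downloader uses:
#
#   info = {'url': 'https://example.com/v.mp4',
#           'http_headers': {'Referer': 'https://example.com/'}}
#   headers = self._calc_headers(info)
#   # -> std_headers plus Referer (plus Cookie/X-Forwarded-For when applicable)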
1503
1504 def process_video_result(self, info_dict, download=True):
1505 assert info_dict.get('_type', 'video') == 'video'
1506
1507 if 'id' not in info_dict:
1508 raise ExtractorError('Missing "id" field in extractor result')
1509 if 'title' not in info_dict:
1510 raise ExtractorError('Missing "title" field in extractor result')
1511
1512 def report_force_conversion(field, field_not, conversion):
1513 self.report_warning(
1514 '"%s" field is not %s - forcing %s conversion; this indicates an error in the extractor'
1515 % (field, field_not, conversion))
1516
1517 def sanitize_string_field(info, string_field):
1518 field = info.get(string_field)
1519 if field is None or isinstance(field, compat_str):
1520 return
1521 report_force_conversion(string_field, 'a string', 'string')
1522 info[string_field] = compat_str(field)
1523
1524 def sanitize_numeric_fields(info):
1525 for numeric_field in self._NUMERIC_FIELDS:
1526 field = info.get(numeric_field)
1527 if field is None or isinstance(field, compat_numeric_types):
1528 continue
1529 report_force_conversion(numeric_field, 'numeric', 'int')
1530 info[numeric_field] = int_or_none(field)
1531
1532 sanitize_string_field(info_dict, 'id')
1533 sanitize_numeric_fields(info_dict)
1534
1535 if 'playlist' not in info_dict:
1536 # It isn't part of a playlist
1537 info_dict['playlist'] = None
1538 info_dict['playlist_index'] = None
1539
1540 thumbnails = info_dict.get('thumbnails')
1541 if thumbnails is None:
1542 thumbnail = info_dict.get('thumbnail')
1543 if thumbnail:
1544 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
1545 if thumbnails:
1546 thumbnails.sort(key=lambda t: (
1547 t.get('preference') if t.get('preference') is not None else -1,
1548 t.get('width') if t.get('width') is not None else -1,
1549 t.get('height') if t.get('height') is not None else -1,
1550 t.get('id') if t.get('id') is not None else '', t.get('url')))
1551 for i, t in enumerate(thumbnails):
1552 t['url'] = sanitize_url(t['url'])
1553 if t.get('width') and t.get('height'):
1554 t['resolution'] = '%dx%d' % (t['width'], t['height'])
1555 if t.get('id') is None:
1556 t['id'] = '%d' % i
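# Editor's note (hypothetical thumbnail): after the normalization above,
#   {'url': 'https://example.com/hq.jpg', 'width': 1280, 'height': 720}
# becomes
#   {'url': ..., 'width': 1280, 'height': 720, 'resolution': '1280x720', 'id': '0'}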
1557
1558 if self.params.get('list_thumbnails'):
1559 self.list_thumbnails(info_dict)
1560 return
1561
1562 thumbnail = info_dict.get('thumbnail')
1563 if thumbnail:
1564 info_dict['thumbnail'] = sanitize_url(thumbnail)
1565 elif thumbnails:
1566 info_dict['thumbnail'] = thumbnails[-1]['url']
1567
1568 if 'display_id' not in info_dict and 'id' in info_dict:
1569 info_dict['display_id'] = info_dict['id']
1570
1571 if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
1572 # Working around out-of-range timestamp values (e.g. negative ones on Windows,
1573 # see http://bugs.python.org/issue1646728)
1574 try:
1575 upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
1576 info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
1577 except (ValueError, OverflowError, OSError):
1578 pass
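# Editor's note: e.g. a (hypothetical) timestamp of 1577836800 corresponds
# to 2020-01-01 00:00:00 UTC, so upload_date becomes '20200101'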
1579
1580 # Auto-generate title fields corresponding to the *_number fields when missing
1581 # in order to always have clean titles. This is very common for TV series.
1582 for field in ('chapter', 'season', 'episode'):
1583 if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
1584 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
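# Editor's note: e.g. an entry with episode_number == 3 and no 'episode'
# field (hypothetical values) gets episode = 'Episode 3'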
1585
1586 for cc_kind in ('subtitles', 'automatic_captions'):
1587 cc = info_dict.get(cc_kind)
1588 if cc:
1589 for _, subtitle in cc.items():
1590 for subtitle_format in subtitle:
1591 if subtitle_format.get('url'):
1592 subtitle_format['url'] = sanitize_url(subtitle_format['url'])
1593 if subtitle_format.get('ext') is None:
1594 subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
1595
1596 automatic_captions = info_dict.get('automatic_captions')
1597 subtitles = info_dict.get('subtitles')
1598
1599 if self.params.get('listsubtitles', False):
1600 if 'automatic_captions' in info_dict:
1601 self.list_subtitles(
1602 info_dict['id'], automatic_captions, 'automatic captions')
1603 self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
1604 return
1605
1606 info_dict['requested_subtitles'] = self.process_subtitles(
1607 info_dict['id'], subtitles, automatic_captions)
1608
1609 # We now pick which formats have to be downloaded
1610 if info_dict.get('formats') is None:
1611 # There's only one format available
1612 formats = [info_dict]
1613 else:
1614 formats = info_dict['formats']
1615
1616 if not formats:
1617 raise ExtractorError('No video formats found!')
1618
1619 def is_wellformed(f):
1620 url = f.get('url')
1621 if not url:
1622 self.report_warning(
1623 '"url" field is missing or empty - skipping format; '
1624 'this indicates an error in the extractor')
1625 return False
1626 if isinstance(url, bytes):
1627 sanitize_string_field(f, 'url')
1628 return True
1629
1630 # Filter out malformed formats for better extraction robustness
1631 formats = list(filter(is_wellformed, formats))
1632
1633 formats_dict = {}
1634
1635 # Make sure every format has the format and format_id fields set
1636 for i, format in enumerate(formats):
1637 sanitize_string_field(format, 'format_id')
1638 sanitize_numeric_fields(format)
1639 format['url'] = sanitize_url(format['url'])
1640 if not format.get('format_id'):
1641 format['format_id'] = compat_str(i)
1642 else:
1643 # Sanitize format_id from characters used in format selector expression
1644 format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
1645 format_id = format['format_id']
1646 if format_id not in formats_dict:
1647 formats_dict[format_id] = []
1648 formats_dict[format_id].append(format)
1649
1650 # Make sure all formats have unique format_id
1651 for format_id, ambiguous_formats in formats_dict.items():
1652 if len(ambiguous_formats) > 1:
1653 for i, format in enumerate(ambiguous_formats):
1654 format['format_id'] = '%s-%d' % (format_id, i)
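# Editor's sketch (hypothetical ids): a format_id of 'hls 720p' is first
# rewritten to 'hls_720p' by the sanitization above; if two formats then
# share the id 'hls_720p', they are renamed 'hls_720p-0' and 'hls_720p-1'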
1655
1656 for i, format in enumerate(formats):
1657 if format.get('format') is None:
1658 format['format'] = '{id} - {res}{note}'.format(
1659 id=format['format_id'],
1660 res=self.format_resolution(format),
1661 note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
1662 )
1663 # Automatically determine file extension if missing
1664 if format.get('ext') is None:
1665 format['ext'] = determine_ext(format['url']).lower()
1666 # Automatically determine protocol if missing (useful for format
1667 # selection purposes)
1668 if format.get('protocol') is None:
1669 format['protocol'] = determine_protocol(format)
1670 # Add HTTP headers, so that external programs can use them from the
1671 # json output
1672 full_format_info = info_dict.copy()
1673 full_format_info.update(format)
1674 format['http_headers'] = self._calc_headers(full_format_info)
1675 # Remove private housekeeping stuff
1676 if '__x_forwarded_for_ip' in info_dict:
1677 del info_dict['__x_forwarded_for_ip']
1678
1679 # TODO Central sorting goes here
1680
1681 if formats[0] is not info_dict:
1682 # Only set the 'formats' field if the original info_dict lists them;
1683 # otherwise we end up with a circular reference: the first (and only)
1684 # element of the 'formats' field in info_dict would be info_dict itself,
1685 # which can't be exported to JSON
1686 info_dict['formats'] = formats
1687 if self.params.get('listformats'):
1688 self.list_formats(info_dict)
1689 return
1690
1691 req_format = self.params.get('format')
1692 if req_format is None:
1693 req_format = self._default_format_spec(info_dict, download=download)
1694 if self.params.get('verbose'):
1695 self.to_stdout('[debug] Default format spec: %s' % req_format)
1696
1697 format_selector = self.build_format_selector(req_format)
1698
1699 # During format selection we may need access to the original format set
1700 # in order to calculate some metrics or do some processing.
1701 # For now we need to be able to guess whether the original formats provided
1702 # by the extractor are incomplete or not (i.e. whether the extractor provides
1703 # only video-only or audio-only formats) for proper format selection for
1704 # extractors with such incomplete formats (see
1705 # https://github.com/ytdl-org/youtube-dl/pull/5556).
1706 # Since formats may be filtered during format selection and may no longer
1707 # match the original formats, the results may be incorrect. Thus the original
1708 # formats, or pre-calculated metrics, should be passed to the format
1709 # selection routines as well.
1710 # We will pass a context object containing all necessary additional data
1711 # instead of just formats.
1712 # This fixes the incorrect format selection issue (see
1713 # https://github.com/ytdl-org/youtube-dl/issues/10083).
1714 incomplete_formats = (
1715 # All formats are video-only or
1716 all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
1717 # all formats are audio-only
1718 or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))
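# Editor's note (hypothetical case): if an extractor returned only DASH
# video-only formats, every f would have vcodec != 'none' and acodec ==
# 'none', so incomplete_formats would be True; the flag is handed to the
# selector via the ctx dict below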
1719
1720 ctx = {
1721 'formats': formats,
1722 'incomplete_formats': incomplete_formats,
1723 }
1724
1725 formats_to_download = list(format_selector(ctx))
1726 if not formats_to_download:
1727 raise ExtractorError('requested format not available',
1728 expected=True)
1729
1730 if download:
1731 self.to_screen('[info] Downloading format(s) %s' % ", ".join([f['format_id'] for f in formats_to_download]))
1732 if len(formats_to_download) > 1:
1733 self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
1734 for format in formats_to_download:
1735 new_info = dict(info_dict)
1736 new_info.update(format)
1737 self.process_info(new_info)
1738 # We update the info dict with the best quality format (backwards compatibility)
1739 info_dict.update(formats_to_download[-1])
1740 return info_dict
1741
1742 def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1743 """Select the requested subtitles and their format"""
1744 available_subs = {}
1745 if normal_subtitles and self.params.get('writesubtitles'):
1746 available_subs.update(normal_subtitles)
1747 if automatic_captions and self.params.get('writeautomaticsub'):
1748 for lang, cap_info in automatic_captions.items():
1749 if lang not in available_subs:
1750 available_subs[lang] = cap_info
1751
1752 if (not self.params.get('writesubtitles')
1753 and not self.params.get('writeautomaticsub')
1754 or not available_subs):
1755 return None
1756
1757 if self.params.get('allsubtitles', False):
1758 requested_langs = available_subs.keys()
1759 else:
1760 if self.params.get('subtitleslangs', False):
1761 requested_langs = self.params.get('subtitleslangs')
1762 elif 'en' in available_subs:
1763 requested_langs = ['en']
1764 else:
1765 requested_langs = [list(available_subs.keys())[0]]
1766
1767 formats_query = self.params.get('subtitlesformat', 'best')
1768 formats_preference = formats_query.split('/') if formats_query else []
1769 subs = {}
1770 for lang in requested_langs:
1771 formats = available_subs.get(lang)
1772 if formats is None:
1773 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1774 continue
1775 for ext in formats_preference:
1776 if ext == 'best':
1777 f = formats[-1]
1778 break
1779 matches = list(filter(lambda f: f['ext'] == ext, formats))
1780 if matches:
1781 f = matches[-1]
1782 break
1783 else:
1784 f = formats[-1]
1785 self.report_warning(
1786 'No subtitle format found matching "%s" for language %s, '
1787 'using %s' % (formats_query, lang, f['ext']))
1788 subs[lang] = f
1789 return subs
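# Editor's sketch: with (hypothetical) params subtitleslangs=['en', 'de'] and
# subtitlesformat='srt/best', each requested language gets the last listed
# 'srt' track if one exists, otherwise the overall last format:
#
#   subs = self.process_subtitles('abc123', normal_subs, auto_captions)
#   # -> e.g. {'en': {'ext': 'srt', ...}, 'de': {'ext': 'vtt', ...}}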
1790
1791 def __forced_printings(self, info_dict, filename, incomplete):
1792 def print_mandatory(field):
1793 if (self.params.get('force%s' % field, False)
1794 and (not incomplete or info_dict.get(field) is not None)):
1795 self.to_stdout(info_dict[field])
1796
1797 def print_optional(field):
1798 if (self.params.get('force%s' % field, False)
1799 and info_dict.get(field) is not None):
1800 self.to_stdout(info_dict[field])
1801
1802 print_mandatory('title')
1803 print_mandatory('id')
1804 if self.params.get('forceurl', False) and not incomplete:
1805 if info_dict.get('requested_formats') is not None:
1806 for f in info_dict['requested_formats']:
1807 self.to_stdout(f['url'] + f.get('play_path', ''))
1808 else:
1809 # For RTMP URLs, also include the playpath
1810 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1811 print_optional('thumbnail')
1812 print_optional('description')
1813 if self.params.get('forcefilename', False) and filename is not None:
1814 self.to_stdout(filename)
1815 if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1816 self.to_stdout(formatSeconds(info_dict['duration']))
1817 print_mandatory('format')
1818 if self.params.get('forcejson', False):
1819 self.to_stdout(json.dumps(info_dict))
1820
1821 def process_info(self, info_dict):
1822 """Process a single resolved IE result."""
1823
1824 assert info_dict.get('_type', 'video') == 'video'
1825
1826 max_downloads = self.params.get('max_downloads')
1827 if max_downloads is not None:
1828 if self._num_downloads >= int(max_downloads):
1829 raise MaxDownloadsReached()
1830
1831 # TODO: backward compatibility, to be removed
1832 info_dict['fulltitle'] = info_dict['title']
1833
1834 if 'format' not in info_dict:
1835 info_dict['format'] = info_dict['ext']
1836
1837 reason = self._match_entry(info_dict, incomplete=False)
1838 if reason is not None:
1839 self.to_screen('[download] ' + reason)
1840 return
1841
1842 self._num_downloads += 1
1843
1844 info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1845
1846 # Forced printings
1847 self.__forced_printings(info_dict, filename, incomplete=False)
1848
1849 # Do nothing else if in simulate mode
1850 if self.params.get('simulate', False):
1851 return
1852
1853 if filename is None:
1854 return
1855
1856 def ensure_dir_exists(path):
1857 try:
1858 dn = os.path.dirname(path)
1859 if dn and not os.path.exists(dn):
1860 os.makedirs(dn)
1861 return True
1862 except (OSError, IOError) as err:
1863 self.report_error('unable to create directory ' + error_to_compat_str(err))
1864 return False
1865
1866 if not ensure_dir_exists(sanitize_path(encodeFilename(filename))):
1867 return
1868
1869 if self.params.get('writedescription', False):
1870 descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1871 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1872 self.to_screen('[info] Video description is already present')
1873 elif info_dict.get('description') is None:
1874 self.report_warning('There\'s no description to write.')
1875 else:
1876 try:
1877 self.to_screen('[info] Writing video description to: ' + descfn)
1878 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1879 descfile.write(info_dict['description'])
1880 except (OSError, IOError):
1881 self.report_error('Cannot write description file ' + descfn)
1882 return
1883
1884 if self.params.get('writeannotations', False):
1885 annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1886 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1887 self.to_screen('[info] Video annotations are already present')
1888 elif not info_dict.get('annotations'):
1889 self.report_warning('There are no annotations to write.')
1890 else:
1891 try:
1892 self.to_screen('[info] Writing video annotations to: ' + annofn)
1893 with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1894 annofile.write(info_dict['annotations'])
1895 except (KeyError, TypeError):
1896 self.report_warning('There are no annotations to write.')
1897 except (OSError, IOError):
1898 self.report_error('Cannot write annotations file: ' + annofn)
1899 return
1900
1901 def dl(name, info, subtitle=False):
1902 fd = get_suitable_downloader(info, self.params)(self, self.params)
1903 for ph in self._progress_hooks:
1904 fd.add_progress_hook(ph)
1905 if self.params.get('verbose'):
1906 self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1907 return fd.download(name, info, subtitle)
1908
1909 subtitles_are_requested = any([self.params.get('writesubtitles', False),
1910 self.params.get('writeautomaticsub')])
1911
1912 if subtitles_are_requested and info_dict.get('requested_subtitles'):
1913 # Subtitle download errors are already reported as problems by the relevant IE,
1914 # so processing silently continues for IEs that don't support subtitles
1915 subtitles = info_dict['requested_subtitles']
1916 # ie = self.get_info_extractor(info_dict['extractor_key'])
1917 for sub_lang, sub_info in subtitles.items():
1918 sub_format = sub_info['ext']
1919 sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
1920 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1921 self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
1922 else:
1923 self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1924 if sub_info.get('data') is not None:
1925 try:
1926 # Use newline='' to prevent conversion of newline characters
1927 # See https://github.com/ytdl-org/youtube-dl/issues/10268
1928 with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
1929 subfile.write(sub_info['data'])
1930 except (OSError, IOError):
1931 self.report_error('Cannot write subtitles file ' + sub_filename)
1932 return
1933 else:
1934 try:
1935 dl(sub_filename, sub_info, subtitle=True)
1936 '''
1937 if self.params.get('sleep_interval_subtitles', False):
1938 dl(sub_filename, sub_info)
1939 else:
1940 sub_data = ie._request_webpage(
1941 sub_info['url'], info_dict['id'], note=False).read()
1942 with io.open(encodeFilename(sub_filename), 'wb') as subfile:
1943 subfile.write(sub_data)
1944 '''
1945 except (ExtractorError, IOError, OSError, ValueError, compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1946 self.report_warning('Unable to download subtitle for "%s": %s' %
1947 (sub_lang, error_to_compat_str(err)))
1948 continue
1949
1950 if self.params.get('skip_download', False):
1951 if self.params.get('convertsubtitles', False):
1952 subconv = FFmpegSubtitlesConvertorPP(self, format=self.params.get('convertsubtitles'))
1953 filename_real_ext = os.path.splitext(filename)[1][1:]
1954 filename_wo_ext = (
1955 os.path.splitext(filename)[0]
1956 if filename_real_ext == info_dict['ext']
1957 else filename)
1958 afilename = '%s.%s' % (filename_wo_ext, self.params.get('convertsubtitles'))
1959 if subconv.available:
1960 info_dict.setdefault('__postprocessors', [])
1961 # info_dict['__postprocessors'].append(subconv)
1962 if os.path.exists(encodeFilename(afilename)):
1963 self.to_screen(
1964 '[download] %s has already been downloaded and '
1965 'converted' % afilename)
1966 else:
1967 try:
1968 self.post_process(filename, info_dict)
1969 except (PostProcessingError) as err:
1970 self.report_error('postprocessing: %s' % str(err))
1971 return
1972
1973 if self.params.get('writeinfojson', False):
1974 infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1975 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1976 self.to_screen('[info] Video description metadata is already present')
1977 else:
1978 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1979 try:
1980 write_json_file(self.filter_requested_info(info_dict), infofn)
1981 except (OSError, IOError):
1982 self.report_error('Cannot write metadata to JSON file ' + infofn)
1983 return
1984
1985 self._write_thumbnails(info_dict, filename)
1986
1987 if not self.params.get('skip_download', False):
1988 try:
1989 if info_dict.get('requested_formats') is not None:
1990 downloaded = []
1991 success = True
1992 merger = FFmpegMergerPP(self)
1993 if not merger.available:
1994 postprocessors = []
1995 self.report_warning('You have requested multiple '
1996 'formats but ffmpeg or avconv are not installed.'
1997 ' The formats won\'t be merged.')
1998 else:
1999 postprocessors = [merger]
2000
2001 def compatible_formats(formats):
2002 # TODO: some formats actually allow this (mkv, webm, ogg, mp4), but not all of them.
2003 video_formats = [format for format in formats if format.get('vcodec') != 'none']
2004 audio_formats = [format for format in formats if format.get('acodec') != 'none']
2005 if len(video_formats) > 2 or len(audio_formats) > 2:
2006 return False
2007
2008 # Check extension
2009 exts = set(format.get('ext') for format in formats)
2010 COMPATIBLE_EXTS = (
2011 set(('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma')),
2012 set(('webm',)),
2013 )
2014 for ext_sets in COMPATIBLE_EXTS:
2015 if ext_sets.issuperset(exts):
2016 return True
2017 # TODO: Check acodec/vcodec
2018 return False
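# Editor's note (hypothetical extensions): a pair like {'mp4', 'm4a'}
# falls inside one COMPATIBLE_EXTS set and merges as-is, while
# {'mp4', 'webm'} does not, so the merge target below becomes 'mkv'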
2019
2020 filename_real_ext = os.path.splitext(filename)[1][1:]
2021 filename_wo_ext = (
2022 os.path.splitext(filename)[0]
2023 if filename_real_ext == info_dict['ext']
2024 else filename)
2025 requested_formats = info_dict['requested_formats']
2026 if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
2027 info_dict['ext'] = 'mkv'
2028 self.report_warning(
2029 'Requested formats are incompatible for merge and will be merged into mkv.')
2030 # Ensure filename always has a correct extension for successful merge
2031 filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
2032 if os.path.exists(encodeFilename(filename)):
2033 self.to_screen(
2034 '[download] %s has already been downloaded and '
2035 'merged' % filename)
2036 else:
2037 for f in requested_formats:
2038 new_info = dict(info_dict)
2039 new_info.update(f)
2040 fname = prepend_extension(
2041 self.prepare_filename(new_info),
2042 'f%s' % f['format_id'], new_info['ext'])
2043 if not ensure_dir_exists(fname):
2044 return
2045 downloaded.append(fname)
2046 partial_success = dl(fname, new_info)
2047 success = success and partial_success
2048 info_dict['__postprocessors'] = postprocessors
2049 info_dict['__files_to_merge'] = downloaded
2050 else:
2051 # Just a single file
2052 success = dl(filename, info_dict)
2053 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2054 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
2055 return
2056 except (OSError, IOError) as err:
2057 raise UnavailableVideoError(err)
2058 except (ContentTooShortError, ) as err:
2059 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
2060 return
2061
2062 if success and filename != '-':
2063 # Fixup content
2064 fixup_policy = self.params.get('fixup')
2065 if fixup_policy is None:
2066 fixup_policy = 'detect_or_warn'
2067
2068 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
2069
2070 stretched_ratio = info_dict.get('stretched_ratio')
2071 if stretched_ratio is not None and stretched_ratio != 1:
2072 if fixup_policy == 'warn':
2073 self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
2074 info_dict['id'], stretched_ratio))
2075 elif fixup_policy == 'detect_or_warn':
2076 stretched_pp = FFmpegFixupStretchedPP(self)
2077 if stretched_pp.available:
2078 info_dict.setdefault('__postprocessors', [])
2079 info_dict['__postprocessors'].append(stretched_pp)
2080 else:
2081 self.report_warning(
2082 '%s: Non-uniform pixel ratio (%s). %s'
2083 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
2084 else:
2085 assert fixup_policy in ('ignore', 'never')
2086
2087 if (info_dict.get('requested_formats') is None
2088 and info_dict.get('container') == 'm4a_dash'):
2089 if fixup_policy == 'warn':
2090 self.report_warning(
2091 '%s: writing DASH m4a. '
2092 'Only some players support this container.'
2093 % info_dict['id'])
2094 elif fixup_policy == 'detect_or_warn':
2095 fixup_pp = FFmpegFixupM4aPP(self)
2096 if fixup_pp.available:
2097 info_dict.setdefault('__postprocessors', [])
2098 info_dict['__postprocessors'].append(fixup_pp)
2099 else:
2100 self.report_warning(
2101 '%s: writing DASH m4a. '
2102 'Only some players support this container. %s'
2103 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
2104 else:
2105 assert fixup_policy in ('ignore', 'never')
2106
2107 if (info_dict.get('protocol') == 'm3u8_native'
2108 or (info_dict.get('protocol') == 'm3u8'
2109 and self.params.get('hls_prefer_native'))):
2110 if fixup_policy == 'warn':
2111 self.report_warning('%s: malformed AAC bitstream detected.' % (
2112 info_dict['id']))
2113 elif fixup_policy == 'detect_or_warn':
2114 fixup_pp = FFmpegFixupM3u8PP(self)
2115 if fixup_pp.available:
2116 info_dict.setdefault('__postprocessors', [])
2117 info_dict['__postprocessors'].append(fixup_pp)
2118 else:
2119 self.report_warning(
2120 '%s: malformed AAC bitstream detected. %s'
2121 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
2122 else:
2123 assert fixup_policy in ('ignore', 'never')
2124
2125 try:
2126 self.post_process(filename, info_dict)
2127 except (PostProcessingError) as err:
2128 self.report_error('postprocessing: %s' % str(err))
2129 return
2130 self.record_download_archive(info_dict)
2131
2132 def download(self, url_list):
2133 """Download a given list of URLs."""
2134 outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
2135 if (len(url_list) > 1
2136 and outtmpl != '-'
2137 and '%' not in outtmpl
2138 and self.params.get('max_downloads') != 1):
2139 raise SameFileError(outtmpl)
2140
2141 for url in url_list:
2142 try:
2143 # Besides extracting metadata, this also downloads the videos
2144 res = self.extract_info(
2145 url, force_generic_extractor=self.params.get('force_generic_extractor', False))
2146 except UnavailableVideoError:
2147 self.report_error('unable to download video')
2148 except MaxDownloadsReached:
2149 self.to_screen('[info] Maximum number of downloaded files reached.')
2150 raise
2151 else:
2152 if self.params.get('dump_single_json', False):
2153 self.to_stdout(json.dumps(res))
2154
2155 return self._download_retcode
2156
2157 def download_with_info_file(self, info_filename):
2158 with contextlib.closing(fileinput.FileInput(
2159 [info_filename], mode='r',
2160 openhook=fileinput.hook_encoded('utf-8'))) as f:
2161 # FileInput doesn't have a read method, so we can't call json.load
2162 info = self.filter_requested_info(json.loads('\n'.join(f)))
2163 try:
2164 self.process_ie_result(info, download=True)
2165 except DownloadError:
2166 webpage_url = info.get('webpage_url')
2167 if webpage_url is not None:
2168 self.report_warning('The info failed to download; trying again with "%s"' % webpage_url)
2169 return self.download([webpage_url])
2170 else:
2171 raise
2172 return self._download_retcode
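# Editor's sketch: a typical round trip (paths and URL hypothetical). First
# dump the metadata with writeinfojson, then re-run from that file:
#
#   ydl = YoutubeDL({'writeinfojson': True})
#   ydl.download(['https://example.com/watch?v=abc123'])
#   ydl.download_with_info_file('Some video-abc123.info.json')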
2173
2174 @staticmethod
2175 def filter_requested_info(info_dict):
2176 return dict(
2177 (k, v) for k, v in info_dict.items()
2178 if k not in ['requested_formats', 'requested_subtitles'])
2179
2180 def post_process(self, filename, ie_info):
2181 """Run all the postprocessors on the given file."""
2182 info = dict(ie_info)
2183 info['filepath'] = filename
2184 pps_chain = []
2185 if ie_info.get('__postprocessors') is not None:
2186 pps_chain.extend(ie_info['__postprocessors'])
2187 pps_chain.extend(self._pps)
2188 for pp in pps_chain:
2189 files_to_delete = []
2190 try:
2191 files_to_delete, info = pp.run(info)
2192 except PostProcessingError as e:
2193 self.report_error(e.msg)
2194 if files_to_delete and not self.params.get('keepvideo', False):
2195 for old_filename in set(files_to_delete):
2196 self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
2197 try:
2198 os.remove(encodeFilename(old_filename))
2199 except (IOError, OSError):
2200 self.report_warning('Unable to remove downloaded original file')
2201
2202 def _make_archive_id(self, info_dict):
2203 video_id = info_dict.get('id')
2204 if not video_id:
2205 return
2206 # Lower-case the extractor key to future-proof against any change in case
2207 # and for backwards compatibility with prior versions
2208 extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist
2209 if extractor is None:
2210 url = str_or_none(info_dict.get('url'))
2211 if not url:
2212 return
2213 # Try to find matching extractor for the URL and take its ie_key
2214 for ie in self._ies:
2215 if ie.suitable(url):
2216 extractor = ie.ie_key()
2217 break
2218 else:
2219 return
2220 return extractor.lower() + ' ' + video_id
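# Editor's note: e.g. a (hypothetical) YouTube video with id 'abc123'
# yields the archive line 'youtube abc123'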
2221
2222 def in_download_archive(self, info_dict):
2223 fn = self.params.get('download_archive')
2224 if fn is None:
2225 return False
2226
2227 vid_id = self._make_archive_id(info_dict)
2228 if not vid_id:
2229 return False # Incomplete video information
2230
2231 return vid_id in self.archive
2232
2233 def record_download_archive(self, info_dict):
2234 fn = self.params.get('download_archive')
2235 if fn is None:
2236 return
2237 vid_id = self._make_archive_id(info_dict)
2238 assert vid_id
2239 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
2240 archive_file.write(vid_id + '\n')
2241 self.archive.add(vid_id)
2242
2243 @staticmethod
2244 def format_resolution(format, default='unknown'):
2245 if format.get('vcodec') == 'none':
2246 return 'audio only'
2247 if format.get('resolution') is not None:
2248 return format['resolution']
2249 if format.get('height') is not None:
2250 if format.get('width') is not None:
2251 res = '%sx%s' % (format['width'], format['height'])
2252 else:
2253 res = '%sp' % format['height']
2254 elif format.get('width') is not None:
2255 res = '%dx?' % format['width']
2256 else:
2257 res = default
2258 return res
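# Editor's note (hypothetical formats): {'vcodec': 'none'} -> 'audio only';
# {'width': 1920, 'height': 1080} -> '1920x1080'; {'height': 720} -> '720p';
# {'width': 640} -> '640x?'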
2259
2260 def _format_note(self, fdict):
2261 res = ''
2262 if fdict.get('ext') in ['f4f', 'f4m']:
2263 res += '(unsupported) '
2264 if fdict.get('language'):
2265 if res:
2266 res += ' '
2267 res += '[%s] ' % fdict['language']
2268 if fdict.get('format_note') is not None:
2269 res += fdict['format_note'] + ' '
2270 if fdict.get('tbr') is not None:
2271 res += '%4dk ' % fdict['tbr']
2272 if fdict.get('container') is not None:
2273 if res:
2274 res += ', '
2275 res += '%s container' % fdict['container']
2276 if (fdict.get('vcodec') is not None
2277 and fdict.get('vcodec') != 'none'):
2278 if res:
2279 res += ', '
2280 res += fdict['vcodec']
2281 if fdict.get('vbr') is not None:
2282 res += '@'
2283 elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
2284 res += 'video@'
2285 if fdict.get('vbr') is not None:
2286 res += '%4dk' % fdict['vbr']
2287 if fdict.get('fps') is not None:
2288 if res:
2289 res += ', '
2290 res += '%sfps' % fdict['fps']
2291 if fdict.get('acodec') is not None:
2292 if res:
2293 res += ', '
2294 if fdict['acodec'] == 'none':
2295 res += 'video only'
2296 else:
2297 res += '%-5s' % fdict['acodec']
2298 elif fdict.get('abr') is not None:
2299 if res:
2300 res += ', '
2301 res += 'audio'
2302 if fdict.get('abr') is not None:
2303 res += '@%3dk' % fdict['abr']
2304 if fdict.get('asr') is not None:
2305 res += ' (%5dHz)' % fdict['asr']
2306 if fdict.get('filesize') is not None:
2307 if res:
2308 res += ', '
2309 res += format_bytes(fdict['filesize'])
2310 elif fdict.get('filesize_approx') is not None:
2311 if res:
2312 res += ', '
2313 res += '~' + format_bytes(fdict['filesize_approx'])
2314 return res
2315
2316 def list_formats(self, info_dict):
2317 formats = info_dict.get('formats', [info_dict])
2318 table = [
2319 [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
2320 for f in formats
2321 if f.get('preference') is None or f['preference'] >= -1000]
2322 # if len(formats) > 1:
2323 # table[-1][-1] += (' ' if table[-1][-1] else '') + '(best*)'
2324
2325 header_line = ['format code', 'extension', 'resolution', 'note']
2326 self.to_screen(
2327 '[info] Available formats for %s:\n%s' %
2328 (info_dict['id'], render_table(header_line, table)))
2329
2330 def list_thumbnails(self, info_dict):
2331 thumbnails = info_dict.get('thumbnails')
2332 if not thumbnails:
2333 self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
2334 return
2335
2336 self.to_screen(
2337 '[info] Thumbnails for %s:' % info_dict['id'])
2338 self.to_screen(render_table(
2339 ['ID', 'width', 'height', 'URL'],
2340 [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
2341
2342 def list_subtitles(self, video_id, subtitles, name='subtitles'):
2343 if not subtitles:
2344 self.to_screen('%s has no %s' % (video_id, name))
2345 return
2346 self.to_screen(
2347 'Available %s for %s:' % (name, video_id))
2348 self.to_screen(render_table(
2349 ['Language', 'formats'],
2350 [[lang, ', '.join(f['ext'] for f in reversed(formats))]
2351 for lang, formats in subtitles.items()]))
2352
2353 def urlopen(self, req):
2354 """ Start an HTTP download """
2355 if isinstance(req, compat_basestring):
2356 req = sanitized_Request(req)
2357 return self._opener.open(req, timeout=self._socket_timeout)
2358
2359 def print_debug_header(self):
2360 if not self.params.get('verbose'):
2361 return
2362
2363 if type('') is not compat_str:
2364 # Python 2.6 on SLES11 SP1 (https://github.com/ytdl-org/youtube-dl/issues/3326)
2365 self.report_warning(
2366 'Your Python is broken! Update to a newer and supported version')
2367
2368 stdout_encoding = getattr(
2369 sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
2370 encoding_str = (
2371 '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
2372 locale.getpreferredencoding(),
2373 sys.getfilesystemencoding(),
2374 stdout_encoding,
2375 self.get_encoding()))
2376 write_string(encoding_str, encoding=None)
2377
2378 self._write_string('[debug] youtube-dlc version ' + __version__ + '\n')
2379 if _LAZY_LOADER:
2380 self._write_string('[debug] Lazy loading extractors enabled' + '\n')
2381 try:
2382 sp = subprocess.Popen(
2383 ['git', 'rev-parse', '--short', 'HEAD'],
2384 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
2385 cwd=os.path.dirname(os.path.abspath(__file__)))
2386 out, err = sp.communicate()
2387 out = out.decode().strip()
2388 if re.match('[0-9a-f]+', out):
2389 self._write_string('[debug] Git HEAD: ' + out + '\n')
2390 except Exception:
2391 try:
2392 sys.exc_clear()
2393 except Exception:
2394 pass
2395
2396 def python_implementation():
2397 impl_name = platform.python_implementation()
2398 if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'):
2399 return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3]
2400 return impl_name
2401
2402 self._write_string('[debug] Python version %s (%s) - %s\n' % (
2403 platform.python_version(), python_implementation(),
2404 platform_name()))
2405
2406 exe_versions = FFmpegPostProcessor.get_versions(self)
2407 exe_versions['rtmpdump'] = rtmpdump_version()
2408 exe_versions['phantomjs'] = PhantomJSwrapper._version()
2409 exe_str = ', '.join(
2410 '%s %s' % (exe, v)
2411 for exe, v in sorted(exe_versions.items())
2412 if v
2413 )
2414 if not exe_str:
2415 exe_str = 'none'
2416 self._write_string('[debug] exe versions: %s\n' % exe_str)
2417
2418 proxy_map = {}
2419 for handler in self._opener.handlers:
2420 if hasattr(handler, 'proxies'):
2421 proxy_map.update(handler.proxies)
2422 self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
2423
2424 if self.params.get('call_home', False):
2425 ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
2426 self._write_string('[debug] Public IP address: %s\n' % ipaddr)
2427 latest_version = self.urlopen(
2428 'https://yt-dl.org/latest/version').read().decode('utf-8')
2429 if version_tuple(latest_version) > version_tuple(__version__):
2430 self.report_warning(
2431 'You are using an outdated version (newest version: %s)! '
2432 'See https://yt-dl.org/update if you need help updating.' %
2433 latest_version)
2434
2435 def _setup_opener(self):
2436 timeout_val = self.params.get('socket_timeout')
2437 self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
2438
2439 opts_cookiefile = self.params.get('cookiefile')
2440 opts_proxy = self.params.get('proxy')
2441
2442 if opts_cookiefile is None:
2443 self.cookiejar = compat_cookiejar.CookieJar()
2444 else:
2445 opts_cookiefile = expand_path(opts_cookiefile)
2446 self.cookiejar = YoutubeDLCookieJar(opts_cookiefile)
2447 if os.access(opts_cookiefile, os.R_OK):
2448 self.cookiejar.load(ignore_discard=True, ignore_expires=True)
2449
2450 cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
2451 if opts_proxy is not None:
2452 if opts_proxy == '':
2453 proxies = {}
2454 else:
2455 proxies = {'http': opts_proxy, 'https': opts_proxy}
2456 else:
2457 proxies = compat_urllib_request.getproxies()
2458 # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805)
2459 if 'http' in proxies and 'https' not in proxies:
2460 proxies['https'] = proxies['http']
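# Editor's note: e.g. if only the (hypothetical) environment variable
# http_proxy=http://127.0.0.1:3128 is set, the same proxy is copied to
# 'https', yielding {'http': ..., 'https': 'http://127.0.0.1:3128'}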
2461 proxy_handler = PerRequestProxyHandler(proxies)
2462
2463 debuglevel = 1 if self.params.get('debug_printtraffic') else 0
2464 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
2465 ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
2466 redirect_handler = YoutubeDLRedirectHandler()
2467 data_handler = compat_urllib_request_DataHandler()
2468
2469 # When passing our own FileHandler instance, build_opener won't add the
2470 # default FileHandler, which allows us to disable the file protocol; it
2471 # can be used for malicious purposes (see
2472 # https://github.com/ytdl-org/youtube-dl/issues/8227)
2473 file_handler = compat_urllib_request.FileHandler()
2474
2475 def file_open(*args, **kwargs):
2476 raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dlc for security reasons')
2477 file_handler.file_open = file_open
2478
2479 opener = compat_urllib_request.build_opener(
2480 proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)
2481
2482 # Delete the default user-agent header, which would otherwise apply in
2483 # cases where our custom HTTP handler doesn't come into play
2484 # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
2485 opener.addheaders = []
2486 self._opener = opener
2487
2488 def encode(self, s):
2489 if isinstance(s, bytes):
2490 return s # Already encoded
2491
2492 try:
2493 return s.encode(self.get_encoding())
2494 except UnicodeEncodeError as err:
2495 err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2496 raise
2497
2498 def get_encoding(self):
2499 encoding = self.params.get('encoding')
2500 if encoding is None:
2501 encoding = preferredencoding()
2502 return encoding
2503
2504 def _write_thumbnails(self, info_dict, filename):
2505 if self.params.get('writethumbnail', False):
2506 thumbnails = info_dict.get('thumbnails')
2507 if thumbnails:
2508 thumbnails = [thumbnails[-1]]
2509 elif self.params.get('write_all_thumbnails', False):
2510 thumbnails = info_dict.get('thumbnails')
2511 else:
2512 return
2513
2514 if not thumbnails:
2515 # No thumbnails present, so return immediately
2516 return
2517
2518 for t in thumbnails:
2519 thumb_ext = determine_ext(t['url'], 'jpg')
2520 suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
2521 thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
2522 t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
2523
2524 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
2525 self.to_screen('[%s] %s: Thumbnail %sis already present' %
2526 (info_dict['extractor'], info_dict['id'], thumb_display_id))
2527 else:
2528 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
2529 (info_dict['extractor'], info_dict['id'], thumb_display_id))
2530 try:
2531 uf = self.urlopen(t['url'])
2532 with open(encodeFilename(thumb_filename), 'wb') as thumbf:
2533 shutil.copyfileobj(uf, thumbf)
2534 self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
2535 (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
2536 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2537 self.report_warning('Unable to download thumbnail "%s": %s' %
2538 (t['url'], error_to_compat_str(err)))