]> jfr.im git - yt-dlp.git/blob - youtube_dl/YoutubeDL.py
[YoutubeDL] Check for bytes instead of unicode output templates (#5192)
[yt-dlp.git] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import datetime
9 import errno
10 import fileinput
11 import io
12 import itertools
13 import json
14 import locale
15 import operator
16 import os
17 import platform
18 import re
19 import shutil
20 import subprocess
21 import socket
22 import sys
23 import time
24 import traceback
25
26 if os.name == 'nt':
27 import ctypes
28
29 from .compat import (
30 compat_basestring,
31 compat_cookiejar,
32 compat_expanduser,
33 compat_get_terminal_size,
34 compat_http_client,
35 compat_kwargs,
36 compat_str,
37 compat_urllib_error,
38 compat_urllib_request,
39 )
40 from .utils import (
41 escape_url,
42 ContentTooShortError,
43 date_from_str,
44 DateRange,
45 DEFAULT_OUTTMPL,
46 determine_ext,
47 DownloadError,
48 encodeFilename,
49 ExtractorError,
50 format_bytes,
51 formatSeconds,
52 locked_file,
53 make_HTTPS_handler,
54 MaxDownloadsReached,
55 PagedList,
56 parse_filesize,
57 PerRequestProxyHandler,
58 PostProcessingError,
59 platform_name,
60 preferredencoding,
61 render_table,
62 SameFileError,
63 sanitize_filename,
64 sanitize_path,
65 std_headers,
66 subtitles_filename,
67 takewhile_inclusive,
68 UnavailableVideoError,
69 url_basename,
70 version_tuple,
71 write_json_file,
72 write_string,
73 YoutubeDLHandler,
74 prepend_extension,
75 args_to_str,
76 age_restricted,
77 )
78 from .cache import Cache
79 from .extractor import get_info_extractor, gen_extractors
80 from .downloader import get_suitable_downloader
81 from .downloader.rtmp import rtmpdump_version
82 from .postprocessor import (
83 FFmpegFixupM4aPP,
84 FFmpegFixupStretchedPP,
85 FFmpegMergerPP,
86 FFmpegPostProcessor,
87 get_postprocessor,
88 )
89 from .version import __version__
90
91
92 class YoutubeDL(object):
93 """YoutubeDL class.
94
95 YoutubeDL objects are the ones responsible for downloading the
96 actual video file and writing it to disk if the user has requested
97 it, among some other tasks. In most cases there should be one per
98 program. As, given a video URL, the downloader doesn't know how to
99 extract all the needed information, a task that InfoExtractors do, it
100 has to pass the URL to one of them.
101
102 For this, YoutubeDL objects have a method that allows
103 InfoExtractors to be registered in a given order. When it is passed
104 a URL, the YoutubeDL object hands it to the first InfoExtractor it
105 finds that reports being able to handle it. The InfoExtractor extracts
106 all the information about the video or videos the URL refers to, and
107 YoutubeDL process the extracted information, possibly using a File
108 Downloader to download the video.
109
110 YoutubeDL objects accept a lot of parameters. In order not to saturate
111 the object constructor with arguments, it receives a dictionary of
112 options instead. These options are available through the params
113 attribute for the InfoExtractors to use. The YoutubeDL also
114 registers itself as the downloader in charge for the InfoExtractors
115 that are added to it, so this is a "mutual registration".
116
117 Available options:
118
119 username: Username for authentication purposes.
120 password: Password for authentication purposes.
121 videopassword: Password for accessing a video.
122 usenetrc: Use netrc for authentication instead.
123 verbose: Print additional info to stdout.
124 quiet: Do not print messages to stdout.
125 no_warnings: Do not print out anything for warnings.
126 forceurl: Force printing final URL.
127 forcetitle: Force printing title.
128 forceid: Force printing ID.
129 forcethumbnail: Force printing thumbnail URL.
130 forcedescription: Force printing description.
131 forcefilename: Force printing final filename.
132 forceduration: Force printing duration.
133 forcejson: Force printing info_dict as JSON.
134 dump_single_json: Force printing the info_dict of the whole playlist
135 (or video) as a single JSON line.
136 simulate: Do not download the video files.
137 format: Video format code. See options.py for more information.
138 format_limit: Highest quality format to try.
139 outtmpl: Template for output names.
140 restrictfilenames: Do not allow "&" and spaces in file names
141 ignoreerrors: Do not stop on download errors.
142 nooverwrites: Prevent overwriting files.
143 playliststart: Playlist item to start at.
144 playlistend: Playlist item to end at.
145 playlist_items: Specific indices of playlist to download.
146 playlistreverse: Download playlist items in reverse order.
147 matchtitle: Download only matching titles.
148 rejecttitle: Reject downloads for matching titles.
149 logger: Log messages to a logging.Logger instance.
150 logtostderr: Log messages to stderr instead of stdout.
151 writedescription: Write the video description to a .description file
152 writeinfojson: Write the video description to a .info.json file
153 writeannotations: Write the video annotations to a .annotations.xml file
154 writethumbnail: Write the thumbnail image to a file
155 write_all_thumbnails: Write all thumbnail formats to files
156 writesubtitles: Write the video subtitles to a file
157 writeautomaticsub: Write the automatic subtitles to a file
158 allsubtitles: Downloads all the subtitles of the video
159 (requires writesubtitles or writeautomaticsub)
160 listsubtitles: Lists all available subtitles for the video
161 subtitlesformat: The format code for subtitles
162 subtitleslangs: List of languages of the subtitles to download
163 keepvideo: Keep the video file after post-processing
164 daterange: A DateRange object, download only if the upload_date is in the range.
165 skip_download: Skip the actual download of the video file
166 cachedir: Location of the cache files in the filesystem.
167 False to disable filesystem cache.
168 noplaylist: Download single video instead of a playlist if in doubt.
169 age_limit: An integer representing the user's age in years.
170 Unsuitable videos for the given age are skipped.
171 min_views: An integer representing the minimum view count the video
172 must have in order to not be skipped.
173 Videos without view count information are always
174 downloaded. None for no limit.
175 max_views: An integer representing the maximum view count.
176 Videos that are more popular than that are not
177 downloaded.
178 Videos without view count information are always
179 downloaded. None for no limit.
180 download_archive: File name of a file where all downloads are recorded.
181 Videos already present in the file are not downloaded
182 again.
183 cookiefile: File name where cookies should be read from and dumped to.
184 nocheckcertificate:Do not verify SSL certificates
185 prefer_insecure: Use HTTP instead of HTTPS to retrieve information.
186 At the moment, this is only supported by YouTube.
187 proxy: URL of the proxy server to use
188 cn_verification_proxy: URL of the proxy to use for IP address verification
189 on Chinese sites. (Experimental)
190 socket_timeout: Time to wait for unresponsive hosts, in seconds
191 bidi_workaround: Work around buggy terminals without bidirectional text
192 support, using fribidi
193 debug_printtraffic:Print out sent and received HTTP traffic
194 include_ads: Download ads as well
195 default_search: Prepend this string if an input url is not valid.
196 'auto' for elaborate guessing
197 encoding: Use this encoding instead of the system-specified.
198 extract_flat: Do not resolve URLs, return the immediate result.
199 Pass in 'in_playlist' to only show this behavior for
200 playlist items.
201 postprocessors: A list of dictionaries, each with an entry
202 * key: The name of the postprocessor. See
203 youtube_dl/postprocessor/__init__.py for a list.
204 as well as any further keyword arguments for the
205 postprocessor.
206 progress_hooks: A list of functions that get called on download
207 progress, with a dictionary with the entries
208 * status: One of "downloading", "error", or "finished".
209 Check this first and ignore unknown values.
210
211 If status is one of "downloading", or "finished", the
212 following properties may also be present:
213 * filename: The final filename (always present)
214 * tmpfilename: The filename we're currently writing to
215 * downloaded_bytes: Bytes on disk
216 * total_bytes: Size of the whole file, None if unknown
217 * total_bytes_estimate: Guess of the eventual file size,
218 None if unavailable.
219 * elapsed: The number of seconds since download started.
220 * eta: The estimated time in seconds, None if unknown
221 * speed: The download speed in bytes/second, None if
222 unknown
223 * fragment_index: The counter of the currently
224 downloaded video fragment.
225 * fragment_count: The number of fragments (= individual
226 files that will be merged)
227
228 Progress hooks are guaranteed to be called at least once
229 (with status "finished") if the download is successful.
230 merge_output_format: Extension to use when merging formats.
231 fixup: Automatically correct known faults of the file.
232 One of:
233 - "never": do nothing
234 - "warn": only emit a warning
235 - "detect_or_warn": check whether we can do anything
236 about it, warn otherwise (default)
237 source_address: (Experimental) Client-side IP address to bind to.
238 call_home: Boolean, true iff we are allowed to contact the
239 youtube-dl servers for debugging.
240 sleep_interval: Number of seconds to sleep before each download.
241 listformats: Print an overview of available video formats and exit.
242 list_thumbnails: Print a table of all thumbnails and exit.
243 match_filter: A function that gets called with the info_dict of
244 every video.
245 If it returns a message, the video is ignored.
246 If it returns None, the video is downloaded.
247 match_filter_func in utils.py is one example for this.
248 no_color: Do not emit color codes in output.
249
250 The following options determine which downloader is picked:
251 external_downloader: Executable of the external downloader to call.
252 None or unset for standard (built-in) downloader.
253 hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv.
254
255 The following parameters are not used by YoutubeDL itself, they are used by
256 the downloader (see youtube_dl/downloader/common.py):
257 nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
258 noresizebuffer, retries, continuedl, noprogress, consoletitle,
259 xattr_set_filesize, external_downloader_args.
260
261 The following options are used by the post processors:
262 prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available,
263 otherwise prefer avconv.
264 exec_cmd: Arbitrary command to run after downloading
265 """
266
    # Class-level defaults; __init__ re-initializes all of these per instance.
    params = None              # dict of user options (documented in the class docstring)
    _ies = []                  # registered InfoExtractor instances, in priority order
    _pps = []                  # registered PostProcessor chain
    _download_retcode = None   # 0 on success; set to 1 by trouble() on ignored errors
    _num_downloads = None      # finished-download counter; feeds the %(autonumber)s template
    _screen_file = None        # stream for normal output (stdout, or stderr with logtostderr)
273
    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options.

        params: option dictionary (see the class docstring); not copied,
                so later mutations by the caller are visible here.
        auto_init: when True, print the debug header and register all
                default InfoExtractors immediately.
        """
        if params is None:
            params = {}
        self._ies = []
        self._ies_instances = {}
        self._pps = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # Index with the boolean: False -> stdout, True -> stderr.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = params
        self.cache = Cache(self)

        if params.get('bidi_workaround', False):
            try:
                import pty
                # Route our output through an external bidi filter via a pty,
                # so right-to-left text renders correctly on buggy terminals.
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    # Prefer 'bidiv'; fall back to 'fribidi' if it is missing.
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == 2:
                    # ENOENT: neither filter executable was found.
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.version_info >= (3,) and sys.platform != 'win32' and
                sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                not params.get('restrictfilenames', False)):
            # On Python 3, the Unicode filesystem API will throw errors (#1474)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        # Output templates must be text; warn on bytes templates (#5192).
        if isinstance(params.get('outtmpl'), bytes):
            self.report_warning(
                'Parameter outtmpl is bytes, but should be a unicode string. '
                'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.')

        if '%(stitle)s' in self.params.get('outtmpl', ''):
            self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')

        self._setup_opener()

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        # Instantiate configured postprocessors; 'key' names the PP class,
        # the remaining entries are its constructor kwargs.
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)
349
350 def warn_if_short_id(self, argv):
351 # short YouTube ID starting with dash?
352 idxs = [
353 i for i, a in enumerate(argv)
354 if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
355 if idxs:
356 correct_argv = (
357 ['youtube-dl'] +
358 [a for i, a in enumerate(argv) if i not in idxs] +
359 ['--'] + [argv[i] for i in idxs]
360 )
361 self.report_warning(
362 'Long argument string detected. '
363 'Use -- to separate parameters and URLs, like this:\n%s\n' %
364 args_to_str(correct_argv))
365
366 def add_info_extractor(self, ie):
367 """Add an InfoExtractor object to the end of the list."""
368 self._ies.append(ie)
369 self._ies_instances[ie.ie_key()] = ie
370 ie.set_downloader(self)
371
372 def get_info_extractor(self, ie_key):
373 """
374 Get an instance of an IE with name ie_key, it will try to get one from
375 the _ies list, if there's no instance it will create a new one and add
376 it to the extractor list.
377 """
378 ie = self._ies_instances.get(ie_key)
379 if ie is None:
380 ie = get_info_extractor(ie_key)()
381 self.add_info_extractor(ie)
382 return ie
383
384 def add_default_info_extractors(self):
385 """
386 Add the InfoExtractors returned by gen_extractors to the end of the list
387 """
388 for ie in gen_extractors():
389 self.add_info_extractor(ie)
390
391 def add_post_processor(self, pp):
392 """Add a PostProcessor object to the end of the chain."""
393 self._pps.append(pp)
394 pp.set_downloader(self)
395
396 def add_progress_hook(self, ph):
397 """Add the progress hook (currently only for the file downloader)"""
398 self._progress_hooks.append(ph)
399
400 def _bidi_workaround(self, message):
401 if not hasattr(self, '_output_channel'):
402 return message
403
404 assert hasattr(self, '_output_process')
405 assert isinstance(message, compat_str)
406 line_count = message.count('\n') + 1
407 self._output_process.stdin.write((message + '\n').encode('utf-8'))
408 self._output_process.stdin.flush()
409 res = ''.join(self._output_channel.readline().decode('utf-8')
410 for _ in range(line_count))
411 return res[:-len('\n')]
412
413 def to_screen(self, message, skip_eol=False):
414 """Print message to stdout if not in quiet mode."""
415 return self.to_stdout(message, skip_eol, check_quiet=True)
416
417 def _write_string(self, s, out=None):
418 write_string(s, out=out, encoding=self.params.get('encoding'))
419
420 def to_stdout(self, message, skip_eol=False, check_quiet=False):
421 """Print message to stdout if not in quiet mode."""
422 if self.params.get('logger'):
423 self.params['logger'].debug(message)
424 elif not check_quiet or not self.params.get('quiet', False):
425 message = self._bidi_workaround(message)
426 terminator = ['\n', ''][skip_eol]
427 output = message + terminator
428
429 self._write_string(output, self._screen_file)
430
431 def to_stderr(self, message):
432 """Print message to stderr."""
433 assert isinstance(message, compat_str)
434 if self.params.get('logger'):
435 self.params['logger'].error(message)
436 else:
437 message = self._bidi_workaround(message)
438 output = message + '\n'
439 self._write_string(output, self._err_file)
440
441 def to_console_title(self, message):
442 if not self.params.get('consoletitle', False):
443 return
444 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
445 # c_wchar_p() might not be necessary if `message` is
446 # already of type unicode()
447 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
448 elif 'TERM' in os.environ:
449 self._write_string('\033]0;%s\007' % message, self._screen_file)
450
451 def save_console_title(self):
452 if not self.params.get('consoletitle', False):
453 return
454 if 'TERM' in os.environ:
455 # Save the title on stack
456 self._write_string('\033[22;0t', self._screen_file)
457
458 def restore_console_title(self):
459 if not self.params.get('consoletitle', False):
460 return
461 if 'TERM' in os.environ:
462 # Restore the title from stack
463 self._write_string('\033[23;0t', self._screen_file)
464
465 def __enter__(self):
466 self.save_console_title()
467 return self
468
469 def __exit__(self, *args):
470 self.restore_console_title()
471
472 if self.params.get('cookiefile') is not None:
473 self.cookiejar.save()
474
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # If the active exception wraps another one (e.g. an
                    # ExtractorError carrying exc_info), show the inner
                    # traceback first, then our own.
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += compat_str(traceback.format_exc())
                else:
                    # Not inside an except block: show the current call stack.
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Prefer the wrapped exception's exc_info when available so the
            # DownloadError points at the original cause.
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        # Errors are being ignored: record failure in the exit code instead.
        self._download_retcode = 1
504
505 def report_warning(self, message):
506 '''
507 Print the message to stderr, it will be prefixed with 'WARNING:'
508 If stderr is a tty file the 'WARNING:' will be colored
509 '''
510 if self.params.get('logger') is not None:
511 self.params['logger'].warning(message)
512 else:
513 if self.params.get('no_warnings'):
514 return
515 if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt':
516 _msg_header = '\033[0;33mWARNING:\033[0m'
517 else:
518 _msg_header = 'WARNING:'
519 warning_message = '%s %s' % (_msg_header, message)
520 self.to_stderr(warning_message)
521
522 def report_error(self, message, tb=None):
523 '''
524 Do the same as trouble, but prefixes the message with 'ERROR:', colored
525 in red if stderr is a tty file.
526 '''
527 if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt':
528 _msg_header = '\033[0;31mERROR:\033[0m'
529 else:
530 _msg_header = 'ERROR:'
531 error_message = '%s %s' % (_msg_header, message)
532 self.trouble(error_message, tb)
533
534 def report_file_already_downloaded(self, file_name):
535 """Report file has already been fully downloaded."""
536 try:
537 self.to_screen('[download] %s has already been downloaded' % file_name)
538 except UnicodeEncodeError:
539 self.to_screen('[download] The file has already been downloaded')
540
    def prepare_filename(self, info_dict):
        """Generate the output filename.

        Expands the user's output template (outtmpl) with a sanitized copy
        of info_dict; returns the filename, or None on a template error.
        """
        try:
            template_dict = dict(info_dict)

            template_dict['epoch'] = int(time.time())
            # Zero-pad %(autonumber)s to autonumber_size digits (default 5).
            autonumber_size = self.params.get('autonumber_size')
            if autonumber_size is None:
                autonumber_size = 5
            autonumber_templ = '%0' + str(autonumber_size) + 'd'
            template_dict['autonumber'] = autonumber_templ % self._num_downloads
            # Pad the playlist index to the width of the playlist length.
            if template_dict.get('playlist_index') is not None:
                template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index'])
            # Synthesize a resolution string from width/height when missing.
            if template_dict.get('resolution') is None:
                if template_dict.get('width') and template_dict.get('height'):
                    template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
                elif template_dict.get('height'):
                    template_dict['resolution'] = '%sp' % template_dict['height']
                elif template_dict.get('width'):
                    template_dict['resolution'] = '?x%d' % template_dict['width']

            # Every value is stringified and made filesystem-safe; the 'id'
            # field gets the laxer is_id treatment.
            sanitize = lambda k, v: sanitize_filename(
                compat_str(v),
                restricted=self.params.get('restrictfilenames'),
                is_id=(k == 'id'))
            template_dict = dict((k, sanitize(k, v))
                                 for k, v in template_dict.items()
                                 if v is not None)
            # Fields referenced by the template but absent here render as 'NA'.
            template_dict = collections.defaultdict(lambda: 'NA', template_dict)

            outtmpl = sanitize_path(self.params.get('outtmpl', DEFAULT_OUTTMPL))
            tmpl = compat_expanduser(outtmpl)
            filename = tmpl % template_dict
            # Temporary fix for #4787
            # 'Treat' all problem characters by passing filename through preferredencoding
            # to workaround encoding issues with subprocess on python2 @ Windows
            if sys.version_info < (3, 0) and sys.platform == 'win32':
                filename = encodeFilename(filename, True).decode(preferredencoding())
            return filename
        except ValueError as err:
            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
            return None
583
584 def _match_entry(self, info_dict, incomplete):
585 """ Returns None iff the file should be downloaded """
586
587 video_title = info_dict.get('title', info_dict.get('id', 'video'))
588 if 'title' in info_dict:
589 # This can happen when we're just evaluating the playlist
590 title = info_dict['title']
591 matchtitle = self.params.get('matchtitle', False)
592 if matchtitle:
593 if not re.search(matchtitle, title, re.IGNORECASE):
594 return '"' + title + '" title did not match pattern "' + matchtitle + '"'
595 rejecttitle = self.params.get('rejecttitle', False)
596 if rejecttitle:
597 if re.search(rejecttitle, title, re.IGNORECASE):
598 return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
599 date = info_dict.get('upload_date', None)
600 if date is not None:
601 dateRange = self.params.get('daterange', DateRange())
602 if date not in dateRange:
603 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
604 view_count = info_dict.get('view_count', None)
605 if view_count is not None:
606 min_views = self.params.get('min_views')
607 if min_views is not None and view_count < min_views:
608 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
609 max_views = self.params.get('max_views')
610 if max_views is not None and view_count > max_views:
611 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
612 if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
613 return 'Skipping "%s" because it is age restricted' % video_title
614 if self.in_download_archive(info_dict):
615 return '%s has already been recorded in archive' % video_title
616
617 if not incomplete:
618 match_filter = self.params.get('match_filter')
619 if match_filter is not None:
620 ret = match_filter(info_dict)
621 if ret is not None:
622 return ret
623
624 return None
625
626 @staticmethod
627 def add_extra_info(info_dict, extra_info):
628 '''Set the keys from extra_info in info dict if they are missing'''
629 for key, value in extra_info.items():
630 info_dict.setdefault(key, value)
631
632 def extract_info(self, url, download=True, ie_key=None, extra_info={},
633 process=True):
634 '''
635 Returns a list with a dictionary for each video we find.
636 If 'download', also downloads the videos.
637 extra_info is a dict containing the extra values to add to each result
638 '''
639
640 if ie_key:
641 ies = [self.get_info_extractor(ie_key)]
642 else:
643 ies = self._ies
644
645 for ie in ies:
646 if not ie.suitable(url):
647 continue
648
649 if not ie.working():
650 self.report_warning('The program functionality for this site has been marked as broken, '
651 'and will probably not work.')
652
653 try:
654 ie_result = ie.extract(url)
655 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
656 break
657 if isinstance(ie_result, list):
658 # Backwards compatibility: old IE result format
659 ie_result = {
660 '_type': 'compat_list',
661 'entries': ie_result,
662 }
663 self.add_default_extra_info(ie_result, ie, url)
664 if process:
665 return self.process_ie_result(ie_result, download, extra_info)
666 else:
667 return ie_result
668 except ExtractorError as de: # An error we somewhat expected
669 self.report_error(compat_str(de), de.format_traceback())
670 break
671 except MaxDownloadsReached:
672 raise
673 except Exception as e:
674 if self.params.get('ignoreerrors', False):
675 self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
676 break
677 else:
678 raise
679 else:
680 self.report_error('no suitable InfoExtractor for URL %s' % url)
681
682 def add_default_extra_info(self, ie_result, ie, url):
683 self.add_extra_info(ie_result, {
684 'extractor': ie.IE_NAME,
685 'webpage_url': url,
686 'webpage_url_basename': url_basename(url),
687 'extractor_key': ie.ie_key(),
688 })
689
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.
        """
        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            # In flat-extraction mode, URL results are returned unresolved.
            extract_flat = self.params.get('extract_flat', False)
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
                    extract_flat is True):
                if self.params.get('forcejson', False):
                    self.to_stdout(json.dumps(ie_result))
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # Non-None fields of the embedding result override the target's,
            # except '_type' and 'url' which must come from the target.
            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            for f in ('_type', 'url'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            assert new_result.get('_type') != 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type == 'playlist' or result_type == 'multi_video':
            # We process each entry in the playlist
            playlist = ie_result.get('title', None) or ie_result.get('id', None)
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # playliststart is 1-based on the command line; 0-based here.
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend', None)
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            playlistitems_str = self.params.get('playlist_items', None)
            playlistitems = None
            if playlistitems_str is not None:
                # Expand '1,3,5-7' style specs into a generator of 1-based indices.
                def iter_playlistitems(format):
                    for string_segment in format.split(','):
                        if '-' in string_segment:
                            start, end = string_segment.split('-')
                            for item in range(int(start), int(end) + 1):
                                yield int(item)
                        else:
                            yield int(string_segment)
                playlistitems = iter_playlistitems(playlistitems_str)

            # Entries may come as a plain list, a lazy PagedList, or any
            # other iterable; each needs its own slicing strategy.
            ie_entries = ie_result['entries']
            if isinstance(ie_entries, list):
                n_all_entries = len(ie_entries)
                if playlistitems:
                    entries = [ie_entries[i - 1] for i in playlistitems]
                else:
                    entries = ie_entries[playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            elif isinstance(ie_entries, PagedList):
                if playlistitems:
                    entries = []
                    for item in playlistitems:
                        entries.extend(ie_entries.getslice(
                            item - 1, item
                        ))
                else:
                    entries = ie_entries.getslice(
                        playliststart, playlistend)
                n_entries = len(entries)
                self.to_screen(
                    "[%s] playlist %s: Downloading %d videos" %
                    (ie_result['extractor'], playlist, n_entries))
            else:  # iterable
                if playlistitems:
                    # Arbitrary index access requires materializing the iterable.
                    entry_list = list(ie_entries)
                    entries = [entry_list[i - 1] for i in playlistitems]
                else:
                    entries = list(itertools.islice(
                        ie_entries, playliststart, playlistend))
                n_entries = len(entries)
                self.to_screen(
                    "[%s] playlist %s: Downloading %d videos" %
                    (ie_result['extractor'], playlist, n_entries))

            if self.params.get('playlistreverse', False):
                entries = entries[::-1]

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                extra = {
                    'n_entries': n_entries,
                    'playlist': playlist,
                    'playlist_id': ie_result.get('id'),
                    'playlist_title': ie_result.get('title'),
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # incomplete=True: per-entry metadata may be partial here.
                reason = self._match_entry(entry, incomplete=True)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            return ie_result
        elif result_type == 'compat_list':
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            def _fixup(r):
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
851
852 def _apply_format_filter(self, format_spec, available_formats):
853 " Returns a tuple of the remaining format_spec and filtered formats "
854
855 OPERATORS = {
856 '<': operator.lt,
857 '<=': operator.le,
858 '>': operator.gt,
859 '>=': operator.ge,
860 '=': operator.eq,
861 '!=': operator.ne,
862 }
863 operator_rex = re.compile(r'''(?x)\s*\[
864 (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
865 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
866 (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
867 \]$
868 ''' % '|'.join(map(re.escape, OPERATORS.keys())))
869 m = operator_rex.search(format_spec)
870 if m:
871 try:
872 comparison_value = int(m.group('value'))
873 except ValueError:
874 comparison_value = parse_filesize(m.group('value'))
875 if comparison_value is None:
876 comparison_value = parse_filesize(m.group('value') + 'B')
877 if comparison_value is None:
878 raise ValueError(
879 'Invalid value %r in format specification %r' % (
880 m.group('value'), format_spec))
881 op = OPERATORS[m.group('op')]
882
883 if not m:
884 STR_OPERATORS = {
885 '=': operator.eq,
886 '!=': operator.ne,
887 }
888 str_operator_rex = re.compile(r'''(?x)\s*\[
889 \s*(?P<key>ext|acodec|vcodec|container|protocol)
890 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
891 \s*(?P<value>[a-zA-Z0-9_-]+)
892 \s*\]$
893 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
894 m = str_operator_rex.search(format_spec)
895 if m:
896 comparison_value = m.group('value')
897 op = STR_OPERATORS[m.group('op')]
898
899 if not m:
900 raise ValueError('Invalid format specification %r' % format_spec)
901
902 def _filter(f):
903 actual_value = f.get(m.group('key'))
904 if actual_value is None:
905 return m.group('none_inclusive')
906 return op(actual_value, comparison_value)
907 new_formats = [f for f in available_formats if _filter(f)]
908
909 new_format_spec = format_spec[:-len(m.group(0))]
910 if not new_format_spec:
911 new_format_spec = 'best'
912
913 return (new_format_spec, new_formats)
914
915 def select_format(self, format_spec, available_formats):
916 while format_spec.endswith(']'):
917 format_spec, available_formats = self._apply_format_filter(
918 format_spec, available_formats)
919 if not available_formats:
920 return None
921
922 if format_spec == 'best' or format_spec is None:
923 return available_formats[-1]
924 elif format_spec == 'worst':
925 return available_formats[0]
926 elif format_spec == 'bestaudio':
927 audio_formats = [
928 f for f in available_formats
929 if f.get('vcodec') == 'none']
930 if audio_formats:
931 return audio_formats[-1]
932 elif format_spec == 'worstaudio':
933 audio_formats = [
934 f for f in available_formats
935 if f.get('vcodec') == 'none']
936 if audio_formats:
937 return audio_formats[0]
938 elif format_spec == 'bestvideo':
939 video_formats = [
940 f for f in available_formats
941 if f.get('acodec') == 'none']
942 if video_formats:
943 return video_formats[-1]
944 elif format_spec == 'worstvideo':
945 video_formats = [
946 f for f in available_formats
947 if f.get('acodec') == 'none']
948 if video_formats:
949 return video_formats[0]
950 else:
951 extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
952 if format_spec in extensions:
953 filter_f = lambda f: f['ext'] == format_spec
954 else:
955 filter_f = lambda f: f['format_id'] == format_spec
956 matches = list(filter(filter_f, available_formats))
957 if matches:
958 return matches[-1]
959 return None
960
961 def _calc_headers(self, info_dict):
962 res = std_headers.copy()
963
964 add_headers = info_dict.get('http_headers')
965 if add_headers:
966 res.update(add_headers)
967
968 cookies = self._calc_cookies(info_dict)
969 if cookies:
970 res['Cookie'] = cookies
971
972 return res
973
974 def _calc_cookies(self, info_dict):
975 pr = compat_urllib_request.Request(info_dict['url'])
976 self.cookiejar.add_cookie_header(pr)
977 return pr.get_header('Cookie')
978
    def process_video_result(self, info_dict, download=True):
        """Normalize a single video result, apply format selection and,
        if download is true, hand every selected format to process_info().

        Mutates info_dict in place (thumbnails, upload_date, formats,
        http_headers, ...).  Returns the info_dict, or None in the
        listing-only modes (--list-subs, --list-formats,
        --list-thumbnails).  Raises ExtractorError when mandatory fields
        or formats are missing, or when no requested format is available.
        """
        assert info_dict.get('_type', 'video') == 'video'

        if 'id' not in info_dict:
            raise ExtractorError('Missing "id" field in extractor result')
        if 'title' not in info_dict:
            raise ExtractorError('Missing "title" field in extractor result')

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        # Promote a single 'thumbnail' URL to a one-element 'thumbnails' list
        thumbnails = info_dict.get('thumbnails')
        if thumbnails is None:
            thumbnail = info_dict.get('thumbnail')
            if thumbnail:
                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
        if thumbnails:
            # Sort worst-to-best so thumbnails[-1] is the preferred one
            thumbnails.sort(key=lambda t: (
                t.get('preference'), t.get('width'), t.get('height'),
                t.get('id'), t.get('url')))
            for i, t in enumerate(thumbnails):
                if 'width' in t and 'height' in t:
                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
                if t.get('id') is None:
                    t['id'] = '%d' % i

        if thumbnails and 'thumbnail' not in info_dict:
            info_dict['thumbnail'] = thumbnails[-1]['url']

        if 'display_id' not in info_dict and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        # Derive upload_date (YYYYMMDD) from the numeric timestamp if needed
        if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
            # Working around negative timestamps in Windows
            # (see http://bugs.python.org/issue1646728)
            if info_dict['timestamp'] < 0 and os.name == 'nt':
                info_dict['timestamp'] = 0
            upload_date = datetime.datetime.utcfromtimestamp(
                info_dict['timestamp'])
            info_dict['upload_date'] = upload_date.strftime('%Y%m%d')

        if self.params.get('listsubtitles', False):
            if 'automatic_captions' in info_dict:
                self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
            self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles')
            return
        info_dict['requested_subtitles'] = self.process_subtitles(
            info_dict['id'], info_dict.get('subtitles'),
            info_dict.get('automatic_captions'))

        # These extractors handle format selection themselves
        if info_dict['extractor'] in ['Youku']:
            if download:
                self.process_info(info_dict)
            return info_dict

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available; the info_dict itself acts
            # as that single format
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        if not formats:
            raise ExtractorError('No video formats found!')

        # We check that all the formats have the format and format_id fields
        for i, format in enumerate(formats):
            if 'url' not in format:
                raise ExtractorError('Missing "url" key in result (index %d)' % i)

            if format.get('format_id') is None:
                format['format_id'] = compat_str(i)
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                )
            # Automatically determine file extension if missing
            if 'ext' not in format:
                format['ext'] = determine_ext(format['url']).lower()
            # Add HTTP headers, so that external programs can use them from the
            # json output
            full_format_info = info_dict.copy()
            full_format_info.update(format)
            format['http_headers'] = self._calc_headers(full_format_info)

        # Legacy --format-limit: drop everything better than the given id
        format_limit = self.params.get('format_limit', None)
        if format_limit:
            formats = list(takewhile_inclusive(
                lambda f: f['format_id'] != format_limit, formats
            ))

        # TODO Central sorting goes here

        if formats[0] is not info_dict:
            # only set the 'formats' fields if the original info_dict list them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats
        if self.params.get('listformats'):
            self.list_formats(info_dict)
            return
        if self.params.get('list_thumbnails'):
            self.list_thumbnails(info_dict)
            return

        req_format = self.params.get('format')
        if req_format is None:
            req_format = 'best'
        formats_to_download = []
        if req_format == 'all':
            formats_to_download = formats
        else:
            for rfstr in req_format.split(','):
                # We can accept formats requested in the format: 34/5/best, we pick
                # the first that is available, starting from left
                req_formats = rfstr.split('/')
                for rf in req_formats:
                    if re.match(r'.+?\+.+?', rf) is not None:
                        # Two formats have been requested like '137+139'
                        format_1, format_2 = rf.split('+')
                        formats_info = (self.select_format(format_1, formats),
                                        self.select_format(format_2, formats))
                        if all(formats_info):
                            # The first format must contain the video and the
                            # second the audio
                            if formats_info[0].get('vcodec') == 'none':
                                self.report_error('The first format must '
                                                  'contain the video, try using '
                                                  '"-f %s+%s"' % (format_2, format_1))
                                return
                            # Merged container: video ext wins unless overridden
                            output_ext = (
                                formats_info[0]['ext']
                                if self.params.get('merge_output_format') is None
                                else self.params['merge_output_format'])
                            # Synthesize a combined format dict: video fields
                            # from the first part, audio fields from the second
                            selected_format = {
                                'requested_formats': formats_info,
                                'format': '%s+%s' % (formats_info[0].get('format'),
                                                     formats_info[1].get('format')),
                                'format_id': '%s+%s' % (formats_info[0].get('format_id'),
                                                        formats_info[1].get('format_id')),
                                'width': formats_info[0].get('width'),
                                'height': formats_info[0].get('height'),
                                'resolution': formats_info[0].get('resolution'),
                                'fps': formats_info[0].get('fps'),
                                'vcodec': formats_info[0].get('vcodec'),
                                'vbr': formats_info[0].get('vbr'),
                                'stretched_ratio': formats_info[0].get('stretched_ratio'),
                                'acodec': formats_info[1].get('acodec'),
                                'abr': formats_info[1].get('abr'),
                                'ext': output_ext,
                            }
                        else:
                            selected_format = None
                    else:
                        selected_format = self.select_format(rf, formats)
                    if selected_format is not None:
                        formats_to_download.append(selected_format)
                        break
        if not formats_to_download:
            raise ExtractorError('requested format not available',
                                 expected=True)

        if download:
            if len(formats_to_download) > 1:
                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
            for format in formats_to_download:
                new_info = dict(info_dict)
                new_info.update(format)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        info_dict.update(formats_to_download[-1])
        return info_dict
1157
1158 def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1159 """Select the requested subtitles and their format"""
1160 available_subs = {}
1161 if normal_subtitles and self.params.get('writesubtitles'):
1162 available_subs.update(normal_subtitles)
1163 if automatic_captions and self.params.get('writeautomaticsub'):
1164 for lang, cap_info in automatic_captions.items():
1165 if lang not in available_subs:
1166 available_subs[lang] = cap_info
1167
1168 if (not self.params.get('writesubtitles') and not
1169 self.params.get('writeautomaticsub') or not
1170 available_subs):
1171 return None
1172
1173 if self.params.get('allsubtitles', False):
1174 requested_langs = available_subs.keys()
1175 else:
1176 if self.params.get('subtitleslangs', False):
1177 requested_langs = self.params.get('subtitleslangs')
1178 elif 'en' in available_subs:
1179 requested_langs = ['en']
1180 else:
1181 requested_langs = [list(available_subs.keys())[0]]
1182
1183 formats_query = self.params.get('subtitlesformat', 'best')
1184 formats_preference = formats_query.split('/') if formats_query else []
1185 subs = {}
1186 for lang in requested_langs:
1187 formats = available_subs.get(lang)
1188 if formats is None:
1189 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1190 continue
1191 for ext in formats_preference:
1192 if ext == 'best':
1193 f = formats[-1]
1194 break
1195 matches = list(filter(lambda f: f['ext'] == ext, formats))
1196 if matches:
1197 f = matches[-1]
1198 break
1199 else:
1200 f = formats[-1]
1201 self.report_warning(
1202 'No subtitle format found matching "%s" for language %s, '
1203 'using %s' % (formats_query, lang, f['ext']))
1204 subs[lang] = f
1205 return subs
1206
    def process_info(self, info_dict):
        """Process a single resolved IE result.

        Applies --max-downloads, forced printings and simulate mode, writes
        the sidecar files (description, annotations, subtitles, info JSON,
        thumbnails), performs the actual download (including multi-format
        merge setup), schedules fixup postprocessors, runs postprocessing
        and finally records the download in the archive.
        """
        assert info_dict.get('_type', 'video') == 'video'

        max_downloads = self.params.get('max_downloads')
        if max_downloads is not None:
            if self._num_downloads >= int(max_downloads):
                raise MaxDownloadsReached()

        # Titles longer than 200 characters are truncated for the filename;
        # the full title survives in 'fulltitle'
        info_dict['fulltitle'] = info_dict['title']
        if len(info_dict['title']) > 200:
            info_dict['title'] = info_dict['title'][:197] + '...'

        # Keep for backwards compatibility
        info_dict['stitle'] = info_dict['title']

        if 'format' not in info_dict:
            info_dict['format'] = info_dict['ext']

        reason = self._match_entry(info_dict, incomplete=False)
        if reason is not None:
            self.to_screen('[download] ' + reason)
            return

        self._num_downloads += 1

        info_dict['_filename'] = filename = self.prepare_filename(info_dict)

        # Forced printings
        if self.params.get('forcetitle', False):
            self.to_stdout(info_dict['fulltitle'])
        if self.params.get('forceid', False):
            self.to_stdout(info_dict['id'])
        if self.params.get('forceurl', False):
            if info_dict.get('requested_formats') is not None:
                for f in info_dict['requested_formats']:
                    self.to_stdout(f['url'] + f.get('play_path', ''))
            else:
                # For RTMP URLs, also include the playpath
                self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
        if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
            self.to_stdout(info_dict['thumbnail'])
        if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
            self.to_stdout(info_dict['description'])
        if self.params.get('forcefilename', False) and filename is not None:
            self.to_stdout(filename)
        if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
            self.to_stdout(formatSeconds(info_dict['duration']))
        if self.params.get('forceformat', False):
            self.to_stdout(info_dict['format'])
        if self.params.get('forcejson', False):
            self.to_stdout(json.dumps(info_dict))

        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            return

        if filename is None:
            return

        # Make sure the target directory exists before writing anything
        try:
            dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
            if dn and not os.path.exists(dn):
                os.makedirs(dn)
        except (OSError, IOError) as err:
            self.report_error('unable to create directory ' + compat_str(err))
            return

        if self.params.get('writedescription', False):
            descfn = filename + '.description'
            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
                self.to_screen('[info] Video description is already present')
            elif info_dict.get('description') is None:
                self.report_warning('There\'s no description to write.')
            else:
                try:
                    self.to_screen('[info] Writing video description to: ' + descfn)
                    with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
                        descfile.write(info_dict['description'])
                except (OSError, IOError):
                    self.report_error('Cannot write description file ' + descfn)
                    return

        if self.params.get('writeannotations', False):
            annofn = filename + '.annotations.xml'
            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
                self.to_screen('[info] Video annotations are already present')
            else:
                try:
                    self.to_screen('[info] Writing video annotations to: ' + annofn)
                    with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
                        annofile.write(info_dict['annotations'])
                except (KeyError, TypeError):
                    # 'annotations' missing or not a string
                    self.report_warning('There are no annotations to write.')
                except (OSError, IOError):
                    self.report_error('Cannot write annotations file: ' + annofn)
                    return

        subtitles_are_requested = any([self.params.get('writesubtitles', False),
                                       self.params.get('writeautomaticsub')])

        if subtitles_are_requested and info_dict.get('requested_subtitles'):
            # subtitles download errors are already managed as troubles in relevant IE
            # that way it will silently go on when used with unsupporting IE
            subtitles = info_dict['requested_subtitles']
            ie = self.get_info_extractor(info_dict['extractor_key'])
            for sub_lang, sub_info in subtitles.items():
                sub_format = sub_info['ext']
                if sub_info.get('data') is not None:
                    # Inline subtitle data provided by the extractor
                    sub_data = sub_info['data']
                else:
                    try:
                        sub_data = ie._download_webpage(
                            sub_info['url'], info_dict['id'], note=False)
                    except ExtractorError as err:
                        self.report_warning('Unable to download subtitle for "%s": %s' %
                                            (sub_lang, compat_str(err.cause)))
                        continue
                try:
                    sub_filename = subtitles_filename(filename, sub_lang, sub_format)
                    if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
                        self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
                    else:
                        self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
                        with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
                            subfile.write(sub_data)
                except (OSError, IOError):
                    self.report_error('Cannot write subtitles file ' + sub_filename)
                    return

        if self.params.get('writeinfojson', False):
            infofn = os.path.splitext(filename)[0] + '.info.json'
            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
                self.to_screen('[info] Video description metadata is already present')
            else:
                self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
                try:
                    write_json_file(info_dict, infofn)
                except (OSError, IOError):
                    self.report_error('Cannot write metadata to JSON file ' + infofn)
                    return

        self._write_thumbnails(info_dict, filename)

        if not self.params.get('skip_download', False):
            try:
                def dl(name, info):
                    # Pick and run the appropriate FileDownloader for this
                    # format, wiring in the registered progress hooks
                    fd = get_suitable_downloader(info, self.params)(self, self.params)
                    for ph in self._progress_hooks:
                        fd.add_progress_hook(ph)
                    if self.params.get('verbose'):
                        self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
                    return fd.download(name, info)

                if info_dict.get('requested_formats') is not None:
                    # Multiple formats were requested (e.g. '137+139'):
                    # download each part separately and merge afterwards
                    downloaded = []
                    success = True
                    merger = FFmpegMergerPP(self, not self.params.get('keepvideo'))
                    if not merger.available:
                        postprocessors = []
                        self.report_warning('You have requested multiple '
                                            'formats but ffmpeg or avconv are not installed.'
                                            ' The formats won\'t be merged')
                    else:
                        postprocessors = [merger]
                    for f in info_dict['requested_formats']:
                        new_info = dict(info_dict)
                        new_info.update(f)
                        fname = self.prepare_filename(new_info)
                        # Intermediate parts get an extra 'f<format_id>' extension
                        fname = prepend_extension(fname, 'f%s' % f['format_id'])
                        downloaded.append(fname)
                        partial_success = dl(fname, new_info)
                        success = success and partial_success
                    info_dict['__postprocessors'] = postprocessors
                    info_dict['__files_to_merge'] = downloaded
                else:
                    # Just a single file
                    success = dl(filename, info_dict)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self.report_error('unable to download video data: %s' % str(err))
                return
            except (OSError, IOError) as err:
                raise UnavailableVideoError(err)
            except (ContentTooShortError, ) as err:
                self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                return

            if success:
                # Fixup content
                fixup_policy = self.params.get('fixup')
                if fixup_policy is None:
                    fixup_policy = 'detect_or_warn'

                # Non-uniform pixel aspect ratio: warn or schedule an ffmpeg fixup
                stretched_ratio = info_dict.get('stretched_ratio')
                if stretched_ratio is not None and stretched_ratio != 1:
                    if fixup_policy == 'warn':
                        self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
                            info_dict['id'], stretched_ratio))
                    elif fixup_policy == 'detect_or_warn':
                        stretched_pp = FFmpegFixupStretchedPP(self)
                        if stretched_pp.available:
                            info_dict.setdefault('__postprocessors', [])
                            info_dict['__postprocessors'].append(stretched_pp)
                        else:
                            self.report_warning(
                                '%s: Non-uniform pixel ratio (%s). Install ffmpeg or avconv to fix this automatically.' % (
                                    info_dict['id'], stretched_ratio))
                    else:
                        assert fixup_policy in ('ignore', 'never')

                # DASH m4a container: warn or schedule an ffmpeg fixup
                if info_dict.get('requested_formats') is None and info_dict.get('container') == 'm4a_dash':
                    if fixup_policy == 'warn':
                        self.report_warning('%s: writing DASH m4a. Only some players support this container.' % (
                            info_dict['id']))
                    elif fixup_policy == 'detect_or_warn':
                        fixup_pp = FFmpegFixupM4aPP(self)
                        if fixup_pp.available:
                            info_dict.setdefault('__postprocessors', [])
                            info_dict['__postprocessors'].append(fixup_pp)
                        else:
                            self.report_warning(
                                '%s: writing DASH m4a. Only some players support this container. Install ffmpeg or avconv to fix this automatically.' % (
                                    info_dict['id']))
                    else:
                        assert fixup_policy in ('ignore', 'never')

                try:
                    self.post_process(filename, info_dict)
                except (PostProcessingError) as err:
                    self.report_error('postprocessing: %s' % str(err))
                    return
        # Runs even with --skip-download (the video counts as handled)
        self.record_download_archive(info_dict)
1440
1441 def download(self, url_list):
1442 """Download a given list of URLs."""
1443 outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1444 if (len(url_list) > 1 and
1445 '%' not in outtmpl and
1446 self.params.get('max_downloads') != 1):
1447 raise SameFileError(outtmpl)
1448
1449 for url in url_list:
1450 try:
1451 # It also downloads the videos
1452 res = self.extract_info(url)
1453 except UnavailableVideoError:
1454 self.report_error('unable to download video')
1455 except MaxDownloadsReached:
1456 self.to_screen('[info] Maximum number of downloaded files reached.')
1457 raise
1458 else:
1459 if self.params.get('dump_single_json', False):
1460 self.to_stdout(json.dumps(res))
1461
1462 return self._download_retcode
1463
1464 def download_with_info_file(self, info_filename):
1465 with contextlib.closing(fileinput.FileInput(
1466 [info_filename], mode='r',
1467 openhook=fileinput.hook_encoded('utf-8'))) as f:
1468 # FileInput doesn't have a read method, we can't call json.load
1469 info = json.loads('\n'.join(f))
1470 try:
1471 self.process_ie_result(info, download=True)
1472 except DownloadError:
1473 webpage_url = info.get('webpage_url')
1474 if webpage_url is not None:
1475 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1476 return self.download([webpage_url])
1477 else:
1478 raise
1479 return self._download_retcode
1480
    def post_process(self, filename, ie_info):
        """Run all the postprocessors on the given file.

        The chain is the per-download '__postprocessors' (e.g. the format
        merger) followed by the globally registered ones.  Each pp.run()
        returns (keep_video_wish, updated_info); a False wish (unless
        overridden by --keepvideo) causes the original file to be deleted.
        """
        info = dict(ie_info)
        info['filepath'] = filename
        pps_chain = []
        if ie_info.get('__postprocessors') is not None:
            pps_chain.extend(ie_info['__postprocessors'])
        pps_chain.extend(self._pps)
        for pp in pps_chain:
            # keep_video is re-evaluated per postprocessor
            keep_video = None
            old_filename = info['filepath']
            try:
                keep_video_wish, info = pp.run(info)
                if keep_video_wish is not None:
                    if keep_video_wish:
                        keep_video = keep_video_wish
                    elif keep_video is None:
                        # No clear decision yet, let IE decide
                        keep_video = keep_video_wish
            except PostProcessingError as e:
                # Report and continue with the next postprocessor
                self.report_error(e.msg)
            if keep_video is False and not self.params.get('keepvideo', False):
                try:
                    self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
                    os.remove(encodeFilename(old_filename))
                except (IOError, OSError):
                    self.report_warning('Unable to remove downloaded video file')
1508
1509 def _make_archive_id(self, info_dict):
1510 # Future-proof against any change in case
1511 # and backwards compatibility with prior versions
1512 extractor = info_dict.get('extractor_key')
1513 if extractor is None:
1514 if 'id' in info_dict:
1515 extractor = info_dict.get('ie_key') # key in a playlist
1516 if extractor is None:
1517 return None # Incomplete video information
1518 return extractor.lower() + ' ' + info_dict['id']
1519
1520 def in_download_archive(self, info_dict):
1521 fn = self.params.get('download_archive')
1522 if fn is None:
1523 return False
1524
1525 vid_id = self._make_archive_id(info_dict)
1526 if vid_id is None:
1527 return False # Incomplete video information
1528
1529 try:
1530 with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1531 for line in archive_file:
1532 if line.strip() == vid_id:
1533 return True
1534 except IOError as ioe:
1535 if ioe.errno != errno.ENOENT:
1536 raise
1537 return False
1538
1539 def record_download_archive(self, info_dict):
1540 fn = self.params.get('download_archive')
1541 if fn is None:
1542 return
1543 vid_id = self._make_archive_id(info_dict)
1544 assert vid_id
1545 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1546 archive_file.write(vid_id + '\n')
1547
1548 @staticmethod
1549 def format_resolution(format, default='unknown'):
1550 if format.get('vcodec') == 'none':
1551 return 'audio only'
1552 if format.get('resolution') is not None:
1553 return format['resolution']
1554 if format.get('height') is not None:
1555 if format.get('width') is not None:
1556 res = '%sx%s' % (format['width'], format['height'])
1557 else:
1558 res = '%sp' % format['height']
1559 elif format.get('width') is not None:
1560 res = '?x%d' % format['width']
1561 else:
1562 res = default
1563 return res
1564
1565 def _format_note(self, fdict):
1566 res = ''
1567 if fdict.get('ext') in ['f4f', 'f4m']:
1568 res += '(unsupported) '
1569 if fdict.get('format_note') is not None:
1570 res += fdict['format_note'] + ' '
1571 if fdict.get('tbr') is not None:
1572 res += '%4dk ' % fdict['tbr']
1573 if fdict.get('container') is not None:
1574 if res:
1575 res += ', '
1576 res += '%s container' % fdict['container']
1577 if (fdict.get('vcodec') is not None and
1578 fdict.get('vcodec') != 'none'):
1579 if res:
1580 res += ', '
1581 res += fdict['vcodec']
1582 if fdict.get('vbr') is not None:
1583 res += '@'
1584 elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
1585 res += 'video@'
1586 if fdict.get('vbr') is not None:
1587 res += '%4dk' % fdict['vbr']
1588 if fdict.get('fps') is not None:
1589 res += ', %sfps' % fdict['fps']
1590 if fdict.get('acodec') is not None:
1591 if res:
1592 res += ', '
1593 if fdict['acodec'] == 'none':
1594 res += 'video only'
1595 else:
1596 res += '%-5s' % fdict['acodec']
1597 elif fdict.get('abr') is not None:
1598 if res:
1599 res += ', '
1600 res += 'audio'
1601 if fdict.get('abr') is not None:
1602 res += '@%3dk' % fdict['abr']
1603 if fdict.get('asr') is not None:
1604 res += ' (%5dHz)' % fdict['asr']
1605 if fdict.get('filesize') is not None:
1606 if res:
1607 res += ', '
1608 res += format_bytes(fdict['filesize'])
1609 elif fdict.get('filesize_approx') is not None:
1610 if res:
1611 res += ', '
1612 res += '~' + format_bytes(fdict['filesize_approx'])
1613 return res
1614
1615 def list_formats(self, info_dict):
1616 formats = info_dict.get('formats', [info_dict])
1617 table = [
1618 [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
1619 for f in formats
1620 if f.get('preference') is None or f['preference'] >= -1000]
1621 if len(formats) > 1:
1622 table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
1623
1624 header_line = ['format code', 'extension', 'resolution', 'note']
1625 self.to_screen(
1626 '[info] Available formats for %s:\n%s' %
1627 (info_dict['id'], render_table(header_line, table)))
1628
1629 def list_thumbnails(self, info_dict):
1630 thumbnails = info_dict.get('thumbnails')
1631 if not thumbnails:
1632 tn_url = info_dict.get('thumbnail')
1633 if tn_url:
1634 thumbnails = [{'id': '0', 'url': tn_url}]
1635 else:
1636 self.to_screen(
1637 '[info] No thumbnails present for %s' % info_dict['id'])
1638 return
1639
1640 self.to_screen(
1641 '[info] Thumbnails for %s:' % info_dict['id'])
1642 self.to_screen(render_table(
1643 ['ID', 'width', 'height', 'URL'],
1644 [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
1645
1646 def list_subtitles(self, video_id, subtitles, name='subtitles'):
1647 if not subtitles:
1648 self.to_screen('%s has no %s' % (video_id, name))
1649 return
1650 self.to_screen(
1651 'Available %s for %s:' % (name, video_id))
1652 self.to_screen(render_table(
1653 ['Language', 'formats'],
1654 [[lang, ', '.join(f['ext'] for f in reversed(formats))]
1655 for lang, formats in subtitles.items()]))
1656
1657 def urlopen(self, req):
1658 """ Start an HTTP download """
1659
1660 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1661 # always respected by websites, some tend to give out URLs with non percent-encoded
1662 # non-ASCII characters (see telemb.py, ard.py [#3412])
1663 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1664 # To work around aforementioned issue we will replace request's original URL with
1665 # percent-encoded one
1666 req_is_string = isinstance(req, compat_basestring)
1667 url = req if req_is_string else req.get_full_url()
1668 url_escaped = escape_url(url)
1669
1670 # Substitute URL if any change after escaping
1671 if url != url_escaped:
1672 if req_is_string:
1673 req = url_escaped
1674 else:
1675 req = compat_urllib_request.Request(
1676 url_escaped, data=req.data, headers=req.headers,
1677 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
1678
1679 return self._opener.open(req, timeout=self._socket_timeout)
1680
1681 def print_debug_header(self):
1682 if not self.params.get('verbose'):
1683 return
1684
1685 if type('') is not compat_str:
1686 # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
1687 self.report_warning(
1688 'Your Python is broken! Update to a newer and supported version')
1689
1690 stdout_encoding = getattr(
1691 sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
1692 encoding_str = (
1693 '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
1694 locale.getpreferredencoding(),
1695 sys.getfilesystemencoding(),
1696 stdout_encoding,
1697 self.get_encoding()))
1698 write_string(encoding_str, encoding=None)
1699
1700 self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
1701 try:
1702 sp = subprocess.Popen(
1703 ['git', 'rev-parse', '--short', 'HEAD'],
1704 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
1705 cwd=os.path.dirname(os.path.abspath(__file__)))
1706 out, err = sp.communicate()
1707 out = out.decode().strip()
1708 if re.match('[0-9a-f]+', out):
1709 self._write_string('[debug] Git HEAD: ' + out + '\n')
1710 except:
1711 try:
1712 sys.exc_clear()
1713 except:
1714 pass
1715 self._write_string('[debug] Python version %s - %s\n' % (
1716 platform.python_version(), platform_name()))
1717
1718 exe_versions = FFmpegPostProcessor.get_versions(self)
1719 exe_versions['rtmpdump'] = rtmpdump_version()
1720 exe_str = ', '.join(
1721 '%s %s' % (exe, v)
1722 for exe, v in sorted(exe_versions.items())
1723 if v
1724 )
1725 if not exe_str:
1726 exe_str = 'none'
1727 self._write_string('[debug] exe versions: %s\n' % exe_str)
1728
1729 proxy_map = {}
1730 for handler in self._opener.handlers:
1731 if hasattr(handler, 'proxies'):
1732 proxy_map.update(handler.proxies)
1733 self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
1734
1735 if self.params.get('call_home', False):
1736 ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
1737 self._write_string('[debug] Public IP address: %s\n' % ipaddr)
1738 latest_version = self.urlopen(
1739 'https://yt-dl.org/latest/version').read().decode('utf-8')
1740 if version_tuple(latest_version) > version_tuple(__version__):
1741 self.report_warning(
1742 'You are using an outdated version (newest version: %s)! '
1743 'See https://yt-dl.org/update if you need help updating.' %
1744 latest_version)
1745
1746 def _setup_opener(self):
1747 timeout_val = self.params.get('socket_timeout')
1748 self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
1749
1750 opts_cookiefile = self.params.get('cookiefile')
1751 opts_proxy = self.params.get('proxy')
1752
1753 if opts_cookiefile is None:
1754 self.cookiejar = compat_cookiejar.CookieJar()
1755 else:
1756 self.cookiejar = compat_cookiejar.MozillaCookieJar(
1757 opts_cookiefile)
1758 if os.access(opts_cookiefile, os.R_OK):
1759 self.cookiejar.load()
1760
1761 cookie_processor = compat_urllib_request.HTTPCookieProcessor(
1762 self.cookiejar)
1763 if opts_proxy is not None:
1764 if opts_proxy == '':
1765 proxies = {}
1766 else:
1767 proxies = {'http': opts_proxy, 'https': opts_proxy}
1768 else:
1769 proxies = compat_urllib_request.getproxies()
1770 # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
1771 if 'http' in proxies and 'https' not in proxies:
1772 proxies['https'] = proxies['http']
1773 proxy_handler = PerRequestProxyHandler(proxies)
1774
1775 debuglevel = 1 if self.params.get('debug_printtraffic') else 0
1776 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
1777 ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
1778 opener = compat_urllib_request.build_opener(
1779 proxy_handler, https_handler, cookie_processor, ydlh)
1780
1781 # Delete the default user-agent header, which would otherwise apply in
1782 # cases where our custom HTTP handler doesn't come into play
1783 # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
1784 opener.addheaders = []
1785 self._opener = opener
1786
1787 def encode(self, s):
1788 if isinstance(s, bytes):
1789 return s # Already encoded
1790
1791 try:
1792 return s.encode(self.get_encoding())
1793 except UnicodeEncodeError as err:
1794 err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
1795 raise
1796
1797 def get_encoding(self):
1798 encoding = self.params.get('encoding')
1799 if encoding is None:
1800 encoding = preferredencoding()
1801 return encoding
1802
1803 def _write_thumbnails(self, info_dict, filename):
1804 if self.params.get('writethumbnail', False):
1805 thumbnails = info_dict.get('thumbnails')
1806 if thumbnails:
1807 thumbnails = [thumbnails[-1]]
1808 elif self.params.get('write_all_thumbnails', False):
1809 thumbnails = info_dict.get('thumbnails')
1810 else:
1811 return
1812
1813 if not thumbnails:
1814 # No thumbnails present, so return immediately
1815 return
1816
1817 for t in thumbnails:
1818 thumb_ext = determine_ext(t['url'], 'jpg')
1819 suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
1820 thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
1821 thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
1822
1823 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
1824 self.to_screen('[%s] %s: Thumbnail %sis already present' %
1825 (info_dict['extractor'], info_dict['id'], thumb_display_id))
1826 else:
1827 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
1828 (info_dict['extractor'], info_dict['id'], thumb_display_id))
1829 try:
1830 uf = self.urlopen(t['url'])
1831 with open(thumb_filename, 'wb') as thumbf:
1832 shutil.copyfileobj(uf, thumbf)
1833 self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
1834 (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
1835 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1836 self.report_warning('Unable to download thumbnail "%s": %s' %
1837 (t['url'], compat_str(err)))